In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as snb
import spacy

In [None]:
"we need", "you need", "we should", "you should", "we will", "you will", "we have to", "you have to"

"if you", "if we", "suppose we", "we can", "you can", 

"is that ok", "does it make sense", "got it", "are we clear", "any questions", "let me know", "anything else", "ask me", "ping me", "contact me"

"looking for", "assistance", "support", "focus"

"price point", "flexible payment", "discount", "discounts"

"send email", "send recording", "email recording", "send study plan", "email the resources", "send the resources"

"going to purchase", "going to buy", "will purchase", "will buy", "make the purchase"

"graduate", "graduated", "studied"

"working", "job"

In [1]:
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics.pairwise import cosine_similarity
import yake
import numpy as np
import json

In [2]:
# emotion_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
# emotion_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")

emotion_tokenizer = AutoTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
emotion_model = AutoModelForSequenceClassification.from_pretrained("monologg/bert-base-cased-goemotions-original")
emotion_model.eval()

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

keyword_extractor = yake.KeywordExtractor(top=8, stopwords=None)
# EMOTION_LABELS = ['anger', 'joy', 'optimism', 'sadness']
# EMOTION_LABELS = ['eager', 'joy', 'optimism', 'hope', 'anxiety', 'doubt', 'skepticism', 'excitement', 'fear', 'disappointment', 'interest', 
#                   'awkwardeness', 'confusion', 'confidence', 'relief', 'understanding', 'stress']
goemo = load_dataset("go_emotions")
EMOTION_LABELS = goemo['train'].features['labels'].feature.names

In [3]:
phase_prototypes = {
    "Introduction": ["hello", "welcome", "thanks for joining", "good morning",
                    "graduate", "graduated", "studied", "working", "work", "job", "B.Tech", "engineering", "M.Tech", "MS"],
    "Prospect’s performance": ["last attempt", "attempt", "your performance", "preparation", "percentile", "mock test",
                               "targets", "achievements", "target", "goal", "aiming"],
    "Agent drawing up plan": ["we’ll create a plan", "custom strategy", "roadmap", "approach",
                             "looking for", "assistance", "assist", "support", "focus"],
    "Explaining product": ["our product", "features include", "capabilities", "what it does",
                          "we need", "you need", "we should", "you should", "we will", "you will", "we have to", "you have to",
                          "if you", "if we", "suppose we", "we can", "you can",
                          "quant", "verbal", "insights", "DI", "score", "module", "grade"],
    "Price discussion": ["pricing", "cost", "discount", "offer", "package", 
                         "price point", "flexible payment", "discounts"],
    "Q&A": ["any questions", "feel free to ask", "clarify", "follow-up",
           "is that ok", "does it make sense", "got it", "are we clear", "let me know", "anything else", "ask me", "ping me", "contact me"],
    "Wrap": ["going to purchase", "going to buy", "will purchase", "will buy", "make the purchase", 
             "send email", "send recording", "email recording", "send study plan", "email the resources", "send the resources"]
}
prototype_texts = list(phase_prototypes.keys())
prototype_embeddings = embedding_model.encode([" ".join(v) for v in phase_prototypes.values()])

In [4]:
def predict_emotions(text):
    inputs = emotion_tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = emotion_model(**inputs).logits
        probs = torch.sigmoid(logits)[0]
    top_idxs = (probs > 0.3).nonzero(as_tuple=True)[0]  # threshold
    emotions = [(EMOTION_LABELS[i], probs[i].item()) for i in top_idxs]
    return emotions

def extract_keywords(text):
    return [kw for kw, _ in keyword_extractor.extract_keywords(text)]

## Data Preparation

In [5]:
df = pd.read_csv("transcripts.csv").dropna(subset=["text"])
df = df.sort_values(by=["transcript_id", "timestamp"])

In [6]:
df["text"] = df["text"].str.strip()

In [13]:
timestamp_ranges = df.groupby('transcript_id')['timestamp'].agg(['min', 'max'])
timestamp_ranges

Unnamed: 0_level_0,min,max
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1
100_20250213,00:00:02,01:17:17
101_20250212,00:00:02,00:58:49
102_20250212,00:00:04,00:58:29
103_20250212,00:00:03,01:06:40
104_20250212,00:00:05,00:41:18
...,...,...
96_20250211,00:00:02,01:03:27
97_20250212,00:00:03,00:53:29
98_20250212,00:00:02,00:24:49
99_20250211,00:00:02,00:40:38


In [20]:
SECTION_PERCENTAGES = {
    1: 0.05,   # 5% Introduction
    2: 0.15,   # 15% Prospect’s performance
    3: 0.20,   # 20% Agent drawing up plan
    4: 0.35,   # 35% Explaining product
    5: 0.10,   # 10% Price discussion
    6: 0.10,   # 10% Q&A
    7: 0.05    # 5% Wrap
}

assert sum(SECTION_PERCENTAGES.values()) == 1.0, "Percentages must sum to 100%"

def assign_sections(group):
    group = group.sort_values('timestamp')
    total_rows = len(group)
    
    cumulative_rows = 0
    group['section_number'] = np.nan
    
    for section, pct in SECTION_PERCENTAGES.items():
        start_row = cumulative_rows
        end_row = cumulative_rows + int(round(total_rows * pct))
        
        # Handle last section to include remaining rows (due to rounding)
        if section == 7:
            end_row = total_rows
        
        # section numbers assignment
        group.iloc[start_row:end_row, group.columns.get_loc('section_number')] = section
        cumulative_rows = end_row
    
    return group

In [22]:
df_sections = df.groupby('transcript_id', group_keys=False).apply(assign_sections)

section_ranges = (df_sections.groupby(['transcript_id', 'section_number'])
    .agg(start_timestamp=('timestamp', 'min'),end_timestamp=('timestamp', 'max'),row_count=('timestamp', 'count')).reset_index()
)
section_ranges

  df_sections = df.groupby('transcript_id', group_keys=False).apply(assign_sections)


Unnamed: 0,transcript_id,section_number,start_timestamp,end_timestamp,row_count
0,100_20250213,1.0,00:00:02,00:04:02,42
1,100_20250213,2.0,00:04:16,00:15:26,127
2,100_20250213,3.0,00:15:29,00:32:19,169
3,100_20250213,4.0,00:32:24,01:00:46,296
4,100_20250213,5.0,01:00:52,01:08:05,85
...,...,...,...,...,...
989,9_20250106,3.0,00:05:45,00:11:46,105
990,9_20250106,4.0,00:11:51,00:22:10,183
991,9_20250106,5.0,00:22:11,00:25:17,52
992,9_20250106,6.0,00:25:21,00:28:14,52


In [23]:
section_ranges.to_csv("sections_in_transcripts.csv", index=False)

In [26]:
df_sections

Unnamed: 0,month,transcript_id,timestamp,text,predicted_speaker,datetime,section_number
0,Feb,100_20250213,00:00:02,"Matt, so basically I have completed my B.Tech ...",prospect,2025-05-29 00:00:02,1.0
1,Feb,100_20250213,00:00:14,OK.,agent,2025-05-29 00:00:14,1.0
2,Feb,100_20250213,00:00:14,So after that I have joined PwC India and I ha...,prospect,2025-05-29 00:00:14,1.0
3,Feb,100_20250213,00:00:33,OK.,agent,2025-05-29 00:00:33,1.0
4,Feb,100_20250213,00:00:34,It was the reason I will tell you it was that ...,prospect,2025-05-29 00:00:34,1.0
...,...,...,...,...,...,...,...
79671,Jan,9_20250106,00:29:18,"Yeah, thank you.",prospect,2025-05-29 00:29:18,7.0
79672,Jan,9_20250106,00:29:20,Thank you.,prospect,2025-05-29 00:29:20,7.0
79673,Jan,9_20250106,00:29:20,Bye.,agent,2025-05-29 00:29:20,7.0
79674,Jan,9_20250106,00:29:20,Bye.,agent,2025-05-29 00:29:20,7.0


## Clustering and Emotion Detection

In [54]:
import spacy
import re
nlp = spacy.load("en_core_web_sm")

EDUCATION_KEYWORDS = [
    "high school", "diploma", "bachelor", "bachelors", "b.sc", "b.sc.", "bcom", "b.com", "ba",
    "b.a", "b.e", "b.tech", "m.sc", "m.sc.", "mca", "m.tech", "m.com", "phd", "ph.d",
    "doctorate", "masters", "graduate", "postgraduate", "undergraduate", "engineering", 
    "commerce", "arts", "science", "computer science", "information technology"
]

def extract_education_from_text(text, keywords=EDUCATION_KEYWORDS):
    text = text.lower()
    matches = []
    for keyword in keywords:
        # Use word boundaries to avoid partial matches (e.g., "mba" in "embarrass")
        if re.search(r'\b' + re.escape(keyword) + r'\b', text):
            matches.append(keyword)
    return list(set(matches))

def extract_ner_entities(text):
    doc = nlp(text)
    entities = {
        "location": set(),
        "organization": set(),
        "person": set(),
        "age": set(),
        "date": set(),
        "education": []
    }

    entities['education'] = extract_education_from_text(text)
    
    for ent in doc.ents:
        if ent.label_ in ["GPE", "LOC"]:
            entities["location"].add(ent.text)
        elif ent.label_ == "ORG":
            entities["organization"].add(ent.text)
        elif ent.label_ == "PERSON":
            entities["person"].add(ent.text)
        elif ent.label_ == "DATE":
            if any(char.isdigit() for char in ent.text):
                entities["date"].add(ent.text)
        elif ent.label_ == "AGE":
            entities["age"].add(ent.text)

    return {k: list(v) for k, v in entities.items()}



In [61]:
results = []
sections_text = []
count = 0

for tid, group in df_sections.groupby("transcript_id"):
    print(tid)
    group = group.sort_values("timestamp").reset_index(drop=True)

    section_data = []
    section_text_data = []
    ner_info = {}

    for section_id in sorted(group['section_number'].unique()):
        phase = prototype_texts[int(section_id) - 1]
        chunk = group[group["section_number"] == section_id]
        chunk_texts = chunk["text"].tolist()
        chunk_speakers = chunk["predicted_speaker"].tolist()

        agent_texts = [txt for txt, spk in zip(chunk_texts, chunk_speakers) if spk == "agent"]
        prospect_texts = [txt for txt, spk in zip(chunk_texts, chunk_speakers) if spk == "prospect"]

        agent_blob = " ".join(agent_texts)
        prospect_blob = " ".join(prospect_texts)

        if int(section_id) == 1:
            ner_info = extract_ner_entities(prospect_blob)

        agent_emotions_predicted = predict_emotions(agent_blob)
        agent_emotions = [e[0] for e in agent_emotions_predicted]
        agent_scores = [e[1] for e in agent_emotions_predicted]

        prospect_emotions_predicted = predict_emotions(prospect_blob)
        prospect_emotions = [e[0] for e in prospect_emotions_predicted]
        prospect_scores = [e[1] for e in prospect_emotions_predicted]

        agent_keywords = extract_keywords(agent_blob)
        prospect_keywords = extract_keywords(prospect_blob)

        section_data.append({
            "section_number": int(section_id),
            "phase": phase,
            "speaker emotion": agent_emotions,
            "speaker emotion_score": agent_scores,
            "speaker keywords": ", ".join(agent_keywords),
            "speaker duration": len(agent_texts),
            "prospect emotion": prospect_emotions,
            "prospect emotion_score": prospect_scores,
            "prospect keywords": ", ".join(prospect_keywords),
            "prospect duration": len(prospect_texts),
            "start_timestamp": chunk["timestamp"].iloc[0],
            "end_timestamp": chunk["timestamp"].iloc[-1]
        })

        section_text_data.append({
            "section_number": int(section_id),
            "phase": phase,
            "agent_text": agent_blob,
            "prospect_text": prospect_blob
        })

    results.append({
        "transcript_id": tid,
        "month": group['month'].iloc[0],
        "sections": section_data
    })
    for ner,val in ner_info.items():
        results[count][ner] = val
    count += 1

    sections_text.append({
        "transcript_id": tid,
        "sections": section_text_data,
    })


100_20250213
101_20250212
102_20250212
103_20250212
104_20250212
105_20250215
106_20250215
107_20250215
108_20250216
109_20250216
10_20250107
110_20250216
111_20250216
112_20250213
113_20250217
114_20250214
115_20250214
116_20250217
117_20250217
118_20250217
119_20250217
11_20250107
120_20250218
121_20250218
122_20250219
123_20250219
124_20250219
125_20250221
126_20250221
127_20250222
128_20250223
129_20250223
12_20250107
130_20250219
131_20250224
132_20250224
133_20250225
134_20250226
135_20250226
136_20250226
137_20250227
138_20250226
139_20250227
13_20250108
140_20250227
141_20250228
142_20250224
14_20250110
15_20250110
16_20250110
17_20250110
18_20250111
19_20250112
1_20250101
20_20250112
21_20250112
22_20250112
23_20250112
24_20250111
25_20250110
26_20250113
27_20250110
28_20250113
29_20250113
2_20250102
30_20250114
31_20250113
32_20250113
33_20250114
34_20250114
35_20250115
36_20250114
37_20250115
38_20250115
39_20250116
3_20250102
40_20250116
41_20250117
42_20250119
43_20250119


In [62]:
with open("transcript_analysis_sections.json", "w") as f:
    json.dump(results, f, indent=2)

with open("transcript_sections_texts.json", "w") as f:
    json.dump(sections_text, f, indent=2)