# Annotate Transcript train data with speaker for each conversation block (random 8 transcripts annotated)

## Check if annotation and content is missing

In [5]:
import os
import re

def parse_txt_file(content):
    """Parses file into list of (timestamp, text)"""
    blocks = content.strip().split("\n\n")
    parsed = []
    for block in blocks:
        lines = block.strip().split("\n")
        if len(lines) >= 2:
            timestamp = lines[0].strip()
            text = lines[1].strip()
            parsed.append((timestamp, text))
    return parsed

def parse_annot_file(content):
    """Parses annotation file into dict {timestamp: (text, speaker or None)}"""
    blocks = content.strip().split("\n\n")
    parsed = {}
    for block in blocks:
        lines = block.strip().split("\n")
        if len(lines) >= 2:
            timestamp = lines[0].strip()
            text = lines[1].strip()
            speaker = lines[2].strip() if len(lines) > 2 and lines[2].startswith("Speaker:") else None
            parsed[timestamp] = (text, speaker)
    return parsed

def build_synced_blocks(txt_data, annot_dict):
    result = []
    for timestamp, text in txt_data:
        if timestamp not in annot_dict:
            # Case 1: Timestamp missing in annotation
            block = f"{timestamp}\n{text}\nSpeaker:"
        else:
            annot_text, speaker = annot_dict[timestamp]
            if annot_text != text:
                # Case 2: Text mismatch
                block = f"{timestamp}\n{text}\n{speaker if speaker else 'Speaker:'}"
            else:
                # Case 3: Text matches
                block = f"{timestamp}\n{text}\n{speaker if speaker else 'Speaker:'}"
        result.append(block)
    return "\n\n".join(result)

def process_folder(folder_path):
    files = os.listdir(folder_path)
    txt_files = [f for f in files if re.match(r"\d+\.txt$", f)]

    for txt_file in txt_files:
        base_name = txt_file[:-4]
        annot_file = f"{base_name}_annot.txt"
        txt_path = os.path.join(folder_path, txt_file)
        annot_path = os.path.join(folder_path, annot_file)

        if not os.path.exists(annot_path):
            print(f"Annotation file not found for: {txt_file}")
            continue

        # Read both files
        with open(txt_path, "r", encoding="utf-8") as f:
            txt_content = f.read()
        with open(annot_path, "r", encoding="utf-8") as f:
            annot_content = f.read()

        # Parse
        txt_data = parse_txt_file(txt_content)
        annot_dict = parse_annot_file(annot_content)

        # Build final content
        synced_output = build_synced_blocks(txt_data, annot_dict)

        # Write to a new file
        output_path = os.path.join(folder_path, f"{base_name}_annot_rebuilt.txt")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(synced_output)
        print(f"Created: {output_path}")

process_folder(os.path.join(os.getcwd().split('e-GMAT_SalesCall_DataAnalysis')[0],"e-GMAT_SalesCall_Transcripts","annotated"))


Created: C:\GOWDATA\PROJECTS\e-GMAT-SalesCall-Analysis\e-GMAT_SalesCall_Transcripts\annotated\1_annot_rebuilt.txt
Created: C:\GOWDATA\PROJECTS\e-GMAT-SalesCall-Analysis\e-GMAT_SalesCall_Transcripts\annotated\105_annot_rebuilt.txt
Created: C:\GOWDATA\PROJECTS\e-GMAT-SalesCall-Analysis\e-GMAT_SalesCall_Transcripts\annotated\22_annot_rebuilt.txt
Created: C:\GOWDATA\PROJECTS\e-GMAT-SalesCall-Analysis\e-GMAT_SalesCall_Transcripts\annotated\47_annot_rebuilt.txt
Created: C:\GOWDATA\PROJECTS\e-GMAT-SalesCall-Analysis\e-GMAT_SalesCall_Transcripts\annotated\63_annot_rebuilt.txt
Created: C:\GOWDATA\PROJECTS\e-GMAT-SalesCall-Analysis\e-GMAT_SalesCall_Transcripts\annotated\71_annot_rebuilt.txt
Created: C:\GOWDATA\PROJECTS\e-GMAT-SalesCall-Analysis\e-GMAT_SalesCall_Transcripts\annotated\8_annot_rebuilt.txt
Created: C:\GOWDATA\PROJECTS\e-GMAT-SalesCall-Analysis\e-GMAT_SalesCall_Transcripts\annotated\88_annot_rebuilt.txt


## Make annotated transcripts CSV ready for training

In [6]:
import os
import re
import csv

def parse_rebuilt_file(content, transcript_id):
    """Parses one rebuilt file and returns list of rows with transcript ID."""
    blocks = content.strip().split("\n\n")
    parsed_rows = []

    for block in blocks:
        lines = block.strip().split("\n")
        if len(lines) >= 2:
            timestamp = lines[0].strip()
            text = lines[1].strip()
            speaker = ""
            if len(lines) > 2 and lines[2].startswith("Speaker:"):
                speaker = lines[2].replace("Speaker:", "").strip()
            parsed_rows.append((transcript_id, timestamp, text, speaker))
    
    return parsed_rows

def combine_all_to_csv(folder_path, output_csv_path):
    files = os.listdir(folder_path)
    rebuilt_files = [f for f in files if f.endswith("_annot_rebuilt.txt")]

    all_rows = []

    for filename in rebuilt_files:
        match = re.match(r"(\d+)_annot_rebuilt\.txt", filename)
        if not match:
            continue
        transcript_id = match.group(1)

        input_path = os.path.join(folder_path, filename)
        with open(input_path, "r", encoding="utf-8") as f:
            content = f.read()

        rows = parse_rebuilt_file(content, transcript_id)
        all_rows.extend(rows)

    # Write combined CSV
    with open(output_csv_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["transcript_id", "timestamp", "text", "speaker"])
        writer.writerows(all_rows)

    print(f"Combined CSV written to: {output_csv_path}")

combine_all_to_csv(os.path.join(os.getcwd().split('e-GMAT_SalesCall_DataAnalysis')[0],"e-GMAT_SalesCall_Transcripts","annotated")), "transcripts_annot.csv")


Combined CSV written to: transcripts_annot.csv


## Pre process CSV file

In [42]:
data = pd.read_csv("transcripts_annot.csv")
data.tail()

Unnamed: 0,transcript_id,timestamp,text,speaker
3955,8,59:11:00,Thank you so much.,prospect
3956,8,59:12:00,Bye.,agent
3957,8,59:12:00,Bye.,agent
3958,8,59:12:00,"Yeah, bye.",agent
3959,8,59:14:00,Bye.,prospect


In [32]:
data['speaker'].isna().sum()

0

In [43]:
def format_speaker(speaker):
    return speaker.lower()

data['speaker'] = data['speaker'].apply(format_speaker)

In [44]:
data['speaker'].value_counts()

speaker
agent       3240
prospect     707
unknown       13
Name: count, dtype: int64

In [46]:
data.to_csv("transcripts_annot_classify.csv")

## Speaker Classification pipeline

In [33]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import random

MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3
LABEL_MAP = {"agent": 0, "prospect": 1, "unknown": 2}
NUM_LABELS = len(LABEL_MAP)
DEVICE = torch.device("cpu")

In [88]:
df = pd.read_csv("transcripts_annot_classify.csv")

df["speaker"] = df["speaker"].fillna("Unknown")
df["label"] = df["speaker"].map(LABEL_MAP)

In [48]:
df

Unnamed: 0.1,Unnamed: 0,transcript_id,timestamp,text,speaker,label
327,327,1,00:44,"Hey Nikita, yes, hey, Hi Amruth, hey, hi again.",agent,0
328,328,1,00:50,Happy New year And I assume that the set of co...,agent,0
329,329,1,00:59,Its the same for you as well.,agent,0
330,330,1,01:00,Nikita.,agent,0
965,965,1,01:00:02,You are going to have that connect with your m...,agent,0
...,...,...,...,...,...,...
322,322,105,43:42:00,"All right, take care then.",agent,0
323,323,105,43:45:00,See you.,agent,0
324,324,105,43:46:00,Thank you.,prospect,1
325,325,105,43:47:00,Bye.,agent,0


In [55]:
grouped = df.sort_values(by=["transcript_id", "timestamp"]).groupby("transcript_id")
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000025B0D7CDA50>

In [59]:
dialogues = []
for _, group in grouped:
    utterances = []
    for _, row in group.iterrows():
        utterances.append((row["text"], row["label"]))
    dialogues.append(utterances)
dialogues[0]

[('Hey Nikita, yes, hey, Hi Amruth, hey, hi again.', 0),
 ('Happy New year And I assume that the set of concerns which and Nivesh has you know mentioned that that we need to address in the call.',
  0),
 ('Its the same for you as well.', 0),
 ('Nikita.', 0),
 ('You are going to have that connect with your mentor in detail to actually understand or or to strategise and give you that corrective factors in terms of how to improve your time in those particular questions and what is exactly so like the way if you know.',
  0),
 ('We have a cricket team of analysts, right, For every, every top cricket teams.',
  0),
 ("Now we have analysts coming into play to help the players understand what's his strength, what's his weakness and how to how to navigate things, how to improve with the help of coach.",
  0),
 ('So mentors are basically a coach with a high analytic support or analytics behind it to give you that instructive feedbacks.',
  0),
 ('OK.', 1),
 ('And so we will be collecting Yeah, 

In [60]:
split = int(0.8 * len(dialogues))
train_dialogues = dialogues[:split]
test_dialogues = dialogues[split:]

In [61]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_utterance(text):
    return tokenizer(text, padding="max_length", truncation=True, max_length=MAX_LEN, return_tensors="pt")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [62]:
class TranscriptDataset(Dataset):
    def __init__(self, dialogues):
        self.samples = []
        for dialogue in dialogues:
            for text, label in dialogue:
                self.samples.append((text, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        text, label = self.samples[idx]
        tokens = tokenize_utterance(text)
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'label': torch.tensor(label)
        }

In [63]:
train_dataset = TranscriptDataset(train_dialogues)
test_dataset = TranscriptDataset(test_dialogues)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [64]:
class TranscriptClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.classifier = nn.Linear(self.bert.config.hidden_size, NUM_LABELS)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS]
        logits = self.classifier(cls_output)
        return logits

model = TranscriptClassifier().to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [65]:
def train():
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(train_loader)

In [66]:
def evaluate():
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['label'].to(DEVICE)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    report = classification_report(all_labels, all_preds, target_names=LABEL_MAP.keys())
    print(report)

In [67]:
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    train_loss = train()
    print(f"Train Loss: {train_loss:.4f}")
    evaluate()


Epoch 1/3


100%|████████████████████████████████████████████████████████████████████████████████| 214/214 [24:56<00:00,  6.99s/it]


Train Loss: 0.3822


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       agent       0.83      0.97      0.89       436
    prospect       0.46      0.13      0.20        95
     unknown       0.00      0.00      0.00        10

    accuracy                           0.81       541
   macro avg       0.43      0.37      0.36       541
weighted avg       0.75      0.81      0.76       541


Epoch 2/3


100%|████████████████████████████████████████████████████████████████████████████████| 214/214 [23:36<00:00,  6.62s/it]


Train Loss: 0.2828


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       agent       0.84      0.97      0.90       436
    prospect       0.57      0.24      0.34        95
     unknown       0.00      0.00      0.00        10

    accuracy                           0.82       541
   macro avg       0.47      0.40      0.41       541
weighted avg       0.78      0.82      0.79       541


Epoch 3/3


100%|████████████████████████████████████████████████████████████████████████████████| 214/214 [24:45<00:00,  6.94s/it]


Train Loss: 0.2073
              precision    recall  f1-score   support

       agent       0.89      0.95      0.92       436
    prospect       0.67      0.51      0.57        95
     unknown       0.00      0.00      0.00        10

    accuracy                           0.86       541
   macro avg       0.52      0.49      0.50       541
weighted avg       0.83      0.86      0.84       541



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [78]:
torch.save(model.state_dict(), "speaker_model.pth")

## Populate speaker class across transcripts with classification

In [97]:
import os
import re

def parse_txt_file(content, transcript_id, month):
    """returns list of rows"""
    blocks = content.strip().split("\n\n")
    parsed_rows = []

    for block in blocks:
        lines = block.strip().split("\n")
        if len(lines) >= 2:
            timestamp = lines[0].strip()
            ts = timestamp.split(':')
            if len(ts) < 3:
                ts.insert(0,'00')
            for i in range(len(ts)):
                if len(ts[i]) < 2:
                    ts[i] = '0' + ts[i]
            timestamp = ":".join(ts)
            
            text = lines[1].strip()
            parsed_rows.append((month, transcript_id, timestamp, text))
    
    return parsed_rows

def combine_all_to_csv(folder_path, output_csv_path):
    all_rows = []
    for dir in os.listdir(folder_path):
        month = dir.split(" ")[0]
        print(dir)
        files = os.listdir(os.path.join(folder_path, dir))

        for filename in files:
            transcript_id = filename.split(".txt")[0]
            print(transcript_id)
            
            input_path = os.path.join(folder_path, dir, filename)
            with open(input_path, "r", encoding="utf-8") as f:
                content = f.read()
    
            rows = parse_txt_file(content, transcript_id, month)
            all_rows.extend(rows)

    # Write combined CSV
    with open(output_csv_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["month", "transcript_id", "timestamp", "text"])
        writer.writerows(all_rows)

    print(f"Combined CSV written to: {output_csv_path}")

combine_all_to_csv(os.path.join(os.getcwd().split('e-GMAT_SalesCall_DataAnalysis')[0],"e-GMAT_SalesCall_Transcripts","transcripts")), "transcripts.csv")

Feb Files
100_20250213
101_20250212
102_20250212
103_20250212
104_20250212
105_20250215
106_20250215
107_20250215
108_20250216
109_20250216
110_20250216
111_20250216
112_20250213
113_20250217
114_20250214
115_20250214
116_20250217
117_20250217
118_20250217
119_20250217
120_20250218
121_20250218
122_20250219
123_20250219
124_20250219
125_20250221
126_20250221
127_20250222
128_20250223
129_20250223
130_20250219
131_20250224
132_20250224
133_20250225
134_20250226
135_20250226
136_20250226
137_20250227
138_20250226
139_20250227
140_20250227
141_20250228
142_20250224
74_20250202
75_20250202
76_20250202
77_20250203
78_20250204
79_20250204
80_20250204
81_20250205
82_20250205
83_20250205
84_20250206
85_20250206
86_20250207
87_20250207
88_20250207
89_20250208
90_20250208
91_20250208
92_20250208
93_20250210
94_20250210
95_20250211
96_20250211
97_20250212
98_20250212
99_20250211
Jan Files
10_20250107
11_20250107
12_20250107
13_20250108
14_20250110
15_20250110
16_20250110
17_20250110
18_20250111
1

In [77]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn

DEVICE = torch.device("cpu")
LABEL_MAP = {0: "agent", 1: "prospect", 2: "unknown"}
MAX_LEN = 128

In [79]:
class TranscriptClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.classifier = nn.Linear(self.bert.config.hidden_size, 3)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS]
        logits = self.classifier(cls_output)
        return logits

# Load model
model = TranscriptClassifier().to(DEVICE)
model.load_state_dict(torch.load("speaker_model.pth"))
model.eval()

TranscriptClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [98]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
df = pd.read_csv("transcripts.csv")

df = df.sort_values(by=["transcript_id", "timestamp"])

In [81]:
predictions = []

with torch.no_grad():
    for _, row in df.iterrows():
        text = row["text"]
        tokens = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=MAX_LEN)
        input_ids = tokens["input_ids"].to(DEVICE)
        attention_mask = tokens["attention_mask"].to(DEVICE)

        outputs = model(input_ids, attention_mask)
        pred = torch.argmax(outputs, dim=1).item()
        predictions.append(LABEL_MAP[pred])

In [99]:
df["predicted_speaker"] = predictions
df

Unnamed: 0,month,transcript_id,timestamp,text,predicted_speaker
0,Feb,100_20250213,00:00:02,"Matt, so basically I have completed my B.Tech ...",prospect
1,Feb,100_20250213,00:00:14,OK.,agent
2,Feb,100_20250213,00:00:14,So after that I have joined PwC India and I ha...,prospect
3,Feb,100_20250213,00:00:33,OK.,agent
4,Feb,100_20250213,00:00:34,It was the reason I will tell you it was that ...,prospect
...,...,...,...,...,...
79671,Jan,9_20250106,00:29:18,"Yeah, thank you.",prospect
79672,Jan,9_20250106,00:29:20,Thank you.,prospect
79673,Jan,9_20250106,00:29:20,Bye.,agent
79674,Jan,9_20250106,00:29:20,Bye.,agent


In [100]:
df.to_csv("transcripts.csv", index=False)