In [None]:
import numpy
import json

In [None]:
# Load the raw transcripts into `data`.
# The encoding is passed explicitly (as the later load cells in this notebook
# already do) so decoding does not depend on the platform's default locale.
with open("transcripts.json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [None]:
# Number of top-level entries in the loaded transcripts JSON.
len(data)

In [None]:
# Peek at the field names of the first record
# (assumes `data` is a non-empty list of dicts — raises otherwise).
data[0].keys()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# --- Part 1: field coverage across records ---------------------------------
# Tally how many records carry each top-level key and collect the union of keys.
total_records = len(data)
field_counts = Counter()
all_keys = set()

for record in data:
    record_keys = record.keys()
    field_counts.update(record_keys)
    all_keys.update(record_keys)

# A field is "present in all" when its tally equals the record count; every
# other key in the union is absent from at least one record.
present_in_all = [name for name, seen in field_counts.items() if seen == total_records]
missing_in_some = [name for name in all_keys if name not in present_in_all]

print(f"Total records: {total_records}")
print(f"Fields present in all records: {present_in_all}")
print(f"Fields missing in some records: {missing_in_some}")

# --- Part 2: conversation lengths ------------------------------------------
# One length per record that has a 'conversation' key; records without it are skipped.
conversation_lengths = [
    len(record['conversation']) for record in data if 'conversation' in record
]

if not conversation_lengths:
    print("\n'conversation' field not found or empty in the data.")
else:
    # Summary statistics over the number of turns per conversation.
    avg_len = sum(conversation_lengths) / len(conversation_lengths)
    max_len = max(conversation_lengths)
    min_len = min(conversation_lengths)

    print("\nConversation Length Statistics (number of turns):")
    print(f"Average Length: {avg_len:.2f}")
    print(f"Max Length: {max_len}")
    print(f"Min Length: {min_len}")

    # Histogram of turns per conversation.
    plt.figure(figsize=(10, 6))
    plt.hist(conversation_lengths, bins=30, edgecolor='black')
    plt.xlabel('Number of Turns')
    plt.ylabel('Frequency')
    plt.title('Distribution of Conversation Lengths')
    plt.show()

In [None]:
# Preview domain / intent / reason_for_call for the first (up to) 10 records.
# Iterating over a slice instead of indexing with range(10) keeps the cell from
# raising IndexError when the dataset holds fewer than 10 records.
for i, entry in enumerate(data[:10]):
    print(f"\n-------Sample conversation {i+1}--------")
    print(f'domain: {entry.get("domain", "N/A")}')
    print(f'intent: {entry.get("intent", "N/A")}')
    print(f'reason_for_call: {entry.get("reason_for_call", "N/A")}')

In [None]:
def _show_frequency(values, label, heading):
    """Print the value counts of `values` and draw them as a bar chart.

    `label` is used in the plot title and as the x-axis label; `heading` is the
    line printed above the counts. Returns the value-counts Series.
    """
    counts = pd.Series(values).value_counts()
    print(heading)
    print(counts)

    plt.figure(figsize=(10, 5))
    counts.plot(kind='bar')
    plt.title(f'Frequency Distribution of {label}')
    plt.xlabel(label)
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    return counts


# Records that lack the field are counted under the 'N/A' bucket.
domain_counts = _show_frequency(
    [record.get('domain', 'N/A') for record in data],
    'Domain',
    "Domain Frequency Distribution:",
)

intent_counts = _show_frequency(
    [record.get('intent', 'N/A') for record in data],
    'Intent',
    "\nIntent Frequency Distribution:",
)

In [None]:
# Number of distinct intent values (this includes the 'N/A' fallback bucket
# when any record lacks an 'intent' field).
intent_counts.size

In [None]:
# Analyze text lengths in conversation turns to find buggy data
import pandas as pd
import matplotlib.pyplot as plt

# Collect the character length of every turn's text, both globally and per
# transcript, then flag transcripts containing an extreme outlier.
all_text_lengths = []
transcript_text_lengths = {}  # transcript_id -> list of per-turn text lengths

for entry in data:
    if 'conversation' not in entry:
        continue
    t_id = entry.get('transcript_id', 'unknown')
    # Turns without a 'text' key are ignored.
    lengths = [len(turn['text']) for turn in entry['conversation'] if 'text' in turn]
    all_text_lengths.extend(lengths)
    # Extend instead of assign: records sharing an id (in particular every
    # record that falls back to 'unknown') would otherwise overwrite one
    # another and silently drop out of the outlier scan below.
    transcript_text_lengths.setdefault(t_id, []).extend(lengths)

# Convert to series for stats
text_len_series = pd.Series(all_text_lengths)
print("Text Length Statistics:")
print(text_len_series.describe())

# Define threshold for "abnormally long"
# Using 99.9th percentile to catch extreme outliers which might be buggy
threshold = text_len_series.quantile(0.999)
print(f"\nThreshold for abnormal length (99.9th percentile): {threshold:.2f}")

# Find transcripts whose longest turn exceeds the threshold
flagged_transcripts = []
for t_id, lengths in transcript_text_lengths.items():
    max_len = max(lengths) if lengths else 0  # no text turns -> never flagged
    if max_len > threshold:
        flagged_transcripts.append({
            'transcript_id': t_id,
            'max_text_length': max_len
        })

print(f"\nFound {len(flagged_transcripts)} transcripts with abnormally long text fields (> {threshold:.2f} chars).")

# Sort by max length to see the worst offenders
flagged_transcripts.sort(key=lambda x: x['max_text_length'], reverse=True)

print("\nTop 10 Flagged Transcripts:")
for item in flagged_transcripts[:10]:
    print(f"ID: {item['transcript_id']}, Max Length: {item['max_text_length']}")

# Plot distribution (log-scaled counts so the sparse long tail stays visible)
plt.figure(figsize=(10, 6))
plt.hist(all_text_lengths, bins=50, log=True, edgecolor='black')
plt.title('Distribution of Text Lengths in Conversation Turns (Log Scale)')
plt.xlabel('Length of Text')
plt.ylabel('Frequency')
plt.axvline(threshold, color='r', linestyle='dashed', linewidth=1, label=f'Threshold ({threshold:.0f})')
plt.legend()
plt.show()

In [None]:
# Drop every transcript that contains at least one turn whose text is longer
# than `threshold` characters, then write the remaining transcripts to disk.
filtered_data, removed_ids = [], []
threshold = 300

for record in data:
    # True as soon as one turn with a 'text' key exceeds the limit.
    has_overlong_turn = 'conversation' in record and any(
        'text' in turn and len(turn['text']) > threshold
        for turn in record['conversation']
    )
    if has_overlong_turn:
        removed_ids.append(record.get('transcript_id', 'unknown'))
    else:
        filtered_data.append(record)

# Persist the surviving transcripts to a separate file (the input is untouched).
output_filename = "transcripts_filtered.json"
with open(output_filename, "w") as out:
    json.dump(filtered_data, out, indent=2)

# Report how many transcripts were kept and which ones were dropped.
print(f"Original count: {len(data)}")
print(f"Filtered count: {len(filtered_data)}")
print(f"Removed {len(removed_ids)} transcripts.")
print("Removed IDs:")
print(removed_ids)

In [None]:
# Analyze text length on filtered version
import pandas as pd
import matplotlib.pyplot as plt

# Read back the filtered transcripts. The encoding is explicit, matching the
# other utf-8 load cells in this notebook, so decoding does not depend on the
# platform default.
with open("transcripts_filtered.json", "r", encoding="utf-8") as file:
    data_filtered = json.load(file)
    
# Same text-length outlier scan as on the raw data, run on `data_filtered`.
all_text_lengths = []
transcript_text_lengths = {}  # transcript_id -> list of per-turn text lengths

for entry in data_filtered:
    if 'conversation' not in entry:
        continue
    t_id = entry.get('transcript_id', 'unknown')
    # Turns without a 'text' key are ignored.
    lengths = [len(turn['text']) for turn in entry['conversation'] if 'text' in turn]
    all_text_lengths.extend(lengths)
    # Extend instead of assign so that records sharing an id (e.g. all records
    # that fall back to 'unknown') accumulate rather than overwrite each other.
    transcript_text_lengths.setdefault(t_id, []).extend(lengths)

# Convert to series for stats
text_len_series = pd.Series(all_text_lengths)
print("Text Length Statistics:")
print(text_len_series.describe())

# Define threshold for "abnormally long"
# Using 99.9th percentile to catch extreme outliers which might be buggy
threshold = text_len_series.quantile(0.999)
print(f"\nThreshold for abnormal length (99.9th percentile): {threshold:.2f}")

# Find transcripts whose longest turn exceeds the threshold
flagged_transcripts = []
for t_id, lengths in transcript_text_lengths.items():
    max_len = max(lengths) if lengths else 0  # no text turns -> never flagged
    if max_len > threshold:
        flagged_transcripts.append({
            'transcript_id': t_id,
            'max_text_length': max_len
        })

print(f"\nFound {len(flagged_transcripts)} transcripts with abnormally long text fields (> {threshold:.2f} chars).")

# Sort by max length to see the worst offenders
flagged_transcripts.sort(key=lambda x: x['max_text_length'], reverse=True)

print("\nTop 10 Flagged Transcripts:")
for item in flagged_transcripts[:10]:
    print(f"ID: {item['transcript_id']}, Max Length: {item['max_text_length']}")

# Plot distribution (log-scaled counts so the sparse long tail stays visible)
plt.figure(figsize=(10, 6))
plt.hist(all_text_lengths, bins=50, log=True, edgecolor='black')
plt.title('Distribution of Text Lengths in Conversation Turns (Log Scale)')
plt.xlabel('Length of Text')
plt.ylabel('Frequency')
plt.axvline(threshold, color='r', linestyle='dashed', linewidth=1, label=f'Threshold ({threshold:.0f})')
plt.legend()
plt.show()

In [None]:
import json
# Rebind `data` to the contents of clean_transcripts.json, replacing whatever
# `data` held before this cell ran.
# NOTE(review): no cell visible in this notebook writes clean_transcripts.json
# (the filter cell writes transcripts_filtered.json) — presumably it is
# produced outside this notebook; confirm.
with open("clean_transcripts.json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [None]:
# Number of top-level entries currently held in `data`.
len(data)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Text-length outlier scan over the records currently in `data` (no histogram
# in this cell, only the statistics and the flagged list).
all_text_lengths = []
transcript_text_lengths = {}  # transcript_id -> list of per-turn text lengths

for entry in data:
    if 'conversation' not in entry:
        continue
    t_id = entry.get('transcript_id', 'unknown')
    # Turns without a 'text' key are ignored.
    lengths = [len(turn['text']) for turn in entry['conversation'] if 'text' in turn]
    all_text_lengths.extend(lengths)
    # Extend instead of assign so that records sharing an id (e.g. all records
    # that fall back to 'unknown') accumulate rather than overwrite each other.
    transcript_text_lengths.setdefault(t_id, []).extend(lengths)

# Convert to series for stats
text_len_series = pd.Series(all_text_lengths)
print("Text Length Statistics:")
print(text_len_series.describe())

# Define threshold for "abnormally long"
# Using 99.9th percentile to catch extreme outliers which might be buggy
threshold = text_len_series.quantile(0.999)
print(f"\nThreshold for abnormal length (99.9th percentile): {threshold:.2f}")

# Find transcripts whose longest turn exceeds the threshold
flagged_transcripts = []
for t_id, lengths in transcript_text_lengths.items():
    max_len = max(lengths) if lengths else 0  # no text turns -> never flagged
    if max_len > threshold:
        flagged_transcripts.append({
            'transcript_id': t_id,
            'max_text_length': max_len
        })

print(f"\nFound {len(flagged_transcripts)} transcripts with abnormally long text fields (> {threshold:.2f} chars).")

# Sort by max length to see the worst offenders
flagged_transcripts.sort(key=lambda x: x['max_text_length'], reverse=True)

print("\nTop 10 Flagged Transcripts:")
for item in flagged_transcripts[:10]:
    print(f"ID: {item['transcript_id']}, Max Length: {item['max_text_length']}")

In [None]:
# Characters after which no extra period is inserted when joining utterances.
_TRAILING_PUNCTUATION = ".!?,;:"


def _join_utterances(previous, addition):
    """Concatenate two utterances spoken back-to-back by the same speaker.

    ". " is inserted only when `previous` does not already end in punctuation,
    so "Hello." + "Hi" becomes "Hello. Hi" instead of "Hello.. Hi". Trailing
    whitespace of `previous` is stripped before joining; if either side is
    empty the other side is returned without a separator.
    """
    previous = previous.rstrip()
    if not previous:
        return addition
    if not addition:
        return previous
    separator = " " if previous[-1] in _TRAILING_PUNCTUATION else ". "
    return previous + separator + addition


# Collapse runs of consecutive turns by the same speaker into a single turn.
# NOTE: this rewrites `data` in place (each transcript's 'conversation' list is
# replaced), so re-running the cell finds nothing left to merge and reports 0.
merged_count = 0
for transcript in data:
    turns = transcript.get('conversation')
    if not turns:
        # Missing or empty conversation: nothing to merge.
        continue

    # Start from a copy of the first turn; copies keep the original turn dicts
    # untouched while their text is being extended.
    new_conversation = [turns[0].copy()]

    for turn in turns[1:]:
        last_turn = new_conversation[-1]
        if turn['speaker'] == last_turn['speaker']:
            # Same speaker as the previous kept turn: fold the text into it and
            # drop this turn (it is never appended).
            last_turn['text'] = _join_utterances(last_turn['text'], turn['text'])
            merged_count += 1
        else:
            new_conversation.append(turn.copy())

    transcript['conversation'] = new_conversation

print(f"Merged {merged_count} turns where consecutive speakers were the same.")

# Save the modified data to a new JSON file
output_file = "clean_transcripts_merged.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4)

print(f"Modified data saved to {output_file}")

In [None]:
import json
# Rebind `data` to the contents of final_clean_transcripts.json.
# NOTE(review): the merge cell in this notebook writes
# clean_transcripts_merged.json, not this file — presumably
# final_clean_transcripts.json is produced elsewhere; confirm.
with open("final_clean_transcripts.json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [None]:
# Sanity check: report every turn whose speaker equals the previous turn's
# speaker. Prints nothing when no transcript has such a pair.
#
# Adjacent turns are compared directly instead of tracking a `prev_speaker`
# sentinel tested with `== None`: with the sentinel, a turn whose speaker value
# is null was mistaken for "no previous turn yet", so runs of null-speaker
# turns were never reported. Missing 'conversation' / 'transcript_id' keys are
# tolerated the same way the other analysis cells in this notebook do.
for transcript in data:
    turns = transcript.get('conversation') or []
    t_id = transcript.get('transcript_id', 'unknown')
    for i in range(1, len(turns)):
        speaker = turns[i]['speaker']
        if speaker == turns[i - 1]['speaker']:
            print(f"Transcript ID: {t_id} | Turn Index: {i} | Speaker: {speaker}")

In [1]:
import json

In [4]:
# Load final_clean_transcripts.json into `data` (utf-8 decoded).
with open("final_clean_transcripts.json", "r", encoding="utf-8") as file:
    data = json.load(file)