In [1]:
import json
import pandas as pd

In [2]:
with open("Conversational_Transcript_Dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)


In [5]:
len(data["transcripts"])

5037

In [7]:
count:int=0
for chart in data["transcripts"]:
    count += len(chart.get("conversation"))

In [8]:
count

84465

In [11]:
data["transcripts"][0].keys()

dict_keys(['transcript_id', 'time_of_interaction', 'domain', 'intent', 'reason_for_call', 'conversation'])

In [21]:
unique_domain = set()
for chart in data["transcripts"]:
    unique_domain.add(chart.get("domain"))

In [20]:
len(unique_domain)

7

In [22]:
unique_domain

{'Banking & Finance',
 'E-commerce & Retail',
 'Healthcare Services',
 'Insurance',
 'Technology Support',
 'Telecommunications',
 'Travel & Hospitality'}

In [23]:
unique_intent = set()
for chart in data["transcripts"]:
    unique_intent.add(chart.get("intent"))

In [24]:
unique_intent

{'Account Access Issues',
 'Appointment Scheduling',
 'Business Event - Cyber Attack',
 'Business Event - Data Breach Response',
 'Business Event - Major Policy Changes',
 'Business Event - Network Outage',
 'Business Event - Policy Changes',
 'Business Event - Product Recall',
 'Business Event - Ransomware Attack',
 'Business Event - System Conversion Failure',
 'Business Event - System Outage',
 'Business Event - Warehouse Fire',
 'Claim Denials',
 'Delivery Investigation',
 'Escalation - Medical Error Complaint',
 'Escalation - Repeated Service Failures',
 'Escalation - Service Cancellation Threat',
 'Escalation - Threat of Legal Action',
 'Escalation - Unauthorized Account Closure',
 'Fraud Alert Investigation',
 'Multiple Issues - Appointment, Prescription & Insurance',
 'Multiple Issues - Billing & Payment Setup',
 'Multiple Issues - Billing, Plan Changes & Equipment',
 'Multiple Issues - Claim, Coverage & Policy',
 'Multiple Issues - Claims, Coverage & Policy Updates',
 'Multipl

In [25]:
transcripts = data['transcripts']

In [27]:
transcripts

[{'transcript_id': '6794-8660-4606-3216',
  'time_of_interaction': '2025-10-03 20:22:00',
  'domain': 'E-commerce & Retail',
  'intent': 'Delivery Investigation',
  'reason_for_call': 'Customer James Bailey reported a smart watch showing as delivered but never received, requiring delivery investigation and replacement shipment.',
  'conversation': [{'speaker': 'Agent',
    'text': 'Hello, thank you for contacting BuyNow. This is Emma. How can I help you?'},
   {'speaker': 'Customer',
    'text': "Hello, I'm calling about an order that shows delivered but I never received it."},
   {'speaker': 'Agent',
    'text': "I'm sorry to hear that. I'll definitely help you look into this. Can I get your order number?"},
   {'speaker': 'Customer',
    'text': "It's 9595912. The tracking was marked delivered yesterday afternoon, but there was nothing at my door."},
   {'speaker': 'Agent',
    'text': 'Let me pull that up right away. Okay, I see the order for a smart watch. The tracking does show it

In [39]:
ESCALATION_KEYWORDS = (
    "Escalation",
    "Business Event",
    "Ransomware",
    "Cyber Attack",
    "Data Breach",
    "System Outage",
    "Network Outage",
    "Fraud"
)


conversations_overview = []
for transcript in transcripts:
    intent = transcript.get('intent', '')
    short_intent = (next
        (
            (keyword for keyword in ESCALATION_KEYWORDS if keyword in intent),
            intent
        ))

    conversations_overview.append({
        'transcript_id': transcript['transcript_id'],
        'time_of_interaction': transcript['time_of_interaction'],
        'domain': transcript['domain'],
        'intent': transcript['intent'],
        'reason_for_call': transcript['reason_for_call'],
        'num_turns': len(transcript['conversation']),
        'short_intent': short_intent
    })


df = pd.DataFrame(conversations_overview)

df.to_csv(
    "conversations_overview.csv",
    index=False,
    encoding="utf-8-sig"
)

print("CSV saved as conversations_overview.csv")

CSV saved as conversations_overview.csv


In [37]:
conversations_overview

[{'transcript_id': '6794-8660-4606-3216',
  'time_of_interaction': '2025-10-03 20:22:00',
  'domain': 'E-commerce & Retail',
  'intent': 'Delivery Investigation',
  'reason_for_call': 'Customer James Bailey reported a smart watch showing as delivered but never received, requiring delivery investigation and replacement shipment.',
  'num_turns': 15,
  'short_intent': 'Delivery Investigation'},
 {'transcript_id': '7034-5430-2980-5483',
  'time_of_interaction': '2025-09-17 13:33:00',
  'domain': 'Healthcare Services',
  'intent': 'Escalation - Repeated Service Failures',
  'reason_for_call': 'Customer Jerry Chavez escalated to supervisor after experiencing multiple failed mobile app login and poor service recovery attempts over three weeks.',
  'num_turns': 17,
  'short_intent': 'Escalation'},
 {'transcript_id': '1846-5500-2990-8975',
  'time_of_interaction': '2025-04-24 15:31:00',
  'domain': 'Insurance',
  'intent': 'Fraud Alert Investigation',
  'reason_for_call': 'Customer Kyle Davis 

In [38]:
file_name = "conversations_overview.csv"

In [40]:
ESCALATION_KEYWORDS = (
    "Escalation",
    "Business Event",
    "Ransomware",
    "Cyber Attack",
    "Data Breach",
    "System Outage",
    "Network Outage",
    "Fraud"
)


utterances = []
for transcript in transcripts:
    intent = transcript.get('intent', '')
    short_intent = (next
        (
            (keyword for keyword in ESCALATION_KEYWORDS if keyword in intent),
            intent
        ))

    for turn_no, conversation in enumerate(transcript['conversation'], start=1):
        utterances.append({
        'transcript_id': transcript['transcript_id'],
        'domain': transcript['domain'],
        'intent': transcript['intent'],
        'reason_for_call': transcript['reason_for_call'],
        'short_intent': short_intent,
        'speaker': conversation.get('speaker'),
        'text': conversation.get('text'),
        'turn_no': turn_no
    })

In [43]:
df = pd.DataFrame(utterances)

df.to_csv(
    "utterances.csv",
    index=True,
    encoding="utf-8-sig"
)

print("CSV saved as utterances.csv")

CSV saved as utterances.csv


In [25]:
import pandas as pd
data = pd.read_csv("utterances.csv")

In [33]:
data.keys()

Index(['Unnamed: 0', 'transcript_id', 'domain', 'intent', 'reason_for_call',
       'short_intent', 'speaker', 'text', 'turn_no'],
      dtype='object')

In [None]:
data = pd.read_csv("utterances.csv")
data.columns = data.columns.str.lower()

for col in data.select_dtypes(include="object"):
    data[col] = data[col].str.lower()

data.drop(columns=['unnamed: 0'], errors='ignore', inplace=True)
data

In [36]:
data.duplicated().any()


np.False_

In [37]:
data.head()

Unnamed: 0,transcript_id,domain,intent,reason_for_call,short_intent,speaker,text,turn_no
0,6794-8660-4606-3216,e-commerce & retail,delivery investigation,customer james bailey reported a smart watch s...,delivery investigation,agent,"hello, thank you for contacting buynow. this i...",1
1,6794-8660-4606-3216,e-commerce & retail,delivery investigation,customer james bailey reported a smart watch s...,delivery investigation,customer,"hello, i'm calling about an order that shows d...",2
2,6794-8660-4606-3216,e-commerce & retail,delivery investigation,customer james bailey reported a smart watch s...,delivery investigation,agent,i'm sorry to hear that. i'll definitely help y...,3
3,6794-8660-4606-3216,e-commerce & retail,delivery investigation,customer james bailey reported a smart watch s...,delivery investigation,customer,it's 9595912. the tracking was marked delivere...,4
4,6794-8660-4606-3216,e-commerce & retail,delivery investigation,customer james bailey reported a smart watch s...,delivery investigation,agent,"let me pull that up right away. okay, i see th...",5


In [38]:
! pip install anthropic

Collecting anthropic
  Downloading anthropic-0.78.0-py3-none-any.whl.metadata (28 kB)
Downloading anthropic-0.78.0-py3-none-any.whl (405 kB)
Installing collected packages: anthropic
Successfully installed anthropic-0.78.0



[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
