In [1]:
import pandas as pd

In [2]:
import json
import pandas as pd

def load_and_structure_data(json_file_path):
    """Load a transcript JSON file and flatten it to one row per dialogue turn.

    The payload may be either a bare list of conversation objects or a dict
    that wraps that list under the ``'conversations'`` key. For every
    conversation the ``call_id`` and ``outcome_event`` values are repeated on
    each turn found in its ``transcript`` list; keys that are absent yield
    ``None`` in the corresponding cell.

    Malformed entries (conversations or turns that are not JSON objects, or a
    transcript that is not a list) are skipped rather than raising.

    Args:
        json_file_path: Path to the JSON file to read.

    Returns:
        pandas.DataFrame: one row per turn with the columns ``call_id``,
        ``outcome_event``, ``speaker``, ``text`` and ``turn_id``. If no turn
        is found the frame is completely empty (no rows and no columns).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if isinstance(data, dict):
        # Replace 'conversations' with the actual top-level key in your JSON.
        conversations_list = data.get('conversations', [])
    else:
        conversations_list = data

    # A null, scalar or object payload holds no list of conversations.
    if not isinstance(conversations_list, list):
        conversations_list = []

    flattened_records = []

    for conversation in conversations_list:
        # Skip malformed entries instead of failing on .get().
        if not isinstance(conversation, dict):
            continue

        call_id = conversation.get('call_id')
        outcome = conversation.get('outcome_event')

        # Covers a missing key as well as an explicit `"transcript": null`.
        turns = conversation.get('transcript')
        if not isinstance(turns, list):
            continue

        # One record per turn keeps speaker labels and the original sequence.
        for turn in turns:
            # Apply the same tolerance to turns as to conversations.
            if not isinstance(turn, dict):
                continue
            flattened_records.append({
                "call_id": call_id,
                "outcome_event": outcome,
                "speaker": turn.get('speaker'),
                "text": turn.get('text'),
                "turn_id": turn.get('turn_id')
            })

    return pd.DataFrame(flattened_records)

# Execution: flatten the dataset and preview the first five rows.
df = load_and_structure_data(json_file_path='Conversational_Transcript_Dataset.json')
print(df.head(5))

Empty DataFrame
Columns: []
Index: []


In [3]:
def load_and_structure_data(json_file_path):
    """Load a transcript JSON file and flatten it to one row per dialogue turn.

    The top-level type of the parsed payload is printed as a debugging aid.
    The list of conversations is then located as follows:

    * a dict payload: the value under the first of ``'conversations'``,
      ``'data'`` or ``'calls'`` that is present; if none is present and the
      dict itself has a ``'call_id'`` key it is treated as one conversation;
    * any other payload is taken to be the list itself.

    Malformed entries (conversations or turns that are not JSON objects, or a
    transcript that is not a list) are skipped rather than raising.

    Args:
        json_file_path: Path to the JSON file to read.

    Returns:
        pandas.DataFrame: one row per turn with the columns ``call_id``,
        ``outcome_event``, ``speaker``, ``text`` and ``turn_id``. If no turn
        is found the frame is completely empty (no rows and no columns).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Debug: Check what the top-level structure is
    print(f"Data type: {type(data)}")

    # If it's a dict, find the list of conversations
    if isinstance(data, dict):
        # Common keys are 'conversations', 'data', or 'calls'
        for key in ['conversations', 'data', 'calls']:
            if key in data:
                conversations_list = data[key]
                break
        else:
            # If no key found, check if the dict itself contains one conversation
            conversations_list = [data] if 'call_id' in data else []
    else:
        conversations_list = data

    # Whatever was found must be a list to be iterated as conversations.
    if not isinstance(conversations_list, list):
        conversations_list = []

    flattened_records = []

    for conversation in conversations_list:
        # Skip malformed entries instead of failing on .get().
        if not isinstance(conversation, dict):
            continue

        call_id = conversation.get('call_id')
        outcome = conversation.get('outcome_event')

        # Covers a missing key as well as an explicit `"transcript": null`.
        transcript = conversation.get('transcript')
        if not isinstance(transcript, list):
            continue

        for turn in transcript:
            if not isinstance(turn, dict):
                continue
            flattened_records.append({
                "call_id": call_id,
                "outcome_event": outcome,
                "speaker": turn.get('speaker'),
                "text": turn.get('text'),
                "turn_id": turn.get('turn_id')
            })

    return pd.DataFrame(flattened_records)

# Reload with the key-probing loader and report whether any turns were found.
df = load_and_structure_data(json_file_path='Conversational_Transcript_Dataset.json')
if not df.empty:
    print(f"Successfully loaded {len(df)} turns.")
    print(df.head(5))
else:
    print("DataFrame is still empty. Please check the JSON keys.")

Data type: <class 'dict'>
DataFrame is still empty. Please check the JSON keys.


In [4]:
import json
import pandas as pd

def fix_and_load_data(json_file_path):
    """Discover the transcript layout of a JSON file and flatten it per turn.

    The conversations are read from the top-level ``'transcripts'`` key. The
    first conversation is inspected to find the first key whose value is a
    list; that key is then used as the per-conversation list of turns. The
    discovered structure is printed as a debugging aid.

    Field fallbacks: ``call_id``/``id``, ``outcome_event``/``outcome``,
    ``speaker``/``role``, ``text``/``content`` and ``turn_id``/``index``.
    Only ``None`` and the empty string count as "missing", so a legitimate
    ``0`` (e.g. the first ``turn_id``) is kept instead of being replaced by
    the fallback key.

    Args:
        json_file_path: Path to the JSON file to read.

    Returns:
        pandas.DataFrame: one row per turn with the columns ``call_id``,
        ``outcome_event``, ``speaker``, ``text`` and ``turn_id``. A completely
        empty frame is returned when ``'transcripts'`` is absent or empty, or
        when no turn is found.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    def _first_present(record, *keys):
        """Return the first value under ``keys`` that is neither None nor ''.

        When every candidate is missing, the last looked-up value is returned,
        which matches what the former ``a or b`` chain produced for strings.
        """
        value = None
        for key in keys:
            value = record.get(key)
            if value is not None and value != "":
                return value
        return value

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Access the main list (a non-object payload has no 'transcripts' key).
    convs = data.get('transcripts', []) if isinstance(data, dict) else []

    if not isinstance(convs, list) or not convs:
        available_keys = list(data.keys()) if isinstance(data, dict) else []
        print(f"Key 'transcripts' not found. Available keys are: {available_keys}")
        return pd.DataFrame()

    # DEBUG: Let's see what one conversation looks like
    print("--- Structure Discovery ---")
    sample = convs[0] if isinstance(convs[0], dict) else {}
    print(f"Conversation Keys: {list(sample.keys())}")

    # Find the transcript key inside the conversation
    # It might be 'transcript', 'dialogue', 'turns', etc.
    t_key = next((k for k in sample.keys() if isinstance(sample[k], list)), None)
    print(f"Likely transcript key: {t_key}")

    flattened_records = []
    for conversation in convs:
        # Skip malformed entries instead of failing on .get().
        if not isinstance(conversation, dict):
            continue

        cid = _first_present(conversation, 'call_id', 'id')
        out = _first_present(conversation, 'outcome_event', 'outcome')

        # Use the discovered transcript key; no key means no turns.
        turns = conversation.get(t_key) if t_key is not None else None
        if not isinstance(turns, list):
            continue

        for turn in turns:
            if not isinstance(turn, dict):
                continue
            flattened_records.append({
                "call_id": cid,
                "outcome_event": out,
                "speaker": _first_present(turn, 'speaker', 'role'),
                "text": _first_present(turn, 'text', 'content'),
                "turn_id": _first_present(turn, 'turn_id', 'index')
            })

    return pd.DataFrame(flattened_records)

# Run the structure-discovering loader and preview the result when it found turns.
df = fix_and_load_data(json_file_path='Conversational_Transcript_Dataset.json')
if not df.empty:
    print("\n--- Success! ---", df.head(5), sep="\n")

--- Structure Discovery ---
Conversation Keys: ['transcript_id', 'time_of_interaction', 'domain', 'intent', 'reason_for_call', 'conversation']
Likely transcript key: conversation

--- Success! ---
  call_id outcome_event   speaker  \
0    None          None     Agent   
1    None          None  Customer   
2    None          None     Agent   
3    None          None  Customer   
4    None          None     Agent   

                                                text turn_id  
0  Hello, thank you for contacting BuyNow. This i...    None  
1  Hello, I'm calling about an order that shows d...    None  
2  I'm sorry to hear that. I'll definitely help y...    None  
3  It's 9595912. The tracking was marked delivere...    None  
4  Let me pull that up right away. Okay, I see th...    None  


In [5]:
import json
import pandas as pd

def load_and_structure_data(json_file_path):
    """Load the transcript dataset and flatten it to one row per dialogue turn.

    Conversations are read from the top-level ``'transcripts'`` list. For each
    one, ``transcript_id`` becomes ``call_id``, ``intent`` becomes
    ``outcome_event``, and every item of its ``'conversation'`` list becomes a
    row. Keys that are absent yield ``None`` in the corresponding cell.

    Malformed entries (conversations or turns that are not JSON objects, or a
    ``'conversation'`` value that is not a list) are skipped.

    Args:
        json_file_path: Path to the JSON file to read.

    Returns:
        pandas.DataFrame: one row per turn. The columns ``call_id``,
        ``outcome_event``, ``speaker``, ``text`` and ``turn_id`` are always
        present, even when no turn was found, so column lookups on the result
        never raise ``KeyError``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    columns = ["call_id", "outcome_event", "speaker", "text", "turn_id"]

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    convs = data.get('transcripts') if isinstance(data, dict) else None
    if not isinstance(convs, list):
        convs = []

    flattened_records = []

    for conversation in convs:
        if not isinstance(conversation, dict):
            continue

        # Mapping the dataset's own keys onto the flattened schema.
        call_id = conversation.get('transcript_id')
        # NOTE(review): 'intent' stands in for the outcome here;
        # 'reason_for_call' is the other candidate -- confirm which is wanted.
        outcome = conversation.get('intent')

        turns = conversation.get('conversation')
        if not isinstance(turns, list):
            continue

        for turn in turns:
            if not isinstance(turn, dict):
                continue
            flattened_records.append({
                "call_id": call_id,
                "outcome_event": outcome,
                "speaker": turn.get('speaker'),
                "text": turn.get('text'),
                "turn_id": turn.get('turn_id')
            })

    # Passing `columns` fixes the schema even for an empty record list.
    return pd.DataFrame(flattened_records, columns=columns)

# Flatten the full dataset with the final key mapping and report its size.
df = load_and_structure_data(json_file_path='Conversational_Transcript_Dataset.json')
print(f"Loaded {len(df)} turns across {df['call_id'].nunique()} unique calls.")

Loaded 84465 turns across 5037 unique calls.


In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class EvidenceIndexer:
    """TF-IDF index over dialogue turns for retrieving supporting evidence.

    Call ``create_index`` once with the flattened turn DataFrame, then
    ``get_evidence`` with a free-text query to obtain the most similar turns.
    """

    def __init__(self):
        # TF-IDF is built into sklearn; English stop words are dropped.
        self.vectorizer = TfidfVectorizer(stop_words='english')
        # Matrix of turn vectors; stays None until create_index() has run.
        self.tfidf_matrix = None
        # The indexed frame with a fresh 0..n-1 index, so positions returned
        # by the similarity search line up with its rows.
        self.metadata = None

    def create_index(self, dataframe):
        """Fit the vectorizer on the ``text`` column and store the index.

        Args:
            dataframe: DataFrame with a ``text`` column; ``get_evidence``
                additionally reads ``call_id``, ``speaker`` and
                ``outcome_event`` from it.

        Raises:
            KeyError: If ``dataframe`` has no ``text`` column.
        """
        self.metadata = dataframe.reset_index(drop=True)
        # Missing text becomes an empty document; astype(str) keeps stray
        # non-string values (e.g. numbers) from breaking the vectorizer.
        text_data = self.metadata['text'].fillna("").astype(str)

        self.tfidf_matrix = self.vectorizer.fit_transform(text_data)
        print(f"Index successfully built with {self.tfidf_matrix.shape[0]} dialogue turns.")

    def get_evidence(self, query, top_k=3):
        """Return up to ``top_k`` indexed turns most similar to ``query``.

        Args:
            query: Free-text query string.
            top_k: Maximum number of turns to return. Values <= 0 yield an
                empty result.

        Returns:
            pandas.DataFrame with the columns ``call_id``, ``speaker``,
            ``text`` and ``outcome_event``, ordered from most to least
            similar. Turns whose similarity to the query is zero are never
            returned, so fewer than ``top_k`` rows (possibly none) may come
            back.

        Raises:
            RuntimeError: If ``create_index`` has not been called yet.
        """
        if self.tfidf_matrix is None or self.metadata is None:
            raise RuntimeError("Index has not been built; call create_index() first.")

        columns = ['call_id', 'speaker', 'text', 'outcome_event']

        # `[-0:]` would select every row, so handle non-positive top_k up front.
        if top_k <= 0:
            return self.metadata.iloc[0:0][columns].copy()

        # Convert user query to the same TF-IDF space
        query_vec = self.vectorizer.transform([query])

        # Similarity between the query and every indexed turn
        cosine_sim = cosine_similarity(query_vec, self.tfidf_matrix).flatten()

        # Positions of the top_k highest scores, best first
        relevant_indices = cosine_sim.argsort()[-top_k:][::-1]
        # A zero score means no shared vocabulary: that is not evidence.
        relevant_indices = relevant_indices[cosine_sim[relevant_indices] > 0]

        results = self.metadata.iloc[relevant_indices].copy()
        return results[columns]

# --- EXECUTION ---
# Index every flattened turn, then run one smoke-test query to confirm that
# identifiable rows of the dataset come back.
indexer = EvidenceIndexer()
indexer.create_index(dataframe=df)

print(indexer.get_evidence(query="customer delivery delay complaint", top_k=3))

Index successfully built with 84465 dialogue turns.
                   call_id speaker  \
52896  4265-9695-7361-8662   Agent   
9775   6043-7841-9619-4424   Agent   
1819   7038-2056-8606-6726   Agent   

                                                    text  \
52896  I'm sorry to hear about the delay. Let me chec...   
9775   I'm sorry to hear about the delay. Let me chec...   
1819   I'm sorry to hear about the delay. Let me chec...   

                                         outcome_event  
52896  Multiple Issues - Order Status & Account Access  
9775   Multiple Issues - Order Status & Account Access  
1819   Multiple Issues - Order Status & Account Access  
