# Baseline Conversation Analysis

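This notebook loads conversation data from CSV, parses it into Conversation objects, analyzes them with an LLM (OpenAI for the initial test batch, then a configurable provider — OpenAI, Hugging Face, Groq, or Ollama — for the full run), and saves the results to JSON.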

In [1]:
import json
import os
import sys
from datetime import datetime
from enum import Enum
from typing import List

import pandas as pd
from tqdm import tqdm

# Add utils to path
sys.path.append('utils')

# Import custom modules
from utils.conv.parser import ConversationParser
from utils.conv.conversation import Conversation, ConversationMap
from utils.llm import ConversationMapper

## 1. Load and Parse Conversations

In [2]:
# Build the parser for the raw CSV export.
# NOTE(review): time_threshold_minutes presumably is the gap that splits one
# user's messages into separate conversations — confirm in ConversationParser.
parser = ConversationParser(csv_file_path='data/data.csv', time_threshold_minutes=30)

print("Parsing conversations from CSV...")
conversations = parser.parse_conversations()

# Report how many conversations were produced and how many distinct users they cover
unique_user_ids = {conversation.user_id for conversation in conversations}
print(f"Parsed {len(conversations)} conversations")
print(f"Total users: {len(unique_user_ids)}")

Parsing conversations from CSV...
Parsed 2055 conversations
Total users: 150


## 2. Display Basic Statistics

In [3]:
# Aggregate statistics computed by the parser over the parsed conversations
stats = parser.get_conversation_stats(conversations)

print("\n=== Conversation Statistics ===")
for stat_name, stat_value in stats.items():
    print(f"{stat_name}: {stat_value}")

# Show the first three conversations, numbered from 1, with a 100-character text preview
print("\n=== Sample Conversations ===")
for number, sample in enumerate(conversations[:3], start=1):
    print(f"\nConversation {number}:")
    print(f"  ID: {sample.dialogue_id}")
    print(f"  User: {sample.user_id}")
    print(f"  Duration: {sample.duration_minutes:.1f} minutes")
    print(f"  Messages: {sample.message_count}")
    print(f"  Text preview: {sample.full_text[:100]}...")


=== Conversation Statistics ===
total_conversations: 2055
total_messages: 13194
unique_users: 150
avg_duration_minutes: 0.7400405515004055
avg_message_count: 6.420437956204379

=== Sample Conversations ===

Conversation 1:
  ID: 1
  User: 2147
  Duration: 0.1 minutes
  Messages: 3
  Text preview: Напиши, ФИО_1 пожелание "Хорошего дня!"
Вы хотите отправить сообщение ФИО_1 с пожеланием "Хорошего д...

Conversation 2:
  ID: 2
  User: 2147
  Duration: 0.2 minutes
  Messages: 3
  Text preview: Отправить сообщение
Сообщение с пожеланием "Хорошего дня!" успешно отправлено ФИО_1. Если нужно что-...

Conversation 3:
  ID: 3
  User: 4071
  Duration: 0.1 minutes
  Messages: 3
  Text preview: напиши кокориной елене спасибо
Вы хотите отправить сообщение ФИО_2 с текстом "спасибо" или записать ...


## 3. Process with OpenAI LLM Analysis

In [5]:
# Create the mapper whose map_conversation() is applied to each conversation below
mapper = ConversationMapper()

# Try a small slice first before committing to a full run
test_batch_size = 5
test_conversations = conversations[:test_batch_size]

print(f"Processing {len(test_conversations)} conversations with OpenAI...")

# Map each conversation in turn; tqdm renders the progress bar
analyzed_conversations = []
analyzed_conversations.extend(
    mapper.map_conversation(conversation)
    for conversation in tqdm(test_conversations, desc="Analyzing conversations")
)

print(f"Successfully analyzed {len(analyzed_conversations)} conversations")

Processing 5 conversations with OpenAI...


Analyzing conversations: 100%|██████████| 5/5 [01:11<00:00, 14.27s/it]

Successfully analyzed 5 conversations





## 4. Display Analysis Results

In [6]:
# Display analysis results for one sample conversation from the analyzed batch.
# Index 2 is the THIRD analyzed conversation (not the first). The explicit length
# check prevents an IndexError when fewer than three conversations were analyzed.
sample_index = 2
sample_analysis = (
    analyzed_conversations[sample_index].analysis
    if len(analyzed_conversations) > sample_index
    else None
)

if sample_analysis:
    print("\n=== Sample Analysis Results ===")
    print(f"Sentiment: {sample_analysis.sentiment} (confidence: {sample_analysis.sentiment_confidence:.2f})")
    print(f"Emotions: {sample_analysis.emotions}")
    print(f"Problems: {sample_analysis.problems}")
    print(f"Problem Severity: {sample_analysis.problem_severity}/10")
    print(f"Categories: {sample_analysis.category}")
    print(f"Intent: {sample_analysis.intent}")
    print(f"Feedback: {sample_analysis.feedback}")
    print(f"Suggestions: {sample_analysis.suggestions}")
    print(f"Is Successful: {sample_analysis.is_successful}")
else:
    # Say so explicitly instead of silently printing nothing
    print(f"No analysis available for the conversation at index {sample_index}")


=== Sample Analysis Results ===
Sentiment: SentimentType.NEUTRAL (confidence: 0.85)
Emotions: []
Problems: [<ProblemType.USER_CONFUSION: 'user_confusion'>]
Problem Severity: 3/10
Categories: [<CategoryType.COMMUNICATION: 'communication'>, <CategoryType.OTHER: 'other'>]
Intent: [<IntentType.GENERAL_INFO: 'general_info'>]
Feedback: []
Suggestions: []
Is Successful: False


In [None]:
# Install dependencies for lightweight LLM providers (run once).
# Uncomment the line for the provider you want to use; each option corresponds to
# one of the 'llm_provider' values configured in the mass-processing cell below.
# The commands use %pip (not !pip) so the install targets the running kernel's
# environment. Consider pinning versions (pkg==x.y.z) for reproducible runs.

# For Hugging Face models (FREE, local inference)
# %pip install transformers torch accelerate bitsandbytes

# For Groq API (FAST, requires free API key)
# %pip install groq

# For Ollama (LOCAL, requires Ollama server)
# %pip install ollama

# Nothing is installed by default; this only reminds the reader to pick an option above.
print("Dependencies installation cell - uncomment the provider you want to use above")

In [None]:
# Enhanced mass processing with support for multiple LLM providers
import asyncio
import time
from utils.conv_processor import ConversationProcessor

# ==================== LLM CONFIGURATION ====================
# Choose your LLM provider and configure parameters

# Option 1: OpenAI (original - requires API key and credits)
# llm_config = {
#     'llm_provider': 'openai',
#     'openai_client': None  # Uses default OpenAI client
# }

# Option 2: Hugging Face (FREE - runs locally on Kaggle)
llm_config = {
    'llm_provider': 'huggingface',
    'model_name': 'microsoft/Phi-3-mini-4k-instruct'  # Fast 3.8B model
    # Alternative options:
    # 'model_name': 'google/gemma-2b-it'  # Even faster 2B model
    # 'model_name': 'microsoft/DialoGPT-medium'  # Fastest 345M model
}

# Option 3: Groq (FAST - requires free API key)
# llm_config = {
#     'llm_provider': 'groq',
#     'api_key': 'your_groq_api_key_here',
#     'model_name': 'llama3-8b-8192'  # Fast inference
# }

# Option 4: Ollama (LOCAL - requires Ollama running)
# llm_config = {
#     'llm_provider': 'ollama',
#     'model_name': 'llama3.1:8b',
#     'base_url': 'http://localhost:11434'
# }

# ==================== PROCESSING CONFIGURATION ====================
processing_config = {
    'max_concurrent_requests': 5,  # Reduce for lighter models
    'batch_size': 25,  # Smaller batches for stability
    'progress_file': f"processing_progress_{llm_config['llm_provider']}.json",
    'results_file': f"analyzed_conversations_{llm_config['llm_provider']}.json"
}

# ==================== INITIALIZE AND RUN ====================
print(f"Using LLM provider: {llm_config['llm_provider']}")
print(f"Model: {llm_config.get('model_name', 'default')}")

# Initialize processor
processor = ConversationProcessor(
    **processing_config,
    **llm_config
)

# Handle resume processing
should_resume, start_index = processor.resume_processing_prompt(len(conversations))

# Get user confirmation
if processor.get_user_confirmation(len(conversations), start_index):
    print(f"Starting processing with {llm_config['llm_provider']} provider...")
    print(f"Processing {len(conversations) - start_index} conversations...")
    
    # Run the concurrent processing
    start_time = time.time()
    results = await processor.process_conversations_concurrent(conversations, start_index)
    end_time = time.time()
    
    print(f"\n=== Processing Complete ===")
    print(f"Provider: {llm_config['llm_provider']}")
    print(f"Total time: {end_time - start_time:.2f} seconds")
    print(f"Results saved to: {processor.results_file}")
    print(f"Average time per conversation: {(end_time - start_time) / (len(conversations) - start_index):.2f} seconds")
    
    # Clean up progress file
    processor.cleanup_progress()
    
    analyzed_conversations = results
    
else:
    print("Processing cancelled")
    analyzed_conversations = []

## 6. Save Results to JSON

In [None]:
# Convert to JSON-serializable format
def _to_jsonable(value):
    """Recursively replace Enum members with their ``.value``.

    Lists and tuples become lists and dicts are rebuilt with converted keys and
    values; any other object is returned unchanged.
    """
    if isinstance(value, Enum):
        return _to_jsonable(value.value)
    if isinstance(value, dict):
        return {_to_jsonable(key): _to_jsonable(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_jsonable(item) for item in value]
    return value


def conversation_to_dict(conv: "Conversation") -> dict:
    """Convert a Conversation object to a dictionary for JSON serialization.

    Args:
        conv: Object exposing ``dialogue_id``, ``user_id``, ``start_time``,
            ``end_time``, ``duration_minutes``, ``message_count``, ``full_text``,
            ``departments`` and ``analysis``. ``start_time`` and ``end_time``
            must provide ``isoformat()``.

    Returns:
        A dict with the conversation fields and an ``'analysis'`` entry. That
        entry is ``None`` when ``conv.analysis`` is falsy, otherwise a dict of
        the analysis fields with Enum members replaced by their values.
    """
    result = {
        'dialogue_id': conv.dialogue_id,
        'user_id': conv.user_id,
        'start_time': conv.start_time.isoformat(),
        'end_time': conv.end_time.isoformat(),
        'duration_minutes': conv.duration_minutes,
        'message_count': conv.message_count,
        'full_text': conv.full_text,
        'departments': conv.departments,
        'analysis': None
    }

    analysis = conv.analysis
    if analysis:
        # Fields such as sentiment, problems, category and intent hold Enum members
        # (or lists of them). json.dump rejects plain Enum objects with a TypeError,
        # so they are converted to their underlying values here.
        result['analysis'] = _to_jsonable({
            'sentiment': analysis.sentiment,
            'sentiment_confidence': analysis.sentiment_confidence,
            'emotions': analysis.emotions,
            'problems': analysis.problems,
            'problem_severity': analysis.problem_severity,
            'problem_extra_info': analysis.problem_extra_info,
            'success_indicators': analysis.success_indicators,
            'failure_indicators': analysis.failure_indicators,
            'category': analysis.category,
            'intent': analysis.intent,
            'feedback': analysis.feedback,
            'suggestions': analysis.suggestions,
            'analysis_explanation': analysis.analysis_explanation
        })

    return result

# Serialize every analyzed conversation into a plain dictionary
conversations_dict = list(map(conversation_to_dict, analyzed_conversations))

# Assemble the payload: run metadata first, then the conversations themselves
metadata = {
    'total_conversations': len(conversations_dict),
    'processed_at': datetime.now().isoformat(),
    'source_file': 'data/data.csv',
    'time_threshold_minutes': 30,
}
output_data = {'metadata': metadata, 'conversations': conversations_dict}

# The file name embeds the current timestamp down to the second
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f'analyzed_conversations_{timestamp}.json'

# ensure_ascii=False writes non-ASCII text as-is instead of \u escapes
with open(output_filename, 'w', encoding='utf-8') as output_file:
    json.dump(output_data, output_file, ensure_ascii=False, indent=2)

print(f"\nResults saved to: {output_filename}")
print(f"Total conversations processed: {len(conversations_dict)}")

## 7. Analysis Summary

In [None]:
# Summarize only the conversations that actually carry an analysis
analyzed_with_results = [conv for conv in analyzed_conversations if conv.analysis]

if not analyzed_with_results:
    print("No conversations were successfully analyzed.")
else:
    total_analyzed = len(analyzed_with_results)
    print("\n=== Analysis Summary ===")
    print(f"Conversations with analysis: {total_analyzed}")

    # Sentiment distribution: tally in first-seen order
    sentiment_counts = {}
    for item in analyzed_with_results:
        label = item.analysis.sentiment
        sentiment_counts.setdefault(label, 0)
        sentiment_counts[label] += 1

    print("\nSentiment Distribution:")
    for label, occurrences in sentiment_counts.items():
        share = (occurrences / total_analyzed) * 100
        print(f"  {label}: {occurrences} ({share:.1f}%)")

    # Problem severity: arithmetic mean over all analyzed conversations
    severities = [item.analysis.problem_severity for item in analyzed_with_results]
    avg_severity = sum(severities) / len(severities)
    print(f"\nAverage Problem Severity: {avg_severity:.2f}/10")

    # Category distribution: a conversation contributes once per category it lists
    category_counts = {}
    for item in analyzed_with_results:
        for label in item.analysis.category:
            category_counts.setdefault(label, 0)
            category_counts[label] += 1

    # Most frequent category first; ties keep their first-seen order
    ranked_categories = sorted(
        category_counts.items(),
        key=lambda entry: entry[1],
        reverse=True,
    )
    print("\nTop Categories:")
    for label, occurrences in ranked_categories:
        print(f"  {label}: {occurrences}")

## 8. Data Validation

In [None]:
# Re-read the file written by the save step to confirm it parses as JSON
print("\n=== Data Validation ===")
try:
    with open(output_filename, 'r', encoding='utf-8') as saved_file:
        loaded_data = json.load(saved_file)

    print("✓ JSON file is valid")
    print(f"✓ Contains {len(loaded_data['conversations'])} conversations")
    print(f"✓ Metadata: {loaded_data['metadata']}")

    # Count the entries whose 'analysis' value is truthy (i.e. not null/empty)
    with_analysis = [entry for entry in loaded_data['conversations'] if entry['analysis']]
    analyzed_count = len(with_analysis)
    print(f"✓ {analyzed_count} conversations have analysis data")

except Exception as error:
    # Any failure (missing file, malformed JSON, missing key) is reported, not raised
    print(f"✗ Error validating JSON file: {error}")