In [None]:
# Install required packages
!pip install langdetect==1.0.9 pandas


# 01 - Message Parsing and Normalization

This notebook handles the extraction, cleaning, and normalization of messages from multiple platforms (WhatsApp, Telegram).

## Overview
- **Extract messages** from different export formats (JSON for Telegram, TXT for WhatsApp)
- **Clean and normalize** multilingual text while preserving emojis and essential punctuation
- **Detect language** and classify conversations (French/English)
- **Clean usernames** and filter system messages
- **Output standardized CSV** format with timestamp, sender, message columns

## Input Data
- Raw exports from messaging platforms in `data/raw/` folder
- **Telegram**: JSON export files (`data/raw/telegram/result.json`)
- **WhatsApp**: TXT export files (`data/raw/whatsapp/*.txt`)

## Output Data
- **Standardized CSV files** with columns: `timestamp`, `sender`, `message`
- One CSV per conversation in `data/cleaned/` folder
- **Language detection** per conversation (`conversation_languages.csv`)
- **Message statistics** and processing summary

## User Configuration
- Uses `config/user_config.json` to identify the user's own messages
- Automatically creates default config if not found
- Supports multiple user pseudonyms/identifiers


In [None]:
# Telegram parsing functions
from typing import Dict, Any, List
import json
import os
import re
from datetime import datetime

def load_user_config(config_path: str = 'config/user_config.json') -> Dict[str, Any]:
    """Load the configuration used to identify the user's own messages.

    If ``config_path`` does not exist, a default configuration is written to
    that path (creating the parent directory when there is one) and returned.

    Args:
        config_path: Path of the JSON configuration file.

    Returns:
        The configuration dictionary, either the freshly written default or
        the content loaded from disk.

    Raises:
        json.JSONDecodeError: If an existing file does not contain valid JSON.
        KeyError: If an existing file lacks 'user_identifiers' or 'user_label'
            (both are read for the status message).
    """
    if not os.path.exists(config_path):
        # Create default config if it doesn't exist
        default_config = {
            "user_identifiers": ["Antonin", "antonin", "Anto", "anto"],
            "user_label": "A:",
            "other_label": "B:",
            "description": "Configuration for identifying the user's messages in chat data."
        }
        # dirname is '' for a bare filename, and os.makedirs('') raises.
        parent_dir = os.path.dirname(config_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(default_config, f, indent=2)
        print(f"Created default user config at {config_path}")
        return default_config

    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)

    print(f"Loaded user config: {config['user_identifiers']} -> {config['user_label']}")
    return config

def parse_telegram_text(text_data) -> str:
    """Flatten a Telegram ``text`` field into a plain string.

    Args:
        text_data: Either a string, or a list whose items are strings and/or
            dicts carrying a 'text' key. Any other value is stringified.

    Returns:
        The concatenated text. List items that are neither strings nor dicts
        with a 'text' key are ignored. Falsy non-string, non-list values
        (``None``, ``0``) yield an empty string.
    """
    if isinstance(text_data, str):
        return text_data

    if isinstance(text_data, list):
        # Every dict entity contributes its 'text' regardless of its 'type',
        # so no per-type branching is needed.
        parts = []
        for entity in text_data:
            if isinstance(entity, str):
                parts.append(entity)
            elif isinstance(entity, dict) and 'text' in entity:
                parts.append(entity['text'])
        return "".join(parts)

    return str(text_data) if text_data else ""

def parse_telegram_message(msg: Dict[str, Any]) -> Dict[str, Any]:
    """Parse a single Telegram message into the standardized format.

    Args:
        msg: One entry of a chat's 'messages' list.

    Returns:
        A dict with 'timestamp' (ISO 8601 string), 'sender' and 'message', or
        ``None`` when the entry is skipped: its 'type' is not 'message', its
        text is shorter than 3 characters after stripping, or no usable
        timestamp can be read from 'date' / 'date_unixtime'.
    """
    if msg.get('type') != 'message':
        return None

    # Parse text content
    text = parse_telegram_text(msg.get('text', '')).strip()
    if len(text) < 3:
        return None

    # Parse timestamp: prefer the ISO 'date', then fall back to the Unix
    # timestamp so one malformed field does not discard the message.
    dt = None
    date_value = msg.get('date')
    if isinstance(date_value, str):
        try:
            dt = datetime.fromisoformat(date_value.replace('Z', '+00:00'))
        except ValueError:
            dt = None
    if dt is None and 'date_unixtime' in msg:
        try:
            dt = datetime.fromtimestamp(int(msg['date_unixtime']))
        except (ValueError, TypeError, OverflowError, OSError):
            dt = None
    if dt is None:
        return None

    return {
        'timestamp': dt.isoformat(),
        'sender': (msg.get('from') or 'Unknown').strip(),
        'message': text
    }

def load_telegram_conversation(chat_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Load and parse a Telegram chat conversation."""
    messages = []
    
    if 'messages' not in chat_data:
        return messages
    
    for msg in chat_data['messages']:
        parsed = parse_telegram_message(msg)
        if parsed:
            messages.append(parsed)
    
    return messages

def load_all_telegram_conversations(file_path: str = 'data/raw/telegram/result.json') -> Dict[str, List[Dict[str, Any]]]:
    """Load all Telegram conversations from the export file.

    Args:
        file_path: Path of the Telegram JSON export.

    Returns:
        A dict mapping a generated ``<clean_name>_telegram.txt`` key to the
        parsed messages of that chat. Chats with fewer than 10 parsed messages
        and the chat with id 777000 are skipped. An empty dict is returned when
        the file is missing, unreadable, or has no ``chats.list`` section.
    """
    conversations = {}

    if not os.path.exists(file_path):
        print(f"Telegram file not found: {file_path}")
        return conversations

    print(f"Loading Telegram conversations from {file_path}...")

    # Only reading/decoding the file is guarded here, so one bad chat further
    # down cannot abort the whole export.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:  # ValueError covers JSON/Unicode decode errors
        print(f"Error loading Telegram conversations: {e}")
        return conversations

    chat_section = data.get('chats') if isinstance(data, dict) else None
    if not isinstance(chat_section, dict) or 'list' not in chat_section:
        print("No chats found in Telegram export")
        return conversations

    chats = chat_section['list'] or []
    print(f"Found {len(chats)} chats in Telegram export")

    for chat in chats:
        try:
            chat_id = chat.get('id', 'unknown')
            # 'name' can be present but null; fall back to the id-based
            # label in that case as well.
            chat_name = chat.get('name') or f"chat_{chat_id}"

            # Skip the Telegram system chat
            if chat.get('type', 'unknown') == 'personal_chat' and chat_id == 777000:
                continue

            messages = load_telegram_conversation(chat)
        except (AttributeError, KeyError, TypeError, ValueError) as e:
            print(f"  Skipping malformed Telegram chat: {e}")
            continue

        # Only process conversations with reasonable message count
        if len(messages) < 10:
            continue

        # Use a clean filename
        clean_name = re.sub(r'[^\w\s-]', '', str(chat_name)).strip()
        clean_name = re.sub(r'[-\s]+', '_', clean_name)
        if not clean_name:
            # The name consisted only of stripped characters (e.g. emojis).
            clean_name = f"chat_{chat_id}"
        filename = f"{clean_name}_telegram.txt"
        if filename in conversations:
            # Two chats cleaned to the same name: disambiguate with the id
            # instead of silently overwriting the first one.
            filename = f"{clean_name}_{chat_id}_telegram.txt"

        conversations[filename] = messages
        print(f"  Loaded {filename}: {len(messages)} messages")

    return conversations

def is_french_conversation(messages: List[Dict[str, Any]], sample_size: int = 20) -> bool:
    """Check whether a conversation is in French using the langdetect library.

    Args:
        messages: Message dicts; the text is read from the 'message' key.
        sample_size: Number of leading messages inspected.

    Returns:
        True when at least 60% of the detectable sampled texts are classified
        as 'fr'. False when there is nothing to sample, nothing could be
        detected, or language detection fails (e.g. langdetect is missing).
    """
    if not messages:
        return False

    # Sample the first messages; texts shorter than 3 characters are skipped.
    sample_texts = []
    for msg in messages[:sample_size]:
        text = (msg.get('message') or '').strip()
        if len(text) >= 3:
            sample_texts.append(text)

    if not sample_texts:
        return False

    # Use langdetect to detect language
    try:
        from langdetect import detect, DetectorFactory
        from langdetect.lang_detect_exception import LangDetectException
        DetectorFactory.seed = 0  # For consistent results

        # Detect language of each sample text
        languages = []
        for text in sample_texts:
            try:
                languages.append(detect(text))
            except LangDetectException:
                continue  # Skip texts that can't be detected

        if not languages:
            return False

        # Count French detections
        french_ratio = languages.count('fr') / len(languages)

        # Consider French if at least 60% of messages are detected as French
        is_french = french_ratio >= 0.6

        print(f"    Languages detected: {languages[:10]}... (showing first 10)")
        print(f"    French ratio: {french_ratio:.2f}, Result: {is_french}")

        return is_french

    except Exception as e:
        # Best-effort: any detection failure means "not French".
        print(f"    Error in language detection: {e}")
        return False


In [None]:
# Load Telegram conversations and filter for French
# Notebook cell: loads every Telegram chat via load_all_telegram_conversations(),
# keeps the ones is_french_conversation() accepts, and merges them into the
# shared `all_conversations` dict.
print("=== LOADING TELEGRAM CONVERSATIONS ===")
telegram_conversations = load_all_telegram_conversations()

print(f"\nLoaded {len(telegram_conversations)} Telegram conversations")

# Filter for French conversations with more detailed logging
print("\nFiltering for French conversations...")
french_telegram_conversations = {}

for filename, messages in telegram_conversations.items():
    print(f"Testing {filename} with {len(messages)} messages...")
    
    # Show sample messages for debugging
    sample_messages = messages[:5]
    for i, msg in enumerate(sample_messages):
        print(f"  Sample {i+1}: {msg['message'][:80]}...")
    
    is_french = is_french_conversation(messages)
    print(f"  French detection result: {is_french}")
    
    if is_french:
        french_telegram_conversations[filename] = messages
        print(f"  ✓ ADDED {filename}: {len(messages)} messages (French)")
    else:
        print(f"  ✗ SKIPPED {filename}: Not detected as French")

print(f"\nFound {len(french_telegram_conversations)} French Telegram conversations")

# Process French Telegram conversations
# Note: Processing will be done later when all functions are defined
if french_telegram_conversations:
    print("\nTelegram conversations loaded (processing will be done later)...")
    for filename, messages in french_telegram_conversations.items():
        print(f"  ✓ {filename}: {len(messages)} messages")
else:
    print("\nWARNING: No French Telegram conversations found!")

# Initialize all_conversations if not already defined
# (this cell may run before or after the cell that loads WhatsApp data)
if 'all_conversations' not in globals():
    all_conversations = {}

# Add Telegram conversations to the main conversations dict
all_conversations.update(french_telegram_conversations)

print(f"\n=== UPDATED CONVERSATION SUMMARY ===")
print(f"Total conversations (WhatsApp + French Telegram): {len(all_conversations)}")
total_messages = sum(len(msgs) for msgs in all_conversations.values())
print(f"Total messages: {total_messages:,}")

# Show breakdown by source
# Telegram entries are recognised by the '_telegram.txt' key suffix; every
# other key is counted as WhatsApp.
whatsapp_count = len([f for f in all_conversations.keys() if not f.endswith('_telegram.txt')])
telegram_count = len([f for f in all_conversations.keys() if f.endswith('_telegram.txt')])
print(f"WhatsApp conversations: {whatsapp_count}")
print(f"Telegram conversations: {telegram_count}")


In [None]:
import glob
import json
import os
import re
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

# Invisible directional formatting characters (LRM, RLM, embeddings/overrides).
# NOTE(review): WhatsApp exports can place these before the '[' of a line or
# around a field — confirm against the actual export files.
_WHATSAPP_BIDI_MARKS = '\u200e\u200f\u202a\u202b\u202c\u202d\u202e'

# [DD/MM/YYYY, HH:MM:SS] Sender: Message
_WHATSAPP_LINE_RE = re.compile(
    r'\[(\d{2}/\d{2}/\d{4}), (\d{2}:\d{2}:\d{2})\] ([^:]+): (.+)'
)


def parse_whatsapp_message(line: str) -> Optional[Dict[str, Any]]:
    """
    Parse a single WhatsApp message line into structured data.

    Format: [DD/MM/YYYY, HH:MM:SS] Sender: Message

    Leading invisible directional marks are ignored so that a line starting
    with them is still recognised as a new message instead of being treated
    as a continuation of the previous one. The same marks are trimmed from
    both ends of the sender and message fields.

    Args:
        line: One raw line of the export.

    Returns:
        A dict with 'timestamp' (ISO 8601 string), 'sender' and 'message', or
        ``None`` if the line does not match the format or carries an invalid
        date/time.
    """
    candidate = line.strip().lstrip(_WHATSAPP_BIDI_MARKS)
    match = _WHATSAPP_LINE_RE.match(candidate)

    if not match:
        return None

    date_str, time_str, sender, message = match.groups()

    # Parse datetime
    try:
        dt = datetime.strptime(f"{date_str} {time_str}", "%d/%m/%Y %H:%M:%S")
    except ValueError:
        return None

    return {
        'timestamp': dt.isoformat(),
        'sender': sender.strip().strip(_WHATSAPP_BIDI_MARKS).strip(),
        'message': message.strip().strip(_WHATSAPP_BIDI_MARKS).strip()
    }

def load_whatsapp_conversation(file_path: str) -> List[Dict[str, Any]]:
    """
    Read a WhatsApp export file and return its messages in file order.

    A line that parses as a message header starts a new message. Any other
    non-empty line is appended (space-separated) to the message in progress,
    unless it begins with '[' or no message has been started yet, in which
    case it is ignored.
    """
    collected = []
    pending = None

    with open(file_path, 'r', encoding='utf-8') as handle:
        for text in (raw.strip() for raw in handle):
            if not text:
                continue

            entry = parse_whatsapp_message(text)
            if entry:
                # A new header closes the message that was being built.
                if pending:
                    collected.append(pending)
                pending = entry
                continue

            # Not a header: treat as continuation text when applicable.
            if pending and not text.startswith('['):
                pending['message'] += ' ' + text

    # Flush the message still in progress at end of file.
    if pending:
        collected.append(pending)

    return collected

def load_all_whatsapp_conversations(data_dir: str = 'data/raw/whatsapp') -> Dict[str, List[Dict[str, Any]]]:
    """
    Parse every ``*.txt`` file found directly inside ``data_dir``.

    Returns a dict keyed by the file's base name with the parsed messages as
    value. A file that fails to load is reported on stdout and left out.
    """
    loaded = {}

    for path in glob.glob(os.path.join(data_dir, '*.txt')):
        name = os.path.basename(path)
        print(f"Loading {name}...")

        try:
            parsed = load_whatsapp_conversation(path)
            loaded[name] = parsed
            print(f"  Loaded {len(parsed)} messages")
        except Exception as e:
            print(f"  Error loading {name}: {e}")

    return loaded

# Load all WhatsApp conversations
print("Loading all WhatsApp conversations...")
whatsapp_conversations = load_all_whatsapp_conversations()

# Merge into whatever an earlier cell already registered (the Telegram cell
# adds its conversations to `all_conversations`) instead of overwriting it.
all_conversations = {**globals().get('all_conversations', {}), **whatsapp_conversations}

# Display summary
print(f"\nLoaded {len(whatsapp_conversations)} conversations:")
for filename, messages in whatsapp_conversations.items():
    print(f"  {filename}: {len(messages)} messages")

# Show sample messages from each conversation
print("\nSample messages from each conversation:")
for filename, messages in whatsapp_conversations.items():
    print(f"\n--- {filename} ---")
    for i, msg in enumerate(messages[:3]):  # Show first 3 messages
        print(f"{i+1}. [{msg['timestamp']}] {msg['sender']}: {msg['message'][:100]}...")
    if len(messages) > 3:
        print(f"... and {len(messages) - 3} more messages")

In [None]:
# Data Analysis and Statistics

def analyze_conversations(conversations: Dict[str, List[Dict[str, Any]]]) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
    """
    Analyze the loaded conversations and provide statistics.

    Args:
        conversations: Mapping of conversation key (filename) to its messages.
            Each message must carry 'sender' and 'timestamp'.

    Returns:
        A ``(stats, all_messages)`` tuple. ``stats`` holds the totals, the
        sorted sender names (overall and per conversation), the platforms,
        the per-conversation message counts and the overall date range
        (min/max of the 'timestamp' strings). ``all_messages`` is every
        message of every conversation in iteration order.
    """
    stats = {
        'total_conversations': len(conversations),
        'total_messages': 0,
        'senders': set(),
        'date_range': {'start': None, 'end': None},
        'messages_per_conversation': {},
        'senders_per_conversation': {},
        'platforms': set()
    }

    all_messages = []
    date_range = stats['date_range']

    for filename, messages in conversations.items():
        stats['total_messages'] += len(messages)
        stats['messages_per_conversation'][filename] = len(messages)

        # Collect all senders (sorted so the output is reproducible)
        senders = {msg['sender'] for msg in messages}
        stats['senders_per_conversation'][filename] = sorted(senders)
        stats['senders'].update(senders)

        # Platform is stored at conversation level: Telegram conversations
        # are keyed with a '_telegram.txt' suffix, everything else is WhatsApp.
        platform = 'telegram' if filename.endswith('_telegram.txt') else 'whatsapp'
        stats['platforms'].add(platform)

        # Date range (ISO strings are compared lexicographically)
        if messages:
            timestamps = [msg['timestamp'] for msg in messages]
            first, last = min(timestamps), max(timestamps)
            if date_range['start'] is None:
                date_range['start'] = first
                date_range['end'] = last
            else:
                date_range['start'] = min(date_range['start'], first)
                date_range['end'] = max(date_range['end'], last)

        all_messages.extend(messages)

    # Convert sets to lists for JSON serialization
    stats['senders'] = sorted(stats['senders'])
    stats['platforms'] = sorted(stats['platforms'])

    return stats, all_messages

# Analyze the conversations
# Notebook cell: computes statistics for `all_conversations` and prints them.
# It also (re)binds the global `all_messages` to the flattened message list
# returned by analyze_conversations().
print("Analyzing conversations...")
stats, all_messages = analyze_conversations(all_conversations)

# Overall totals
print(f"\n=== CONVERSATION STATISTICS ===")
print(f"Total conversations: {stats['total_conversations']}")
print(f"Total messages: {stats['total_messages']}")
print(f"Date range: {stats['date_range']['start']} to {stats['date_range']['end']}")
print(f"Platforms: {', '.join(stats['platforms'])}")
print(f"Unique senders: {len(stats['senders'])}")

# Message count for each conversation
print(f"\n=== MESSAGES PER CONVERSATION ===")
for filename, count in stats['messages_per_conversation'].items():
    print(f"  {filename}: {count:,} messages")

# Sender names seen in each conversation
print(f"\n=== SENDERS PER CONVERSATION ===")
for filename, senders in stats['senders_per_conversation'].items():
    print(f"  {filename}: {', '.join(senders)}")


In [None]:
# Message Processing Functions
import unicodedata
import re

# Any single character NOT listed here is replaced by a space:
# - \w / \s: letters, digits, underscore, whitespace
# - basic punctuation, currency and math symbols
# - emoji blocks, including U+1F900-U+1FAFF, plus the zero-width joiner and
#   variation selector-16 that hold multi-part emojis together
_SANITIZE_DROP_RE = re.compile(
    r'[^\w\s.,!?;:()\[\]{}"\'€$£¥₹+=\-*/%<>@#&~`|\\'
    r'\U0001F600-\U0001F64F'
    r'\U0001F300-\U0001F5FF'
    r'\U0001F680-\U0001F6FF'
    r'\U0001F1E0-\U0001F1FF'
    r'\U0001F900-\U0001F9FF'
    r'\U0001FA00-\U0001FAFF'
    r'\U00002600-\U000026FF'
    r'\U00002700-\U000027BF'
    r'\u200d\ufe0f'
    r']'
)
_SANITIZE_WHITESPACE_RE = re.compile(r'\s+')


def sanitize_message(message: str) -> str:
    """
    Sanitize message text by removing special characters while preserving:
    - Emojis (Unicode emoji characters)
    - Numbers and basic punctuation
    - Letters and spaces
    - Common punctuation marks (.,!?;:)

    Removes:
    - Control characters
    - Special symbols
    - Non-printable characters
    - Excessive whitespace

    Args:
        message: Raw message text.

    Returns:
        The sanitized text with every removed character replaced by a space,
        whitespace runs collapsed to one space and both ends stripped. A falsy
        input (``None`` or ``''``) is returned unchanged.
    """
    if not message:
        return message

    # Replace each disallowed character with a space to keep words separated.
    sanitized = _SANITIZE_DROP_RE.sub(' ', message)

    # Clean up multiple spaces and strip
    return _SANITIZE_WHITESPACE_RE.sub(' ', sanitized).strip()

def clean_username(username: str) -> str:
    """
    Clean username by removing special characters, emojis, and normalizing unicode.
    Keeps only letters, numbers, and spaces.

    Args:
        username: Raw sender name.

    Returns:
        The cleaned name with single spaces between words. When nothing is
        left after cleaning, a ``user_<n>`` placeholder derived from a CRC-32
        of the name is returned, so the same name maps to the same placeholder
        on every run. A falsy input is returned unchanged.
    """
    if not username:
        return username

    import zlib  # local import: only needed for the stable fallback id

    # Decompose accented characters (é -> e + combining accent); the
    # combining marks are not alphanumeric and are dropped below.
    decomposed = unicodedata.normalize('NFD', username)

    # Remove all special characters, keep only alphanumeric and spaces
    kept = ''.join(c for c in decomposed if c.isalnum() or c.isspace())

    # Clean up multiple spaces and strip
    cleaned = ' '.join(kept.split())

    # If nothing is left, use a fallback. The built-in hash() is salted per
    # process for str, which would change the placeholder between runs.
    if not cleaned:
        checksum = zlib.crc32(decomposed.encode('utf-8', 'replace'))
        cleaned = f"user_{checksum % 10000}"

    return cleaned

def sanitize_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Run ``sanitize_message`` over the 'message' field of every entry.

    Entries are modified in place; entries without a 'message' key are left
    untouched. The same list object is returned.
    """
    for entry in messages:
        if 'message' not in entry:
            continue
        entry['message'] = sanitize_message(entry['message'])

    return messages

def clean_usernames_in_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Run ``clean_username`` over the 'sender' field of every entry.

    Entries are modified in place; entries without a 'sender' key are left
    untouched. The same list object is returned.
    """
    for entry in messages:
        if 'sender' not in entry:
            continue
        entry['sender'] = clean_username(entry['sender'])

    return messages


In [None]:
# Simplified Language Detection - Per Conversation Approach
import random
from collections import Counter
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Set seed for consistent results
DetectorFactory.seed = 0

def detect_conversation_language(messages: List[Dict[str, Any]], sample_size: int = 100) -> str:
    """
    Detect the language of a conversation by sampling ~100 messages.
    Uses only the external langdetect library for detection.

    Args:
        messages: Message dicts; the text is read from the 'message' key.
        sample_size: Maximum number of messages sampled.

    Returns:
        'fr' or 'en', whichever is detected most often among the sampled
        texts, or 'other' when there are no messages, no text of at least 3
        characters, or no text detected as French or English.
    """
    if not messages:
        return 'other'

    # Sample with a locally seeded generator so the same conversation always
    # gets the same sample and label, without touching the global RNG state.
    rng = random.Random(0)
    sample_messages = rng.sample(messages, min(sample_size, len(messages)))

    # Collect text from sampled messages (at least 3 characters each)
    sample_texts = []
    for msg in sample_messages:
        text = msg['message'].strip()
        if len(text) >= 3:
            sample_texts.append(text)

    if not sample_texts:
        return 'other'

    # Detect language for each sample; only French/English votes are counted
    detected_languages = []
    for text in sample_texts:
        try:
            detected_lang = detect(text)
        except LangDetectException:
            continue
        if detected_lang in ('fr', 'en'):
            detected_languages.append(detected_lang)

    if not detected_languages:
        return 'other'

    # Return the most common language
    return Counter(detected_languages).most_common(1)[0][0]

# Detect language for each conversation
# Notebook cell: labels every conversation in `all_conversations`, then builds
# `filtered_messages` from the global `all_messages` and prints a summary.
print("Detecting language for each conversation...")
conversation_languages = {}
for filename, messages in all_conversations.items():
    print(f"Detecting language for {filename}...")
    language = detect_conversation_language(messages)
    conversation_languages[filename] = language
    print(f"  Detected language: {language}")

# Filter messages (without individual language detection)
print("\nFiltering messages...")
filtered_messages = []

for msg in all_messages:
    message_text = msg['message']
    
    # Skip if too short
    if len(message_text.strip()) < 3:
        continue
        
    # Skip system messages
    # NOTE(review): these are plain substring tests on the lowercased text, so
    # any message merely containing 'added', 'removed', 'left' or 'omitted'
    # is dropped as well — confirm this is intended.
    if any(system_msg in message_text.lower() for system_msg in [
        'messages and calls are end-to-end encrypted',
        'you created group',
        'added',
        'removed',
        'left',
        'omitted'
    ]):
        continue
        
    # Skip media messages
    # (every marker below contains 'omitted', which the check above already
    # matches, so this branch cannot trigger on its own)
    if any(media_msg in message_text.lower() for media_msg in [
        'image omitted',
        'video omitted',
        'audio omitted',
        'document omitted',
        'sticker omitted'
    ]):
        continue
        
    filtered_messages.append(msg)

# No need to add language to individual messages - will be stored at conversation level

print(f"Original messages: {len(all_messages):,}")
print(f"Filtered messages: {len(filtered_messages):,}")
print(f"Filtered out: {len(all_messages) - len(filtered_messages):,} messages")

# Language distribution by conversation
language_counts = Counter(conversation_languages.values())
print(f"\n=== LANGUAGE DISTRIBUTION BY CONVERSATION ===")
for lang, count in language_counts.most_common():
    percentage = (count / len(conversation_languages)) * 100
    print(f"  {lang}: {count} conversations ({percentage:.1f}%)")

# Show sample messages by conversation language
print(f"\n=== SAMPLE MESSAGES BY CONVERSATION LANGUAGE ===")
for lang in ['fr', 'en', 'other']:
    if lang in language_counts:
        # Find a conversation with this language
        for filename, conv_lang in conversation_languages.items():
            if conv_lang == lang:
                # NOTE(review): `msg in filtered_messages` is a linear scan of a
                # list for every message of the conversation.
                sample_messages = [msg for msg in all_conversations[filename] if msg in filtered_messages][:3]
                print(f"\n{lang.upper()} conversation ({filename}):")
                for i, msg in enumerate(sample_messages, 1):
                    print(f"  {i}. [{msg['sender']}]: {msg['message'][:80]}...")
                break  # only the first conversation of each language is shown


In [None]:
# Processing Functions
import uuid
import unicodedata
import re

# Any single character NOT listed here is replaced by a space:
# - \w / \s: letters, digits, underscore, whitespace
# - basic punctuation, currency and math symbols
# - emoji blocks, including U+1F900-U+1FAFF, plus the zero-width joiner and
#   variation selector-16 that hold multi-part emojis together
_SANITIZE_DROP_RE = re.compile(
    r'[^\w\s.,!?;:()\[\]{}"\'€$£¥₹+=\-*/%<>@#&~`|\\'
    r'\U0001F600-\U0001F64F'
    r'\U0001F300-\U0001F5FF'
    r'\U0001F680-\U0001F6FF'
    r'\U0001F1E0-\U0001F1FF'
    r'\U0001F900-\U0001F9FF'
    r'\U0001FA00-\U0001FAFF'
    r'\U00002600-\U000026FF'
    r'\U00002700-\U000027BF'
    r'\u200d\ufe0f'
    r']'
)
_SANITIZE_WHITESPACE_RE = re.compile(r'\s+')


def sanitize_message(message: str) -> str:
    """
    Sanitize message text by removing special characters while preserving:
    - Emojis (Unicode emoji characters)
    - Numbers and basic punctuation
    - Letters and spaces
    - Common punctuation marks (.,!?;:)

    Removes:
    - Control characters
    - Special symbols
    - Non-printable characters
    - Excessive whitespace

    Args:
        message: Raw message text.

    Returns:
        The sanitized text with every removed character replaced by a space,
        whitespace runs collapsed to one space and both ends stripped. A falsy
        input (``None`` or ``''``) is returned unchanged.
    """
    if not message:
        return message

    # Replace each disallowed character with a space to keep words separated.
    sanitized = _SANITIZE_DROP_RE.sub(' ', message)

    # Clean up multiple spaces and strip
    return _SANITIZE_WHITESPACE_RE.sub(' ', sanitized).strip()

def clean_username(username: str) -> str:
    """
    Clean username by removing special characters, emojis, and normalizing unicode.
    Keeps only letters, numbers, and spaces.

    Args:
        username: Raw sender name.

    Returns:
        The cleaned name with single spaces between words. When nothing is
        left after cleaning, a ``user_<n>`` placeholder derived from a CRC-32
        of the name is returned, so the same name maps to the same placeholder
        on every run. A falsy input is returned unchanged.
    """
    if not username:
        return username

    import zlib  # local import: only needed for the stable fallback id

    # Decompose accented characters (é -> e + combining accent); the
    # combining marks are not alphanumeric and are dropped below.
    decomposed = unicodedata.normalize('NFD', username)

    # Remove all special characters, keep only alphanumeric and spaces
    kept = ''.join(c for c in decomposed if c.isalnum() or c.isspace())

    # Clean up multiple spaces and strip
    cleaned = ' '.join(kept.split())

    # If nothing is left, use a fallback. The built-in hash() is salted per
    # process for str, which would change the placeholder between runs.
    if not cleaned:
        checksum = zlib.crc32(decomposed.encode('utf-8', 'replace'))
        cleaned = f"user_{checksum % 10000}"

    return cleaned

def sanitize_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Run ``sanitize_message`` over the 'message' field of every entry.

    Entries are modified in place: 'message' receives the sanitized text and
    'original_message' the text as it was before. Entries without a 'message'
    key are left untouched. The same list object is returned.
    """
    for entry in messages:
        if 'message' not in entry:
            continue
        raw_text = entry['message']
        entry.update(
            message=sanitize_message(raw_text),
            original_message=raw_text,  # kept for reference
        )

    return messages

def clean_usernames_in_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Run ``clean_username`` over the 'sender' field of every entry.

    Entries are modified in place: 'sender' receives the cleaned name and
    'original_sender' the name as it was before. Entries without a 'sender'
    key are left untouched. The same list object is returned.
    """
    for entry in messages:
        if 'sender' not in entry:
            continue
        raw_sender = entry['sender']
        entry.update(
            sender=clean_username(raw_sender),
            original_sender=raw_sender,  # kept for reference
        )

    return messages


In [None]:
def parse_whatsapp_messages(file_path: str) -> List[Dict[str, Any]]:
    """
    Parse WhatsApp messages from a file and return a list of message dictionaries.

    Thin wrapper that delegates to ``load_whatsapp_conversation``.

    Args:
        file_path: Path of the WhatsApp export file, passed through unchanged.

    Returns:
        Whatever ``load_whatsapp_conversation(file_path)`` returns.
    """
    return load_whatsapp_conversation(file_path)

def filter_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Return the messages that are neither too short nor system/media notices.

    A message is dropped when its stripped text has fewer than 3 characters
    or when its lowercased text contains any of the marker substrings below.
    The input list is not modified.
    """
    # NOTE(review): markers are matched as plain substrings, so ordinary
    # messages containing e.g. 'left' or 'added' are dropped too.
    system_markers = (
        'messages and calls are end-to-end encrypted',
        'you created group',
        'added',
        'removed',
        'left',
        'omitted',
    )
    media_markers = (
        'image omitted',
        'video omitted',
        'audio omitted',
        'document omitted',
        'sticker omitted',
    )

    def _is_kept(entry: Dict[str, Any]) -> bool:
        body = entry['message']
        if len(body.strip()) < 3:
            return False
        lowered = body.lower()
        if any(marker in lowered for marker in system_markers):
            return False
        if any(marker in lowered for marker in media_markers):
            return False
        return True

    return [entry for entry in messages if _is_kept(entry)]


## Step 1: Load and Parse Conversations

First, we'll load all WhatsApp conversations, parse the messages, and detect the language for each conversation.

This step involves:
- Loading all `.txt` files from the WhatsApp data directory
- Parsing each file to extract structured message data
- Sanitizing message text (removing special characters while preserving emojis, numbers, and text)
- Cleaning usernames by removing special characters and emojis
- Filtering out system messages, media messages, and very short messages
- Detecting the primary language for each conversation


In [None]:
# Show message sanitization results
# Notebook cell: prints a fixed status line, then lists the distinct 'sender'
# values found in the global `all_messages`.
print(f"\n=== MESSAGE SANITIZATION RESULTS ===")
print("  Messages have been sanitized to preserve emojis, numbers, and text while removing special characters")

# Show username cleaning results
print(f"\n=== USERNAME CLEANING RESULTS ===")
unique_senders = set()

# Messages without a 'sender' key are ignored
for msg in all_messages:
    if 'sender' in msg:
        unique_senders.add(msg['sender'])

print(f"Unique senders: {len(unique_senders)}")

print(f"\nAll unique senders:")
for sender in sorted(unique_senders):
    print(f"  {sender}")


In [None]:
# Load and parse all WhatsApp conversations
print("Loading WhatsApp conversations...")
whatsapp_files = glob.glob('data/raw/whatsapp/*.txt')
print(f"Found {len(whatsapp_files)} WhatsApp files: {[os.path.basename(f) for f in whatsapp_files]}")

# Carry over only the Telegram conversations registered by earlier cells.
# WhatsApp entries already present hold raw, unfiltered messages; they are
# re-parsed below, so keeping them would count every WhatsApp message twice.
all_conversations = {
    name: msgs
    for name, msgs in all_conversations.items()
    if name.endswith('_telegram.txt')
}

for file_path in whatsapp_files:
    filename = os.path.basename(file_path)
    print(f"\nProcessing {filename}...")
    
    # Parse messages
    messages = parse_whatsapp_messages(file_path)
    print(f"  Parsed {len(messages)} messages")
    
    # Sanitize message text (remove special characters, keep emojis/numbers/text)
    messages = sanitize_messages(messages)
    print(f"  Sanitized message text")
    
    # Clean usernames (remove special characters)
    messages = clean_usernames_in_messages(messages)
    print(f"  Cleaned usernames")
    
    # Filter messages
    filtered_messages = filter_messages(messages)
    print(f"  Filtered to {len(filtered_messages)} training messages")
    
    all_conversations[filename] = filtered_messages

# Flatten once from the final dict so each message appears exactly once.
all_messages = [msg for msgs in all_conversations.values() for msg in msgs]

print(f"\n=== CONVERSATION SUMMARY ===")
print(f"Total conversations: {len(all_conversations)}")
print(f"Total messages: {len(all_messages):,}")

# Detect language for each conversation
print(f"\nDetecting conversation languages...")
conversation_languages = {}

for filename, messages in all_conversations.items():
    if messages:
        language = detect_conversation_language(messages)
        conversation_languages[filename] = language
        print(f"  {filename}: {language} ({len(messages)} messages)")
    else:
        # Nothing survived filtering: label as 'other' without detection
        conversation_languages[filename] = 'other'
        print(f"  {filename}: no messages")

# Show language distribution
language_counts = Counter(conversation_languages.values())
print(f"\nLanguage distribution:")
for lang, count in language_counts.items():
    print(f"  {lang}: {count} conversations")


In [None]:

# List every distinct sender found in `all_messages`, alphabetically
print("Found unique senders:")
unique_senders = {entry['sender'] for entry in all_messages}
for name in sorted(unique_senders):
    print(f"  {name}")

In [None]:
# CSV Output Functions
import pandas as pd

def save_conversations_to_csv(conversations: Dict[str, List[Dict[str, Any]]],
                             conversation_languages: Dict[str, str],
                             output_dir: str = 'data/cleaned') -> None:
    """Save each conversation as a standardized CSV file.

    Every non-empty conversation is written to
    ``<output_dir>/<name>_messages.csv`` with exactly the columns
    ``timestamp``, ``sender``, ``message``, sorted by ``timestamp``. A
    ``conversation_languages.csv`` mapping is written alongside them.

    Args:
        conversations: Mapping of conversation name (typically the export
            filename, e.g. ``chat.txt``) to its list of message dicts.
        conversation_languages: Mapping of conversation name to language label.
        output_dir: Directory receiving the CSV files; created if missing.

    Conversations that are empty are skipped silently; conversations whose
    messages lack one of the required columns are skipped with a warning.
    """
    os.makedirs(output_dir, exist_ok=True)

    required_columns = ['timestamp', 'sender', 'message']
    txt_suffix = '.txt'

    for filename, messages in conversations.items():
        if not messages:
            continue

        # Create DataFrame with standardized columns
        df = pd.DataFrame(messages)

        # Ensure we have the required columns
        if not all(col in df.columns for col in required_columns):
            print(f"Warning: Missing required columns in {filename}")
            continue

        # Keep only the required columns, in canonical order. The sort must be
        # stable: several messages can share one timestamp, and a non-stable
        # sort could shuffle their original relative order.
        df_standardized = (
            df[required_columns]
            .sort_values('timestamp', kind='mergesort')
            .reset_index(drop=True)
        )

        # Strip only a trailing '.txt'. str.replace would rewrite every
        # occurrence inside the name and would leave names without '.txt'
        # lacking the '_messages.csv' suffix altogether.
        if filename.endswith(txt_suffix):
            stem = filename[:-len(txt_suffix)]
        else:
            stem = filename
        csv_path = os.path.join(output_dir, f"{stem}_messages.csv")
        df_standardized.to_csv(csv_path, index=False, encoding='utf-8')

        print(f"Saved {len(df_standardized)} messages to {csv_path}")

    # Note: We don't save a combined CSV as conversations should stay separate for windowing

    # Save language mapping (includes conversations skipped above)
    language_df = pd.DataFrame({
        'conversation': list(conversation_languages.keys()),
        'language': list(conversation_languages.values()),
    })
    language_csv_path = os.path.join(output_dir, 'conversation_languages.csv')
    language_df.to_csv(language_csv_path, index=False, encoding='utf-8')
    print(f"Saved language mapping to {language_csv_path}")

def create_processing_summary(conversations: Dict[str, List[Dict[str, Any]]],
                             conversation_languages: Dict[str, str]) -> None:
    """Print a human-readable report of the processing results.

    The report lists the number of conversations, the overall message count,
    how many conversations carry each language label, the message count of
    every conversation, and a fixed description of the generated files.

    Args:
        conversations: Mapping of conversation name to its list of messages.
        conversation_languages: Mapping of conversation name to language
            label; conversations absent from it are reported as 'unknown'.
    """
    message_total = 0
    for msgs in conversations.values():
        message_total += len(msgs)

    print("\n=== PROCESSING SUMMARY ===")
    print(f"Total conversations processed: {len(conversations)}")
    print(f"Total messages: {message_total:,}")

    # How many conversations fall under each language label
    print("\nLanguage distribution:")
    per_language = Counter(conversation_languages.values())
    for label in per_language:
        print(f"  {label}: {per_language[label]} conversations")

    # Size of every conversation together with its language label
    print("\nMessages per conversation:")
    for name in conversations:
        label = conversation_languages.get(name, 'unknown')
        print(f"  {name}: {len(conversations[name]):,} messages ({label})")

    # Static description of the files the export step produces
    print("\n=== OUTPUT FILES ===")
    for line in (
        "Generated CSV files in data/cleaned/:",
        "  - *_messages.csv: Individual conversation files",
        "  - conversation_languages.csv: Language mapping",
        "  - Note: Conversations are kept separate for proper windowing",
    ):
        print(line)


In [None]:
# Final step: write the cleaned conversations to disk as standardized CSV
# files, then print a processing report.

print("=== SAVING CLEANED DATA AS CSV ===")

# One CSV per conversation plus the language mapping
save_conversations_to_csv(all_conversations, conversation_languages, 'data/cleaned')

# Console report of what was processed
create_processing_summary(all_conversations, conversation_languages)

# Closing banner describing the files that were written
print("\n".join([
    "\n=== SAVE COMPLETE ===",
    "All cleaned data has been saved to the 'data/cleaned' directory:",
    "- Individual conversation files: *_messages.csv",
    "- Language mapping: conversation_languages.csv",
    "- Note: Conversations are kept separate for proper windowing",
    "\nEach CSV contains standardized columns: timestamp, sender, message",
]))


In [None]:
# Run the Telegram conversations through the same cleaning pipeline as the
# WhatsApp ones, now that every helper function has been defined.
print("=== PROCESSING TELEGRAM CONVERSATIONS ===")

# globals().get(...) is falsy both when the variable was never defined and
# when it is defined but empty, so either case skips the loop.
if globals().get('french_telegram_conversations'):
    print("Processing French Telegram conversations...")

    for filename, messages in french_telegram_conversations.items():
        print(f"Processing {filename}...")

        # Step 1: sanitize message text
        messages = sanitize_messages(messages)
        print("  Sanitized message text")

        # Step 2: clean usernames
        messages = clean_usernames_in_messages(messages)
        print("  Cleaned usernames")

        # Step 3: filter messages
        filtered_messages = filter_messages(messages)
        print(f"  Filtered to {len(filtered_messages)} training messages")

        # Store the cleaned conversation under its name in all_conversations
        all_conversations[filename] = filtered_messages

print("\n=== FINAL CONVERSATION SUMMARY ===")
print(f"Total conversations: {len(all_conversations)}")
total_messages = sum(map(len, all_conversations.values()))
print(f"Total messages: {total_messages:,}")

# Breakdown by source: keys ending in '_telegram.txt' are counted as
# Telegram, every other key as WhatsApp.
telegram_count = sum(1 for key in all_conversations if key.endswith('_telegram.txt'))
whatsapp_count = len(all_conversations) - telegram_count
print(f"WhatsApp conversations: {whatsapp_count}")
print(f"Telegram conversations: {telegram_count}")

print("\n✅ All conversations processed and ready for CSV export!")


In [None]:
# Export every conversation (WhatsApp and Telegram) to CSV once more
print("=== RE-EXPORTING ALL CONVERSATIONS TO CSV ===")

# Flatten all processed conversations into one fresh message list
all_messages = [
    message
    for conversation in all_conversations.values()
    for message in conversation
]

print(f"Total conversations to export: {len(all_conversations)}")
print(f"Total messages to export: {len(all_messages):,}")

# Recompute the language label of every conversation (including Telegram)
print("\nDetecting languages for all conversations...")
conversation_languages = {}

for filename, messages in all_conversations.items():
    if not messages:
        # Empty conversations are labelled 'other' without calling the detector
        conversation_languages[filename] = 'other'
        print(f"  {filename}: no messages")
        continue

    language = detect_conversation_language(messages)
    conversation_languages[filename] = language
    print(f"  {filename}: {language} ({len(messages)} messages)")

# Write one CSV per conversation plus the language mapping
print("\nSaving all conversations to CSV...")
save_conversations_to_csv(all_conversations, conversation_languages, 'data/cleaned')

# Final summary
create_processing_summary(all_conversations, conversation_languages)

print("\n🎉 ALL CONVERSATIONS (WhatsApp + Telegram) EXPORTED SUCCESSFULLY!")
print("Check the 'data/cleaned' directory for all CSV files.")
