In [1]:
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
from typing import Dict, List, Any, Set
import warnings
warnings.filterwarnings('ignore')

# Configure display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

print("=== UNIFIED ERROR DATASET CREATION ===")
print("Loading all model error cases from evaluation results...")
print()


=== UNIFIED ERROR DATASET CREATION ===
Loading all model error cases from evaluation results...



In [2]:
def load_all_error_cases(results_base_dir: str = 'results') -> pd.DataFrame:
    """
    Load all detailed results and filter for incorrect predictions only.
    Returns a DataFrame containing all error cases from all evaluations.
    """
    error_cases_list = []
    results_path = Path(results_base_dir)
    
    if not results_path.exists():
        print(f"Results directory {results_base_dir} not found!")
        return pd.DataFrame()
    
    print(f"Scanning {results_base_dir} for evaluation results...")
    
    # Iterate through dataset directories
    for dataset_dir in results_path.iterdir():
        if not dataset_dir.is_dir():
            continue
            
        dataset_name = dataset_dir.name
        print(f"Processing dataset: {dataset_name}")
        
        # Iterate through timestamp directories
        for timestamp_dir in dataset_dir.iterdir():
            if not timestamp_dir.is_dir():
                continue
                
            # Load detailed results and evaluation info
            detailed_file = timestamp_dir / 'detailed_results.csv'
            info_file = timestamp_dir / 'evaluation_info.json'
            
            if detailed_file.exists() and info_file.exists():
                try:
                    # Load evaluation info for metadata
                    with open(info_file, 'r') as f:
                        info_data = json.load(f)
                    
                    # Load detailed results
                    detailed_df = pd.read_csv(detailed_file)
                    
                    # Filter for incorrect predictions only
                    error_cases = detailed_df[detailed_df['is_correct'] == False].copy()
                    
                    if len(error_cases) > 0:
                        # Add metadata to each error case
                        error_cases['dataset_name'] = dataset_name
                        error_cases['model'] = info_data['evaluation_config']['model']
                        error_cases['provider'] = info_data['evaluation_config']['provider']
                        error_cases['sample_size'] = info_data['evaluation_config']['sample_size']
                        error_cases['timestamp'] = info_data['timestamp']
                        error_cases['run_id'] = f"{dataset_name}_{info_data['timestamp']}"
                        
                        error_cases_list.append(error_cases)
                        print(f"  Found {len(error_cases)} error cases in {timestamp_dir.name}")
                    
                except Exception as e:
                    print(f"  Error loading {timestamp_dir}: {e}")
                    continue
    
    if error_cases_list:
        combined_errors = pd.concat(error_cases_list, ignore_index=True)
        print(f"\n✅ Total error cases loaded: {len(combined_errors)}")
        print(f"📊 Error cases by dataset:")
        for dataset, count in combined_errors['dataset_name'].value_counts().items():
            print(f"  - {dataset}: {count} errors")
        print(f"🤖 Error cases by model:")
        for model, count in combined_errors['model'].value_counts().items():
            print(f"  - {model}: {count} errors")
        return combined_errors
    else:
        print("❌ No error cases found!")
        return pd.DataFrame()

# Load all error cases
all_errors = load_all_error_cases()

if len(all_errors) > 0:
    print(f"\n📋 Available columns: {all_errors.columns.tolist()}")
    print(f"📏 Dataset shape: {all_errors.shape}")
else:
    print("No error data to process")


Scanning results for evaluation results...
Processing dataset: annotated
Processing dataset: phishing_sms_dataset
  Found 6 error cases in 20250613_230316
  Found 590 error cases in 20250616_040647
  Found 4 error cases in 20250616_031241
  Found 1 error cases in 20250613_230006
  Found 6 error cases in 20250614_011431
  Found 627 error cases in 20250614_030314
  Found 4 error cases in 20250614_011218
  Found 15 error cases in 20250616_030752
  Found 8 error cases in 20250613_225308
Processing dataset: unified_phishing_email_dataset
  Found 9 error cases in 20250613_111708
  Found 99 error cases in 20250613_032254
  Found 5 error cases in 20250613_030731
  Found 5 error cases in 20250613_024907
  Found 5 error cases in 20250613_024938
  Found 5 error cases in 20250613_025015
  Found 1 error cases in 20250611_210318
  Found 13 error cases in 20250613_035449
  Found 5 error cases in 20250613_024820
  Found 4 error cases in 20250613_035701
  Found 46 error cases in 20250613_224943
  Found

In [3]:
def process_content_by_dataset_type(error_df: pd.DataFrame) -> pd.DataFrame:
    """
    Process content based on dataset type:
    - Email datasets: concatenate subject + body → content
    - SMS datasets: rename message → content
    Returns DataFrame with standardized 'content' column
    """
    if len(error_df) == 0:
        return pd.DataFrame()
    
    processed_records = []
    
    # Get original content columns (those starting with 'original_')
    original_columns = [col for col in error_df.columns if col.startswith('original_')]
    print(f"Found original content columns: {original_columns}")
    
    for dataset_name in error_df['dataset_name'].unique():
        dataset_errors = error_df[error_df['dataset_name'] == dataset_name].copy()
        print(f"\n📧 Processing {dataset_name} ({len(dataset_errors)} error cases)")
        
        # Determine dataset type based on dataset name (parent directory)
        is_email_dataset = False
        is_sms_dataset = False
        
        # Simple categorization based on dataset name
        if 'email' in dataset_name.lower():
            is_email_dataset = True
            print(f"  Detected as EMAIL dataset (based on dataset name)")
        elif 'sms' in dataset_name.lower():
            is_sms_dataset = True
            print(f"  Detected as SMS dataset (based on dataset name)")
        else:
            # Fallback: check columns to determine best processing approach
            if 'original_subject' in dataset_errors.columns and 'original_body' in dataset_errors.columns:
                is_email_dataset = True
                print(f"  Detected as EMAIL dataset (has subject & body columns)")
            elif 'original_message' in dataset_errors.columns or 'original_text' in dataset_errors.columns:
                is_sms_dataset = True
                print(f"  Detected as SMS dataset (has message/text column)")
            else:
                print(f"  ⚠️ Warning: Unable to determine dataset type. Available columns: {original_columns}")
                # Try to find any text-like columns
                text_columns = [col for col in original_columns if any(text_word in col.lower() 
                               for text_word in ['text', 'content', 'message', 'body', 'subject'])]
                if text_columns:
                    print(f"  Using available text columns: {text_columns}")
        
        # Process each error case
        for idx, row in dataset_errors.iterrows():
            # Determine source type
            if is_email_dataset:
                source_type = 'email'
            elif is_sms_dataset:
                source_type = 'sms'
            else:
                source_type = 'other'
            
            record = {
                'dataset_name': dataset_name,
                'model': row['model'],
                'run_id': row['run_id'],
                'actual_label': row['actual_label'],
                'predicted_label': row['predicted_label'],
                'error_type': 'False Negative' if (row['actual_label'] == 1 and row['predicted_label'] == 0) else 'False Positive',
                'llm_reason': row.get('llm_reason', ''),
                'content': '',
                'source': source_type,
                'original_id': row.get('original_id', '') or row.get('id', '')
            }
            
            # Create content based on dataset type
            if is_email_dataset:
                # Email: concatenate subject + body
                subject = str(row.get('original_subject', '')).strip()
                body = str(row.get('original_body', '')).strip()
                
                # Handle NaN values
                if subject == 'nan' or pd.isna(row.get('original_subject')):
                    subject = ''
                if body == 'nan' or pd.isna(row.get('original_body')):
                    body = ''
                
                # Concatenate with proper formatting
                if subject and body:
                    record['content'] = f"Subject: {subject}\n\nBody: {body}"
                elif subject:
                    record['content'] = f"Subject: {subject}"
                elif body:
                    record['content'] = f"Body: {body}"
                else:
                    record['content'] = ''
            
            elif is_sms_dataset:
                # SMS: use message or text column
                if 'original_message' in row:
                    message = str(row.get('original_message', '')).strip()
                elif 'original_text' in row:
                    message = str(row.get('original_text', '')).strip()
                else:
                    message = ''
                
                # Handle NaN values
                if message == 'nan' or pd.isna(message):
                    message = ''
                
                record['content'] = message
            
            else:
                # Fallback: concatenate all available text columns
                content_parts = []
                for col in original_columns:
                    if col != 'original_id':
                        value = str(row.get(col, '')).strip()
                        if value and value != 'nan' and not pd.isna(value):
                            col_name = col.replace('original_', '').title()
                            content_parts.append(f"{col_name}: {value}")
                
                record['content'] = '\n\n'.join(content_parts)
            
            # Only add if content is not empty
            if record['content'].strip():
                processed_records.append(record)
    
    if processed_records:
        processed_df = pd.DataFrame(processed_records)
        print(f"\n✅ Successfully processed {len(processed_df)} error cases with content")
        return processed_df
    else:
        print(f"\n❌ No valid content found in error cases")
        return pd.DataFrame()

# Process content for all error cases
if len(all_errors) > 0:
    processed_errors = process_content_by_dataset_type(all_errors)
    
    if len(processed_errors) > 0:
        print(f"\n📊 Content processing summary:")
        print(f"  Total records with content: {len(processed_errors)}")
        print(f"  Average content length: {processed_errors['content'].str.len().mean():.0f} characters")
        print(f"  Content length range: {processed_errors['content'].str.len().min()} - {processed_errors['content'].str.len().max()}")
        
        # Show sample of processed content
        print(f"\n📄 Sample processed content:")
        for i, (_, row) in enumerate(processed_errors.head(2).iterrows()):
            print(f"\nExample {i+1} ({row['dataset_name']}):")
            print(f"Label: {'SCAM' if row['actual_label'] == 1 else 'LEGITIMATE'}")
            print(f"Error Type: {row['error_type']}")
            print(f"Source: {row['source']}")
            content_preview = row['content'][:200] + "..." if len(row['content']) > 200 else row['content']
            print(f"Content: {content_preview}")
            print("-" * 50)
    else:
        print("❌ No content could be processed")
else:
    print("❌ No error data available to process")


Found original content columns: ['original_message', 'original_subject', 'original_body', 'original_source']

📧 Processing phishing_sms_dataset (1261 error cases)
  Detected as SMS dataset (based on dataset name)

📧 Processing unified_phishing_email_dataset (2925 error cases)
  Detected as EMAIL dataset (based on dataset name)

✅ Successfully processed 4186 error cases with content

📊 Content processing summary:
  Total records with content: 4186
  Average content length: 2622 characters
  Content length range: 3 - 112031

📄 Sample processed content:

Example 1 (phishing_sms_dataset):
Label: SCAM
Error Type: False Negative
Source: sms
Content: SMS. ac Sptv: The New Jersey Devils and the Detroit Red Wings play Ice Hockey. Correct or Incorrect? End? Reply END SPTV
--------------------------------------------------

Example 2 (phishing_sms_dataset):
Label: LEGITIMATE
Error Type: False Positive
Source: sms
Content: R Ì_ going 4 today's meeting?
---------------------------------------------

In [4]:
def deduplicate_error_cases(processed_df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove duplicate data points based on content.
    Keep the first occurrence and track which models/runs had errors on each data point.
    """
    if len(processed_df) == 0:
        return pd.DataFrame()
    
    print(f"🔍 Deduplicating {len(processed_df)} error cases...")
    
    # Find duplicates based on content and label (case-insensitive content comparison)
    processed_df['content_normalized'] = processed_df['content'].str.lower().str.strip()
    
    # Group by normalized content and label to find duplicates
    duplicates_info = processed_df.groupby(['content_normalized', 'actual_label']).agg({
        'dataset_name': lambda x: ', '.join(x.unique()),
        'model': lambda x: ', '.join(x.unique()),
        'run_id': lambda x: ', '.join(x.unique()),
        'error_type': lambda x: ', '.join(x.unique()),
        'source': 'first',  # Keep the source type
        'original_id': 'first',
        'content': 'first',  # Keep original content (not normalized)
        'llm_reason': lambda x: ' | '.join([r for r in x.unique() if r and str(r).strip()])
    }).reset_index()
    
    # Count how many times each content appears
    duplicate_counts = processed_df.groupby(['content_normalized', 'actual_label']).size().reset_index(name='error_count')
    
    # Merge back the counts
    deduplicated = duplicates_info.merge(duplicate_counts, on=['content_normalized', 'actual_label'])
    
    # Drop the normalized content column (we don't need it in final output)
    deduplicated = deduplicated.drop('content_normalized', axis=1)
    
    # Rename columns for clarity
    deduplicated = deduplicated.rename(columns={
        'actual_label': 'label',
        'dataset_name': 'source_datasets',
        'model': 'failed_models',
        'run_id': 'source_runs',
        'error_type': 'error_types'
    })
    
    print(f"✅ Deduplication complete!")
    print(f"  Before: {len(processed_df)} error cases")
    print(f"  After: {len(deduplicated)} unique data points")
    print(f"  Duplicates removed: {len(processed_df) - len(deduplicated)}")
    
    # Show duplicate statistics
    duplicate_stats = deduplicated['error_count'].value_counts().sort_index()
    print(f"\\n📊 Duplicate statistics:")
    for count, freq in duplicate_stats.items():
        if count == 1:
            print(f"  {freq} data points appeared in 1 error case (unique)")
        else:
            print(f"  {freq} data points appeared in {count} error cases (duplicated)")
    
    # Show most problematic data points (those that multiple models got wrong)
    most_problematic = deduplicated.nlargest(5, 'error_count')
    if len(most_problematic) > 0:
        print(f"\\n🚨 Most problematic data points (failed by multiple models/runs):")
        for i, (_, row) in enumerate(most_problematic.iterrows(), 1):
            print(f"\\n{i}. Failed {row['error_count']} times")
            print(f"   Label: {'SCAM' if row['label'] == 1 else 'LEGITIMATE'}")
            print(f"   Failed models: {row['failed_models']}")
            print(f"   Content preview: {row['content'][:100]}...")
    
    return deduplicated

# Deduplicate the processed error cases
if len(processed_errors) > 0:
    deduplicated_errors = deduplicate_error_cases(processed_errors)
    
    if len(deduplicated_errors) > 0:
        print(f"\\n📋 Deduplicated dataset shape: {deduplicated_errors.shape}")
        print(f"📋 Columns: {deduplicated_errors.columns.tolist()}")
    else:
        print("❌ No data left after deduplication")
else:
    print("❌ No processed errors to deduplicate")


🔍 Deduplicating 4186 error cases...
✅ Deduplication complete!
  Before: 4186 error cases
  After: 2540 unique data points
  Duplicates removed: 1646
\n📊 Duplicate statistics:
  1451 data points appeared in 1 error case (unique)
  915 data points appeared in 2 error cases (duplicated)
  61 data points appeared in 3 error cases (duplicated)
  37 data points appeared in 4 error cases (duplicated)
  16 data points appeared in 5 error cases (duplicated)
  34 data points appeared in 6 error cases (duplicated)
  11 data points appeared in 7 error cases (duplicated)
  2 data points appeared in 8 error cases (duplicated)
  3 data points appeared in 9 error cases (duplicated)
  2 data points appeared in 11 error cases (duplicated)
  2 data points appeared in 13 error cases (duplicated)
  1 data points appeared in 15 error cases (duplicated)
  3 data points appeared in 16 error cases (duplicated)
  1 data points appeared in 20 error cases (duplicated)
  1 data points appeared in 39 error cases (d

In [5]:
def create_final_unified_dataset(deduplicated_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create the final unified dataset with required columns: id, content, label, source
    """
    if len(deduplicated_df) == 0:
        return pd.DataFrame()
    
    print(f"🎯 Creating final unified dataset with required columns...")
    
    # Create the final dataset with required columns
    final_dataset = pd.DataFrame()
    
    # Generate unique IDs for each data point
    final_dataset['id'] = range(1, len(deduplicated_df) + 1)
    
    # Add content (already processed and deduplicated)
    final_dataset['content'] = deduplicated_df['content'].values
    
    # Add label (0 = legitimate, 1 = scam)
    final_dataset['label'] = deduplicated_df['label'].values
    
    # Add source (email, sms, or other)
    final_dataset['source'] = deduplicated_df['source'].values
    
    # Validate the final dataset
    print(f"✅ Final dataset created successfully!")
    print(f"📏 Shape: {final_dataset.shape}")
    print(f"📋 Columns: {final_dataset.columns.tolist()}")
    
    # Dataset statistics
    print(f"\\n📊 Dataset Statistics:")
    label_counts = final_dataset['label'].value_counts().sort_index()
    total_records = len(final_dataset)
    
    for label, count in label_counts.items():
        label_name = "LEGITIMATE" if label == 0 else "SCAM"
        percentage = (count / total_records) * 100
        print(f"  {label_name} (label={label}): {count} records ({percentage:.1f}%)")
    
    # Source distribution statistics
    print(f"\\n📊 Source Distribution:")
    source_counts = final_dataset['source'].value_counts()
    for source, count in source_counts.items():
        percentage = (count / total_records) * 100
        print(f"  {source}: {count} records ({percentage:.1f}%)")
    
    # Content length statistics
    content_lengths = final_dataset['content'].str.len()
    print(f"\\n📝 Content Length Statistics:")
    print(f"  Average length: {content_lengths.mean():.0f} characters")
    print(f"  Median length: {content_lengths.median():.0f} characters")
    print(f"  Min length: {content_lengths.min()} characters")
    print(f"  Max length: {content_lengths.max()} characters")
    
    # Check for any empty content
    empty_content = final_dataset[final_dataset['content'].str.strip() == '']
    if len(empty_content) > 0:
        print(f"⚠️ Warning: Found {len(empty_content)} records with empty content")
    else:
        print(f"✅ All records have non-empty content")
    
    return final_dataset

# Create the final unified dataset
if len(deduplicated_errors) > 0:
    unified_error_dataset = create_final_unified_dataset(deduplicated_errors)
    
    if len(unified_error_dataset) > 0:
        print(f"\\n🎉 SUCCESS: Unified error dataset created!")
        print(f"📋 Final dataset preview:")
        display(unified_error_dataset.head(10))
    else:
        print("❌ Failed to create final dataset")
else:
    print("❌ No deduplicated data available for final dataset creation")


🎯 Creating final unified dataset with required columns...
✅ Final dataset created successfully!
📏 Shape: (2540, 4)
📋 Columns: ['id', 'content', 'label', 'source']
\n📊 Dataset Statistics:
  LEGITIMATE (label=0): 1056 records (41.6%)
  SCAM (label=1): 1484 records (58.4%)
\n📊 Source Distribution:
  email: 1701 records (67.0%)
  sms: 839 records (33.0%)
\n📝 Content Length Statistics:
  Average length: 2060 characters
  Median length: 356 characters
  Min length: 3 characters
  Max length: 112031 characters
✅ All records have non-empty content
\n🎉 SUCCESS: Unified error dataset created!
📋 Final dataset preview:


Unnamed: 0,id,content,label,source
0,1,&lt;#&gt; w jetton ave if you forgot,0,sms
1,2,&lt;#&gt; %of pple marry with their lovers... becz they hav gud undrstndng dat avoids problems. ...,0,sms
2,3,"'Wnevr i wana fal in luv vth my books, My bed fals in luv vth me..!'' . Yen madodu, nav pretsorg...",0,sms
3,4,(You didn't hear it from me),0,sms
4,5,", ow u dey.i paid 60,400thousad.i told u would call .",0,sms
5,6,08714712388 between 10am-7pm Cost 10p,1,sms
6,7,"0A$NETWORKS allow companies to bill for SMS, so they are responsible for their \suppliers\""",1,sms
7,8,1's finish meeting call me.,0,sms
8,9,1's reach home call me.,0,sms
9,10,10 min later k...,0,sms


In [6]:
# Create additional metadata dataset for analysis
if len(deduplicated_errors) > 0 and len(unified_error_dataset) > 0:
    print("📋 Creating additional metadata for analysis...")
    
    # Create extended dataset with metadata (for optional analysis)
    extended_dataset = unified_error_dataset.copy()
    
    # Add metadata from the deduplicated errors
    extended_dataset['source_datasets'] = deduplicated_errors['source_datasets'].values
    extended_dataset['failed_models'] = deduplicated_errors['failed_models'].values
    extended_dataset['error_count'] = deduplicated_errors['error_count'].values
    extended_dataset['error_types'] = deduplicated_errors['error_types'].values
    extended_dataset['llm_reasoning'] = deduplicated_errors['llm_reason'].values
    
    print(f"✅ Extended dataset created with metadata")
    print(f"📏 Extended dataset shape: {extended_dataset.shape}")
    print(f"📋 Extended columns: {extended_dataset.columns.tolist()}")
    
    # Show analysis of error patterns
    print(f"\\n🔍 ERROR PATTERN ANALYSIS:")
    print("-" * 50)
    
    # Most frequently failed data points
    frequent_errors = extended_dataset.nlargest(5, 'error_count')
    print(f"\\n🚨 Data points that failed most frequently:")
    for i, (_, row) in enumerate(frequent_errors.iterrows(), 1):
        print(f"\\n{i}. ID {row['id']} - Failed {row['error_count']} times")
        print(f"   Label: {'SCAM' if row['label'] == 1 else 'LEGITIMATE'}")
        print(f"   Models that failed: {row['failed_models']}")
        print(f"   Content preview: {row['content'][:100]}...")
    
    # Error distribution by original dataset
    print(f"\\n📊 Error distribution by source dataset:")
    for dataset in deduplicated_errors['source_datasets'].unique():
        # Count how many errors came from each dataset
        dataset_errors = extended_dataset[extended_dataset['source_datasets'].str.contains(dataset)]
        print(f"  {dataset}: {len(dataset_errors)} unique error data points")
    
    # Error type distribution
    print(f"\\n📊 Error type distribution:")
    error_type_counts = {}
    for error_types in extended_dataset['error_types'].values:
        for error_type in error_types.split(', '):
            error_type_counts[error_type] = error_type_counts.get(error_type, 0) + 1
    
    for error_type, count in error_type_counts.items():
        print(f"  {error_type}: {count} data points")
    
    # Content length analysis by label
    print(f"\\n📝 Content length analysis by label:")
    for label in [0, 1]:
        label_data = unified_error_dataset[unified_error_dataset['label'] == label]
        label_name = "LEGITIMATE" if label == 0 else "SCAM"
        if len(label_data) > 0:
            avg_length = label_data['content'].str.len().mean()
            print(f"  {label_name}: Average {avg_length:.0f} characters")
    
    print(f"\\n📋 Sample of extended dataset:")
    display(extended_dataset[['id', 'content', 'label', 'error_count', 'failed_models']].head())

else:
    print("❌ No data available for extended analysis")


📋 Creating additional metadata for analysis...
✅ Extended dataset created with metadata
📏 Extended dataset shape: (2540, 9)
📋 Extended columns: ['id', 'content', 'label', 'source', 'source_datasets', 'failed_models', 'error_count', 'error_types', 'llm_reasoning']
\n🔍 ERROR PATTERN ANALYSIS:
--------------------------------------------------
\n🚨 Data points that failed most frequently:
\n1. ID 1080 - Failed 39 times
   Label: SCAM
   Models that failed: gpt-4.1, Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-1.5B-Instruct, qwen/deepseek-r1-0528-qwen3-8b, unsloth/qwen3-30b-a3b, qwen3-235b-a22b-128k, unsloth/gemma-3-4b-it, qwen/qwen3-8b, qwen/qwen3-235b-a22b, Deepseek-r1-0528-distill-qwen3-32b-preview0-qat, unsloth/qwen3-4b, llama-4-maverick-17b-128e-instruct, llama-4-scout-17b-16e-instruct, unsloth/qwen3-32b, gemini-2.5-flash-preview-05-20, gemma-3-27b-it, qwen3-0.6b, deepseek-r1-0528-distill-qwen3-32b-preview0-qat, gemma3-1b-it
   Content preview: Subject: CNN.com Daily Top 10

Body: >+=+=+=+=+=+=+=+

Unnamed: 0,id,content,label,error_count,failed_models
0,1,&lt;#&gt; w jetton ave if you forgot,0,1,Deepseek-r1-0528-distill-qwen3-32b-preview0-qat
1,2,&lt;#&gt; %of pple marry with their lovers... becz they hav gud undrstndng dat avoids problems. ...,0,2,"unsloth/qwen3-30b-a3b, Deepseek-r1-0528-distill-qwen3-32b-preview0-qat"
2,3,"'Wnevr i wana fal in luv vth my books, My bed fals in luv vth me..!'' . Yen madodu, nav pretsorg...",0,1,unsloth/qwen3-30b-a3b
3,4,(You didn't hear it from me),0,1,unsloth/qwen3-30b-a3b
4,5,", ow u dey.i paid 60,400thousad.i told u would call .",0,2,"unsloth/qwen3-30b-a3b, Deepseek-r1-0528-distill-qwen3-32b-preview0-qat"


In [7]:
# Save the datasets
if len(unified_error_dataset) > 0:
    print("💾 Saving unified error dataset...")
    
    # Create output directory
    output_dir = Path('unified_error_dataset')
    output_dir.mkdir(exist_ok=True)
    
    # Save the main dataset (id, content, label only)
    main_file = output_dir / 'unified_error_dataset.csv'
    unified_error_dataset.to_csv(main_file, index=False)
    print(f"✅ Main dataset saved: {main_file}")
    print(f"   Columns: {unified_error_dataset.columns.tolist()}")
    print(f"   Shape: {unified_error_dataset.shape}")
    
    # Save the extended dataset with metadata (if available)
    if 'extended_dataset' in locals() and len(extended_dataset) > 0:
        extended_file = output_dir / 'unified_error_dataset_with_metadata.csv'
        extended_dataset.to_csv(extended_file, index=False)
        print(f"✅ Extended dataset saved: {extended_file}")
        print(f"   Columns: {extended_dataset.columns.tolist()}")
        print(f"   Shape: {extended_dataset.shape}")
    
    # Save dataset information
    info_file = output_dir / 'dataset_info.json'
    dataset_info = {
        'creation_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
        'total_records': len(unified_error_dataset),
        'label_distribution': {
            'legitimate': int(unified_error_dataset[unified_error_dataset['label'] == 0].shape[0]),
            'scam': int(unified_error_dataset[unified_error_dataset['label'] == 1].shape[0])
        },
        'content_length_stats': {
            'mean': float(unified_error_dataset['content'].str.len().mean()),
            'median': float(unified_error_dataset['content'].str.len().median()),
            'min': int(unified_error_dataset['content'].str.len().min()),
            'max': int(unified_error_dataset['content'].str.len().max())
        },
        'source_info': {
            'total_original_errors': len(all_errors) if len(all_errors) > 0 else 0,
            'after_processing': len(processed_errors) if len(processed_errors) > 0 else 0,
            'after_deduplication': len(deduplicated_errors) if len(deduplicated_errors) > 0 else 0,
            'duplicates_removed': len(processed_errors) - len(deduplicated_errors) if len(processed_errors) > 0 and len(deduplicated_errors) > 0 else 0
        },
        'columns': {
            'main_dataset': unified_error_dataset.columns.tolist(),
            'extended_dataset': extended_dataset.columns.tolist() if 'extended_dataset' in locals() else []
        }
    }
    
    with open(info_file, 'w') as f:
        json.dump(dataset_info, f, indent=2)
    print(f"✅ Dataset info saved: {info_file}")


💾 Saving unified error dataset...
✅ Main dataset saved: unified_error_dataset/unified_error_dataset.csv
   Columns: ['id', 'content', 'label', 'source']
   Shape: (2540, 4)
✅ Extended dataset saved: unified_error_dataset/unified_error_dataset_with_metadata.csv
   Columns: ['id', 'content', 'label', 'source', 'source_datasets', 'failed_models', 'error_count', 'error_types', 'llm_reasoning']
   Shape: (2540, 9)
✅ Dataset info saved: unified_error_dataset/dataset_info.json
