# Misinformation Classifier - Exploratory Analysis

This notebook explores the sample dataset for the misinformation classifier: it loads the data, shows a few examples, and then examines the label distribution and the text lengths.

In [1]:
import json
from collections import Counter
from dataclasses import dataclass
from typing import List, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

@dataclass
class Config:
    """Settings for the misinformation classifier.

    Every attribute is a dataclass field, so any of them can be overridden
    by keyword when constructing the object, e.g.
    ``Config(batch_size=32, label_names=["a", "b"])``.

    ``label_names`` defaults to ``None`` and is replaced in
    ``__post_init__`` by a fresh list of the five default label names, so
    instances never share one mutable list.
    """

    # Model identifier and input settings
    model_name: str = "distilbert-base-uncased"
    num_labels: int = 5
    max_length: int = 128

    # Optimisation hyperparameters
    batch_size: int = 16
    learning_rate: float = 2e-5
    num_epochs: int = 3
    warmup_steps: int = 100
    weight_decay: float = 0.01

    # Dataset split fractions (the defaults sum to 1.0)
    train_split: float = 0.8
    val_split: float = 0.1
    test_split: float = 0.1

    # Filesystem locations
    data_dir: str = "data"
    results_dir: str = "results"
    model_save_path: str = "results/best_model"

    # Annotated so that it is a real dataclass field: without the annotation
    # it would be a plain class attribute that cannot be passed to the
    # constructor and is left out of repr()/==/asdict().
    label_names: Optional[List[str]] = None

    def __post_init__(self):
        """Install the default label names when none were supplied."""
        if self.label_names is None:
            self.label_names = [
                "central_route_present",
                "peripheral_route_present",
                "naturalness_bias",
                "availability_bias",
                "illusory_correlation",
            ]

def load_data_simple(file_path, label_names):
    """Load texts and multi-label rows from a JSON file.

    The file must contain a JSON array of objects. Each object needs a
    ``'text'`` key; label values are read from the ``frameworkN_featureM``
    keys and default to ``0`` when a key is absent.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).
        label_names: Label names that determine which columns are returned
            and in which order. When ``None`` or empty, all five known
            labels are returned in their canonical order. A name without a
            known framework field is looked up in each record under the
            name itself.

    Returns:
        A ``(texts, labels)`` tuple: ``texts`` is a list of the ``'text'``
        values and ``labels`` is a list with one row per record, each row
        holding one value per requested label.

    Raises:
        KeyError: If a record has no ``'text'`` key.
    """
    # Label name -> key under which the value is stored in each record.
    label_to_field = {
        'central_route_present': 'framework1_feature1',
        'peripheral_route_present': 'framework1_feature2',
        'naturalness_bias': 'framework2_feature1',
        'availability_bias': 'framework2_feature2',
        'illusory_correlation': 'framework2_feature3',
    }

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Follow the caller's label order so the returned columns line up with
    # the names the caller uses to index them.
    requested = list(label_names) if label_names else list(label_to_field)
    fields = [label_to_field.get(name, name) for name in requested]

    texts = [item['text'] for item in data]
    labels = [[item.get(field, 0) for field in fields] for item in data]

    return texts, labels

# Build the default configuration; the cells below read config.label_names.
config = Config()
print("Setup complete!")

ModuleNotFoundError: No module named 'pandas'

## Load and Explore Data

In [None]:
# Load sample data
texts, labels = load_data_simple('../data/raw/sample_data.json', config.label_names)

print(f"Dataset size: {len(texts)}")
print(f"Labels: {config.label_names}")
print("\nFirst few examples:")
# Preview at most three records. Slicing (rather than indexing 0..2) keeps
# this cell working when the sample holds fewer than three records.
for text, label_row in zip(texts[:3], labels[:3]):
    print(f"Text: {text}")
    print(f"Labels: {label_row}")
    print()

## Label Distribution Analysis

In [None]:
# Assemble a table with the raw text plus one column per label
frame_columns = {'text': texts}
for position, label in enumerate(config.label_names):
    frame_columns[label] = [row[position] for row in labels]
df = pd.DataFrame(frame_columns)

# Per-label totals and their share of all rows
label_counts = df[config.label_names].sum()
n_rows = len(df)
print("Label distribution:")
for label, count in label_counts.items():
    percentage = (count / n_rows) * 100
    print(f"{label:25}: {count:3d} ({percentage:5.1f}%)")

In [None]:
# Bar chart of how often each label occurs
fig, ax = plt.subplots(figsize=(12, 6))
label_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Psychological Mechanism Labels')
ax.set_xlabel('Labels')
ax.set_ylabel('Count')
# Tilt the label names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## Text Length Analysis

In [None]:
# Number of whitespace-separated words in each text
text_lengths = [len(text.split()) for text in texts]

# (caption, reducer, format spec) for each summary line, in print order
summary_spec = (
    ("Mean", np.mean, ".1f"),
    ("Median", np.median, ".1f"),
    ("Min", np.min, ""),
    ("Max", np.max, ""),
)

print("Text length statistics (words):")
for caption, reducer, spec in summary_spec:
    print(f"{caption}: {format(reducer(text_lengths), spec)}")

# Histogram of the word counts
fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(text_lengths, bins=20, alpha=0.7)
ax.set_title('Distribution of Text Lengths (Words)')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
plt.show()