# Toxic Comment Classification

## Multi-Label Classification of Wikipedia Comments

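This notebook explores the training data and demonstrates text preprocessing for multi-label toxic comment classification. The machine learning and deep learning models themselves are listed under Next Steps.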

In [None]:
import os
import sys
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Put the parent directory on the import path so the `src` package resolves.
sys.path.append('..')

from src.data.loader import FileBasedDataLoader
from src.data.preprocessor import TextPreprocessor

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn after this cell.
sns.set_style('whitegrid')

## 1. Load Data

In [None]:
# Load the training split from ../data through the project's file-based loader.
loader = FileBasedDataLoader(data_dir='../data')
X_train, y_train = loader.load_train_data()

# Quick sanity check: sample count and the names of the label columns.
print(f"Training samples: {len(X_train)}")
print(f"Label columns: {y_train.columns.tolist()}")

# Preview the first rows of the features, then of the labels.
print(f"\nFirst few rows:")
for frame in (X_train, y_train):
    display(frame.head())

## 2. Exploratory Data Analysis

### 2.1 Label Distribution

In [None]:
# Column-wise sum of the label frame, i.e. the number of comments per label
# (assumes 0/1 indicator columns -- TODO confirm against the loader).
# Sorted ascending so the largest count is drawn as the top bar of the chart.
label_counts = y_train.sum().sort_values(ascending=True)

# savefig does not create missing directories; make sure the target folder
# exists (e.g. on a fresh checkout) so the cell survives Restart & Run All.
figure_path = '../report/figures/label_distribution.png'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
label_counts.plot(kind='barh', color='coral', ax=ax)
ax.set_xlabel('Number of Comments')
ax.set_ylabel('Label')
ax.set_title('Distribution of Toxicity Labels in Training Data')
fig.tight_layout()
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

# Same numbers as the chart, plus each label's share of all training rows.
print("\nLabel Statistics:")
for label in label_counts.index:
    count = label_counts[label]
    percentage = (count / len(y_train)) * 100
    print(f"{label:15s}: {count:6d} ({percentage:5.2f}%)")

### 2.2 Multi-Label Statistics

In [None]:
# Row-wise sum of the label frame, i.e. how many labels each comment carries
# (assumes 0/1 indicator columns -- TODO confirm against the loader).
labels_per_comment = y_train.sum(axis=1)

# A comment can carry anywhere from 0 to all labels; derive the upper bound
# from the data instead of hard-coding it so the table adapts to the label set.
n_labels = y_train.shape[1]

# savefig does not create missing directories; make sure the target folder
# exists (e.g. on a fresh checkout) so the cell survives Restart & Run All.
figure_path = '../report/figures/multilabel_distribution.png'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
labels_per_comment.value_counts().sort_index().plot(kind='bar', color='steelblue', ax=ax)
ax.set_xlabel('Number of Labels')
ax.set_ylabel('Number of Comments')
ax.set_title('Distribution of Label Count per Comment')
fig.tight_layout()
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

# Unlike the chart (which only shows observed values), the table lists every
# possible label count, including those that never occur.
print("\nMulti-Label Statistics:")
for i in range(n_labels + 1):
    count = (labels_per_comment == i).sum()
    percentage = (count / len(y_train)) * 100
    print(f"{i} label(s): {count:6d} ({percentage:5.2f}%)")

### 2.3 Comment Length Analysis

In [None]:
# Upper limits of the plotted windows. Longer comments are left out of the
# histograms but are still included in the describe() summaries below.
MAX_CHARS_SHOWN = 2000
MAX_WORDS_SHOWN = 400
N_BINS = 50

# Length of every comment in characters and in whitespace-separated tokens.
comment_lengths = X_train['comment_text'].str.len()
word_counts = X_train['comment_text'].str.split().str.len()

# savefig does not create missing directories; make sure the target folder
# exists (e.g. on a fresh checkout) so the cell survives Restart & Run All.
figure_path = '../report/figures/comment_length_distribution.png'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)

fig, (ax_chars, ax_words) = plt.subplots(1, 2, figsize=(12, 5))

# Restrict the bins to the displayed window: without `range`, the bins would
# span the full data range and only a few wide bars would remain visible once
# the x-axis is clipped.
ax_chars.hist(comment_lengths, bins=N_BINS, range=(0, MAX_CHARS_SHOWN),
              color='green', alpha=0.7, edgecolor='black')
ax_chars.set_xlabel('Character Count')
ax_chars.set_ylabel('Frequency')
ax_chars.set_title('Distribution of Comment Lengths (Characters)')
ax_chars.set_xlim(0, MAX_CHARS_SHOWN)

ax_words.hist(word_counts, bins=N_BINS, range=(0, MAX_WORDS_SHOWN),
              color='purple', alpha=0.7, edgecolor='black')
ax_words.set_xlabel('Word Count')
ax_words.set_ylabel('Frequency')
ax_words.set_title('Distribution of Comment Lengths (Words)')
ax_words.set_xlim(0, MAX_WORDS_SHOWN)

fig.tight_layout()
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

# Summary statistics over ALL comments, including those beyond the plot limits.
print("\nComment Length Statistics (characters):")
print(comment_lengths.describe())
print("\nComment Length Statistics (words):")
print(word_counts.describe())

### 2.4 Label Correlation

In [None]:
# Pairwise correlation between the label columns (DataFrame.corr defaults).
correlation_matrix = y_train.corr()

# savefig does not create missing directories; make sure the target folder
# exists (e.g. on a fresh checkout) so the cell survives Restart & Run All.
figure_path = '../report/figures/label_correlation.png'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 8))
# center=0 anchors the diverging colormap at zero correlation.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Correlation Matrix of Toxicity Labels')
fig.tight_layout()
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

## 3. Text Preprocessing

In [None]:
# Preprocessor configured with remove_stopwords=True and use_lemmatization=False;
# the same instance is reused by the word frequency analysis further down.
preprocessor = TextPreprocessor(remove_stopwords=True, use_lemmatization=False)

# Hand-written examples used to eyeball what clean_text does to a comment.
sample_texts = [
    "I can't believe you're so stupid!",
    "This is a clean comment about the article.",
    "You're an idiot and shouldn't be allowed here!!!"
]

print("Sample Text Preprocessing:\n")
for text in sample_texts:
    cleaned = preprocessor.clean_text(text)
    # One call per sample: original line, cleaned line, then a blank separator.
    print(f"Original: {text}", f"Cleaned:  {cleaned}", "", sep="\n")

## 4. Word Frequency Analysis

In [None]:
# Only the first N_SAMPLE_COMMENTS comments are analysed (presumably to keep
# the runtime of this cell down); the plot shows the TOP_N_WORDS most frequent.
N_SAMPLE_COMMENTS = 10000
TOP_N_WORDS = 20

print("Processing text for word frequency analysis...")
# NOTE(review): assumes preprocess_batch returns one cleaned string per input
# comment -- confirm against TextPreprocessor.
cleaned_comments = preprocessor.preprocess_batch(
    X_train['comment_text'].head(N_SAMPLE_COMMENTS).tolist()
)

# Flatten the cleaned comments into one token list and count occurrences.
all_words = [word for comment in cleaned_comments for word in comment.split()]
word_freq = Counter(all_words)
most_common = word_freq.most_common(TOP_N_WORDS)

if most_common:
    # Split the (word, count) pairs into two parallel sequences for plotting.
    words, counts = zip(*most_common)

    # savefig does not create missing directories; make sure the target folder
    # exists (e.g. on a fresh checkout) so the cell survives Restart & Run All.
    figure_path = '../report/figures/word_frequency.png'
    os.makedirs(os.path.dirname(figure_path), exist_ok=True)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(words, counts, color='teal')
    ax.set_xlabel('Words')
    ax.set_ylabel('Frequency')
    # Use the number of bars actually drawn so the title stays accurate when
    # fewer than TOP_N_WORDS distinct words exist.
    ax.set_title(f'Top {len(words)} Most Frequent Words (after preprocessing)')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    fig.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()
else:
    # zip(*[]) cannot be unpacked into two names, so skip plotting explicitly
    # instead of failing with an opaque ValueError.
    print("No words left after preprocessing; skipping the frequency plot.")

print(f"\nTotal unique words: {len(word_freq)}")
print(f"Total words: {len(all_words)}")

## 5. Next Steps

- Implement baseline models (TF-IDF + Logistic Regression, Naive Bayes)
- Implement deep learning model (BiLSTM)
- Run k-fold cross-validation
- Compare and evaluate the models