# Hate Speech Detection - Data Exploration

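This notebook explores the Hate Speech and Offensive Language Dataset, applies the text preprocessing pipeline, and saves the processed data for the next steps of the project.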

**Project:** SSY340 - Unsupervised Learning for Hate Speech Detection  
**Group:** 13  
**Dataset:** Hate Speech and Offensive Language Dataset (~25,000 tweets)

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Project modules are imported from ../src, so extend the path first.
sys.path.append('../src')

from data_loader import DataLoader
from preprocessing import TextPreprocessor, get_text_stats

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots.
sns.set_style('whitegrid')

# Confirmation message shown once the setup cell has run.
print("Imports complete!")

## 1. Load Data

In [None]:
# Load dataset
# `loader` and `df` are reused by later cells, so their names must stay stable.
loader = DataLoader('../data/raw')  # project loader pointed at the raw-data folder
df = loader.load_hate_speech_dataset()  # presumably a pandas DataFrame (used via .shape/.head()) — confirm in data_loader

print(f"Dataset shape: {df.shape}")
# Last expression of the cell: the notebook displays the first rows.
df.head()

In [None]:
# Get dataset information
# Delegates to the project DataLoader helper; as the cell's last expression,
# any value it returns is displayed by the notebook.
# NOTE(review): what the helper prints/returns is defined in data_loader — confirm there.
loader.get_dataset_info(df)

## 2. Exploratory Data Analysis

In [None]:
# Label distribution: bar chart of raw counts beside a pie chart of shares.
class_counts = df['class'].value_counts()  # computed once, reused by both plots and the printout

fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: absolute number of tweets per class (horizontal tick labels).
class_counts.plot(kind='bar', ax=ax_bar, rot=0)
ax_bar.set_xlabel('Class')
ax_bar.set_ylabel('Count')
ax_bar.set_title('Label Distribution')

# Right panel: share of each class; the default y-label is blanked out.
class_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax_pie)
ax_pie.set_ylabel('')
ax_pie.set_title('Label Distribution (%)')

fig.tight_layout()
plt.show()

# Same information as text: counts first, then normalized proportions.
print("\nClass distribution:")
print(class_counts)
print("\nClass proportions:")
print(df['class'].value_counts(normalize=True))

In [None]:
# Text length analysis
text_column = 'tweet'  # Change if different

# Per-tweet size features, stored on df so later cells can reuse them:
# character count and whitespace-separated token count.
df['text_length'] = df[text_column].str.len()
df['word_count'] = df[text_column].str.split().str.len()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Character length distribution
axes[0, 0].hist(df['text_length'], bins=50, edgecolor='black')
axes[0, 0].set_xlabel('Character Length')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Distribution of Text Length')

# Word count distribution
axes[0, 1].hist(df['word_count'], bins=50, edgecolor='black', color='orange')
axes[0, 1].set_xlabel('Word Count')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Distribution of Word Count')

# Text length by class
# All classes share one set of bin edges (derived from the full column) so the
# overlaid histograms are directly comparable; letting each class pick its own
# 30 bins gives bars of different widths and positions.
length_bins = np.histogram_bin_edges(df['text_length'], bins=30)
# Sorted labels give a stable legend/colour order, matching the order used
# when printing sample tweets per class.
for class_label in sorted(df['class'].unique()):
    class_lengths = df.loc[df['class'] == class_label, 'text_length']
    axes[1, 0].hist(class_lengths, bins=length_bins, alpha=0.5, label=f'Class {class_label}')
axes[1, 0].set_xlabel('Character Length')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_title('Text Length by Class')
axes[1, 0].legend()

# Box plot of text length by class
df.boxplot(column='text_length', by='class', ax=axes[1, 1])
axes[1, 1].set_xlabel('Class')
axes[1, 1].set_ylabel('Character Length')
axes[1, 1].set_title('Text Length Distribution by Class')

# Clear the figure-level title so only the per-axes titles set above remain.
plt.suptitle('')
plt.tight_layout()
plt.show()

print("\nText statistics:")
print(df[['text_length', 'word_count']].describe())

In [None]:
# Show three reproducibly sampled tweets (fixed seed) for every class label,
# visiting the labels in sorted order.
print("Sample tweets from each class:\n")
for class_label in sorted(df['class'].unique()):
    print(f"\n=== Class {class_label} ===")
    class_tweets = df.loc[df['class'] == class_label, text_column]
    picked = class_tweets.sample(3, random_state=42)
    for position, tweet_text in enumerate(picked.tolist(), start=1):
        print(f"{position}. {tweet_text}")

## 3. Text Preprocessing

In [None]:
# Build the project text preprocessor; `preprocessor` is reused by later cells.
# Enabled flags: lowercase, remove_urls, remove_mentions.
# Disabled flags: remove_stopwords, remove_hashtags.
preprocessor = TextPreprocessor(
    lowercase=True,
    remove_urls=True,
    remove_mentions=True,
    remove_stopwords=False,
    remove_hashtags=False,
)

# Five reproducibly sampled tweets (fixed seed) to eyeball the cleaning result.
sample_tweets = df[text_column].sample(5, random_state=42).tolist()

print("Preprocessing examples:\n")
for position, raw_tweet in enumerate(sample_tweets, start=1):
    # Clean first, then print the original/cleaned pair side by side.
    cleaned_tweet = preprocessor.clean_text(raw_tweet)
    print(f"{position}. Original: {raw_tweet}")
    print(f"   Cleaned:  {cleaned_tweet}\n")

In [None]:
# Run the preprocessor over the whole frame; the cleaned text goes into the
# 'cleaned_text' column of the returned frame.
df_processed = preprocessor.preprocess_dataframe(df, text_column, 'cleaned_text')


def _report_text_stats(heading, texts):
    """Print `heading`, then each statistic of `texts` with two decimals.

    The statistics come from the project helper `get_text_stats`; its result
    is iterated with `.items()` and returned unchanged to the caller.
    """
    print(heading)
    stats = get_text_stats(texts)
    for stat_name, stat_value in stats.items():
        print(f"  {stat_name}: {stat_value:.2f}")
    return stats


# Compare the statistics of the raw and the cleaned text.
stats_before = _report_text_stats("Before preprocessing:", df[text_column])
stats_after = _report_text_stats("\nAfter preprocessing:", df_processed['cleaned_text'])

## 4. Save Processed Data

In [None]:
# Save processed data
output_path = '../data/processed/processed_data.csv'

# DataFrame.to_csv does not create missing parent folders and fails with an
# OSError on a fresh checkout, so make sure the target directory exists first
# (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False: the row index is not written as a column.
df_processed.to_csv(output_path, index=False)
print(f"Saved processed data to {output_path}")
print(f"Shape: {df_processed.shape}")

## Summary

This notebook explored the Hate Speech and Offensive Language Dataset, including:
- Dataset structure and label distribution
- Text length and word count statistics
- Sample tweets from each class
- Text preprocessing pipeline and export of the processed data to `../data/processed/processed_data.csv`

Next steps:
1. Generate embeddings using SBERT models
2. Apply clustering algorithms (K-Means, DBSCAN)
3. Evaluate the clustering results using the Adjusted Rand Index (ARI) and other metrics