# Data Cleaning and Preprocessing

In [18]:
import os
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re

# 1. Read the raw tab-separated dataset and preview its first rows
raw_tsv_path = './data/raw/hate_speech_dataset.tsv'
df = pd.read_csv(raw_tsv_path, sep='\t')
df.head()

Unnamed: 0,post,class
0,""" : jewish harvard professor noel ignatiev w...",implicit_hate
1,b.higher education is a part of european cult...,not_hate
2,"has a problem with "" the whites "" "" and "" "" ...",not_hate
3,is yasir qadhi a hate preacher for calling ch...,not_hate
4,"rt "" : how three million germans mass murder...",not_hate


In [19]:
# Define a text cleaning function
# Stop-word collection consulted by clean_text: scikit-learn's built-in
# English list (imported from sklearn.feature_extraction.text above).
stopwords = ENGLISH_STOP_WORDS

def clean_text(text: str, stop_words=None) -> str:
    """Normalize a raw post into a space-separated string of content tokens.

    Steps, in order: lowercase; remove URLs, @mentions/#hashtags and HTML
    entities; remove the retweet marker ``rt``; replace every character that
    is not ``a-z``, whitespace or an apostrophe with a space; split on
    whitespace; strip apostrophes from token edges; drop stop words and
    tokens of length <= 1.

    Args:
        text: Raw post text. Any non-string value (e.g. NaN) yields "".
        stop_words: Optional collection of lowercase words to discard.
            When omitted, the module-level ``stopwords`` is used.

    Returns:
        The cleaned text, or "" if no token survives.
    """
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = stopwords

    # Lowercase
    text = text.lower()
    # Remove URLs first: stripping "rt" before this step would break links
    # such as "http://rt.com/..." apart and leak "com"/path fragments.
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    # Remove mentions and hashtags
    text = re.sub(r"[@#]\w+", " ", text)
    # Remove HTML entities
    text = re.sub(r"&\w+;", " ", text)
    # Remove "rt" as retweet marker
    text = re.sub(r"\brt\b", " ", text)
    # Remove non-letter characters, keep spaces and apostrophes
    text = re.sub(r"[^a-z\s']", " ", text)

    # Tokenize on whitespace (split() also collapses repeated spaces) and
    # strip quote-like apostrophes from token edges so that "'the'" is
    # recognized as the stop word "the"; inner apostrophes ("don't") stay.
    tokens = (tok.strip("'") for tok in text.split())
    # Remove stopwords and very short tokens
    kept = [tok for tok in tokens if len(tok) > 1 and tok not in stop_words]
    # Join back to string
    return " ".join(kept)

# Clean every post; clean_text returns "" for non-string values
df['cleaned_post'] = df['post'].apply(clean_text)

# Drop rows that are empty after cleaning (only 30 rows).
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
df = df[df["cleaned_post"] != ""].copy()

# Map class labels to numeric codes
mul_class_mapping = {
    'not_hate': 0,
    'implicit_hate': 1,
    'explicit_hate': 2
}
# Binary variant: implicit and explicit hate are merged into one positive class
bi_class_mapping = {
    'not_hate': 0,
    'implicit_hate': 1,
    'explicit_hate': 1
}
df['bi_class'] = df['class'].map(bi_class_mapping)
df['mul_class'] = df['class'].map(mul_class_mapping)

# Series.map yields NaN for labels missing from the mapping; fail loudly here
# instead of silently writing NaN labels to disk. Both mappings share the same
# keys, so checking one column is enough.
unmapped_labels = df.loc[df['mul_class'].isna(), 'class'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected class labels: {unmapped_labels.tolist()}")

# Keep only needed columns
cleaned_df = df[['cleaned_post', 'class', 'bi_class', 'mul_class']]

# Save the cleaned data
output_dir = './data/cleaned'
os.makedirs(output_dir, exist_ok=True)

# NOTE: despite the .tsv extension, to_csv writes comma-separated values here
# (default separator). The vectorization cells read the file back with
# read_csv's default separator, so both sides must change together.
output_path = os.path.join(output_dir, 'hate_speech_dataset.tsv')
cleaned_df.to_csv(output_path, index=False)

cleaned_df.head()

Unnamed: 0,cleaned_post,class,bi_class,mul_class
0,jewish harvard professor noel ignatiev wants a...,implicit_hate,1,1
1,higher education european culture imported con...,not_hate,0,0
2,problem whites christians ahead free say,not_hate,0,0
3,yasir qadhi hate preacher calling christians f...,not_hate,0,0
4,million germans mass murdered destruction reich,not_hate,0,0


# Vectorization

## TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Load the cleaned data.
# keep_default_na=False stops pandas from turning cleaned posts that happen to
# be literal NA markers (e.g. "null", "nan") into NaN, which .astype(str) would
# then rewrite as the string "nan".
df = pd.read_csv("./data/cleaned/hate_speech_dataset.tsv", keep_default_na=False)

# Text and labels
X_text = df["cleaned_post"].astype(str)
y = df["bi_class"].values   # Binary class labels 0/1

# Split train and test sets (for later model training)
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,   # Stratified sampling by class
)

# Define TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    max_features=5000,      # Cap the vocabulary at 5000 terms
    ngram_range=(1, 2),     # Use unigram + bigram
    min_df=5,               # Keep terms that appear in at least 5 documents
)

# Fit on the training set only, then transform both sets with the fitted
# vocabulary so no test-set statistics leak into the features
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

print("Training set TF-IDF shape:", X_train_tfidf.shape)
print("Test set TF-IDF shape:", X_test_tfidf.shape)

Training set TF-IDF shape: (17160, 5000)
Test set TF-IDF shape: (4290, 5000)


## Word Embeddings

In [9]:
import numpy as np
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

# Load the cleaned data.
# keep_default_na=False stops pandas from turning cleaned posts that happen to
# be literal NA markers (e.g. "null", "nan") into NaN, which .astype(str) would
# then rewrite as the string "nan".
df = pd.read_csv("./data/cleaned/hate_speech_dataset.tsv", keep_default_na=False)

y_bi = df["bi_class"].values     # Binary class 0 1

# Split train/test for later modeling
X_text = df["cleaned_post"].astype(str)

X_train_text, X_test_text, y_train_bi, y_test_bi = train_test_split(
    X_text,
    y_bi,
    test_size=0.2,
    random_state=42,
    stratify=y_bi,
)

# Tokenization (training texts only, so the embeddings never see test data)
sentences = [word_tokenize(row) for row in X_train_text]

# Train Word2Vec model
emb_dim = 100

# NOTE(review): per the gensim docs, training with workers > 1 is not
# bit-for-bit reproducible; use workers=1 (and a fixed PYTHONHASHSEED) if
# exact reproducibility of the embeddings matters.
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=emb_dim,
    window=5,
    min_count=5,   # Ignore words with fewer than 5 total occurrences in the corpus
    workers=4,
    sg=1           # 1 for skip-gram, 0 for CBOW
)

# Represent a sentence as the mean of its in-vocabulary word vectors
def sentence_to_vec(sentence: str, model: Word2Vec, emb_dim: int) -> np.ndarray:
    """Average the embeddings of the sentence's tokens known to ``model``.

    Args:
        sentence: Text to embed; it is tokenized with ``word_tokenize``.
        model: Word2Vec model whose ``wv`` lookup supplies the word vectors.
        emb_dim: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or ``np.zeros(emb_dim)`` if none are found.
    """
    known_vectors = [
        model.wv[token]
        for token in word_tokenize(sentence)
        if token in model.wv
    ]
    if not known_vectors:
        return np.zeros(emb_dim)
    return np.mean(known_vectors, axis=0)

# Build one averaged-embedding row per post, then stack the rows into the
# train and test feature matrices
train_vectors = [sentence_to_vec(text, w2v_model, emb_dim) for text in X_train_text]
X_train_emb = np.vstack(train_vectors)

test_vectors = [sentence_to_vec(text, w2v_model, emb_dim) for text in X_test_text]
X_test_emb = np.vstack(test_vectors)

print("Training set embedding shape:", X_train_emb.shape)
print("Testing set embedding shape:", X_test_emb.shape)

Training set embedding shape: (17160, 100)
Testing set embedding shape: (4290, 100)
