In [1]:
import pandas as pd
import re
import string
from collections import Counter
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load Corpus

In [3]:
classes = ["Hate", "Normal", "Offensive"]
dfs = []

for cls in classes:
    for i in range(1, 5):  # files 1 to 4
        file_name = f"{cls}_Speeches_{i}.csv"
        tmp = pd.read_csv(file_name, usecols=["created_at", "full_text"])
        tmp["class"] = cls
        dfs.append(tmp)

df = pd.concat(dfs, ignore_index=True)
len(df[df["class"] == "Hate"])

966

### Case 1: Chi-Squared
For this case we will do some preprocessing for better results

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
import numpy as np
import spacy
import re

nlp = spacy.load("en_core_web_sm")


def preprocess(text):
    text = re.sub(r'[^a-z\s]', '', text.lower())
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if 
                     (token.pos_ == "NOUN" or token.pos_ == "ADJ") and 
                     not token.is_stop and 
                     len(token) > 2 and
                     not re.search(r"\d", token.text)]) # The token does not contain digits


chi2_results = {}

texts = df["full_text"].astype(str).tolist()
labels = (df["class"] == "Hate").astype(int).values  # 1 = Hate, 0 = Non-Hate

# Vectorize with adaptive min_df
min_df_val = min(5, max(1, len(df)//10))
count_vec = CountVectorizer(min_df=min_df_val, ngram_range=(1,1))
X_count = count_vec.fit_transform(texts)


vocab = np.array(count_vec.get_feature_names_out())

# Chi² test
chi2_scores, pvals = chi2(X_count, labels)

# Compute per-class counts
counts_hate = np.asarray(X_count[labels == 1].sum(axis=0)).ravel()
counts_nonhate = np.asarray(X_count[labels == 0].sum(axis=0)).ravel()
preferred_class = np.where(counts_hate > counts_nonhate, "Hate", "Non-Hate")

# Build dataframe
chi2_df = pd.DataFrame({
    "word": vocab,
    "chi2": chi2_scores,
    "pval": pvals,
    "counts_hate": counts_hate,
    "counts_nonhate": counts_nonhate,
    "preferred_class": preferred_class
}).sort_values("chi2", ascending=False)

Results

In [28]:
chi2_df[chi2_df["preferred_class"] == "Hate"][21:40]

Unnamed: 0,word,chi2,pval,counts_hate,counts_nonhate,preferred_class
1748,seoul,30.362319,3.584235e-08,14,0,Hate
1255,mass,30.362319,3.584235e-08,14,0,Hate
1849,started,28.261857,1.059636e-07,23,7,Hate
483,coz,28.193582,1.09768e-07,13,0,Hate
1367,negativity,27.391171,1.661883e-07,16,2,Hate
266,biu,26.445745,2.710449e-07,17,3,Hate
490,crimes,26.024845,3.370519e-07,12,0,Hate
1583,qrt,26.024845,3.370519e-07,12,0,Hate
1082,jantarmantar,26.024845,3.370519e-07,12,0,Hate
1865,struggle,26.024845,3.370519e-07,12,0,Hate


### Case 2: TF-IDF

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',      # remove common words (optional)
    ngram_range=(1,1),        # unigrams + bigrams
    #min_df=5                   # ignore very rare terms
)


# Computing TF-IDF Matrix
X = vectorizer.fit_transform(df["full_text"])
feature_names = np.array(vectorizer.get_feature_names_out())

hate_idx = df["class"] == "Hate"
nonhate_idx = df["class"] != "Hate"

# Computing mean TF-IDF for each class
mean_hate = X[hate_idx].mean(axis=0).A1
mean_nonhate = X[nonhate_idx].mean(axis=0).A1

diff = mean_hate - mean_nonhate

# Top words
top_n = 60
top_idx = np.argsort(diff)[-top_n:]

top_words = pd.DataFrame({
    "word": feature_names[top_idx],
    "tfidf_diff": diff[top_idx],
    "mean_hate": mean_hate[top_idx],
    "mean_nonhate": mean_nonhate[top_idx]
}).sort_values("tfidf_diff", ascending=False)

In [29]:
top_words

Unnamed: 0,word,tfidf_diff,mean_hate,mean_nonhate
59,hate,0.090646,0.096393,0.005746
58,https,0.015005,0.048358,0.033353
57,love,0.012441,0.017264,0.004822
56,hatespeech,0.0104,0.0104,0.0
55,india,0.006593,0.007157,0.000564
54,video,0.005677,0.007921,0.002244
53,reading,0.005666,0.006076,0.00041
52,hindu,0.005576,0.00573,0.000154
51,delhi,0.005504,0.005504,0.0
50,spreading,0.005001,0.005826,0.000826
