In [1]:
import pandas as pd
import re
import string
from collections import Counter
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load Corpus

In [22]:
data_path = "data.csv"
df = pd.read_csv(data_path)

# Force parse CreateDate with timezone awareness (UTC)
df["year"] = pd.to_datetime(df["CreateDate"], errors="coerce", utc=True).dt.year

# Drop rows with invalid dates (if any)
df = df.dropna(subset=["CreateDate"]).reset_index(drop=True)

def assign_class_simple(row):
    if (row.get("Biased") + row.get("Calling_Out")) > 0:
        return "Hate"

    return "Non-Hate"

df["class"] = df.apply(assign_class_simple, axis=1)

# Some tweets are duplicated as they target multiple groups
df = df.drop_duplicates(subset="TweetID", keep="first").copy()
df = df.drop_duplicates(subset="Text", keep="first").copy()

### Case 1: Chi-Squared
For this case we will do some preprocessing for better results

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
import numpy as np
import spacy
import re

nlp = spacy.load("en_core_web_sm")


def preprocess(text):
    text = re.sub(r'[^a-z\s]', '', text.lower())
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if 
                     (token.pos_ == "NOUN" or token.pos_ == "ADJ") and 
                     not token.is_stop and 
                     len(token) > 2 and
                     not re.search(r"\d", token.text)]) # The token does not contain digits


chi2_results = {}

for year, df_y in df.groupby("year"):
    texts = df_y["Text"].astype(str).tolist()
    labels = (df_y["class"] == "Hate").astype(int).values  # 1 = Hate, 0 = Non-Hate


    # Vectorize with adaptive min_df
    min_df_val = min(5, max(1, len(df_y)//10))
    count_vec = CountVectorizer(min_df=min_df_val, ngram_range=(1,1))
    X_count = count_vec.fit_transform(texts)

    vocab = np.array(count_vec.get_feature_names_out())

    # Chi² test
    chi2_scores, pvals = chi2(X_count, labels)

    # Compute per-class counts
    counts_hate = np.asarray(X_count[labels == 1].sum(axis=0)).ravel()
    counts_nonhate = np.asarray(X_count[labels == 0].sum(axis=0)).ravel()
    preferred_class = np.where(counts_hate > counts_nonhate, "Hate", "Non-Hate")

    # Build dataframe
    chi2_df = pd.DataFrame({
        "word": vocab,
        "chi2": chi2_scores,
        "pval": pvals,
        "counts_hate": counts_hate,
        "counts_nonhate": counts_nonhate,
        "preferred_class": preferred_class
    }).sort_values("chi2", ascending=False)

    chi2_results[year] = chi2_df

Results

In [26]:
for year in chi2_results.keys():
    print(f"Top Hate-indicative words for {year}:")
    top_hate_words = chi2_results[year][
        chi2_results[year]["preferred_class"] == "Hate"
    ].head(20)
    print(top_hate_words[["word","chi2","pval","counts_hate","counts_nonhate"]])
    print("\n")

Top Hate-indicative words for 2020:
              word       chi2          pval  counts_hate  counts_nonhate
1343        racism  40.082067  2.435143e-10           70              49
1720       unarmed  29.145497  6.714161e-08           17               2
923         killed  24.670592  6.801413e-07           51              40
1344        racist  24.131098  8.999516e-07           67              62
888          jihad  23.594796  1.189087e-06           11               0
118   antisemitism  22.892573  1.713125e-06           14               2
1626    terrorists  22.892573  1.713125e-06           14               2
397           cops  22.117283  2.564902e-06           29              16
925        killing  20.184521  7.031967e-06           24              12
901          jokes  18.767296  1.476779e-05           12               2
610           feel  18.474315  1.722096e-05           20               9
1612      targeted  16.995391  3.747067e-05           16               6
1757         vi

### Case 2: TF-IDF

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',      # remove common words (optional)
    ngram_range=(1, 2),        # unigrams + bigrams
    #min_df=5                   # ignore very rare terms
)

results = []

for year, df_year in df.groupby("year"):

    # Computing TF-IDF Matrix
    X = vectorizer.fit_transform(df_year["Text"])
    feature_names = np.array(vectorizer.get_feature_names_out())

    hate_idx = df_year["class"] == "Hate"
    nonhate_idx = df_year["class"] == "Non-Hate"

    # Computing mean TF-IDF for each class
    mean_hate = X[hate_idx].mean(axis=0).A1
    mean_nonhate = X[nonhate_idx].mean(axis=0).A1

    diff = mean_hate - mean_nonhate

    # Top words
    top_n = 60
    top_idx = np.argsort(diff)[-top_n:]

    top_words = pd.DataFrame({
        "year": year,
        "word": feature_names[top_idx],
        "tfidf_diff": diff[top_idx],
        "mean_hate": mean_hate[top_idx],
        "mean_nonhate": mean_nonhate[top_idx]
    }).sort_values("tfidf_diff", ascending=False)

    results.append((top_words, year))

In [30]:
top_words

Unnamed: 0,year,word,tfidf_diff,mean_hate,mean_nonhate
59,2022,muslims,0.005927,0.017933,0.012006
58,2022,racism,0.004301,0.006566,0.002265
57,2022,racist,0.003685,0.006978,0.003292
56,2022,white,0.003636,0.010724,0.007088
55,2022,india,0.003385,0.005505,0.002119
54,2022,tacos,0.003096,0.003378,0.000282
53,2022,china,0.002848,0.003218,0.00037
52,2022,antisemitic,0.002729,0.003409,0.000681
51,2022,muslim,0.002692,0.005767,0.003075
50,2022,hindu,0.002686,0.004346,0.001661
