In [1]:
import pandas as pd
import re
import string
from collections import Counter
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# English stop words, held in a set.
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load Corpus

In [2]:
data_path = "v1.tsv"
df = pd.read_csv(data_path, sep="\t")

# Keep only rows whose language code is 1 (English, per the original note).
# .copy() detaches the filtered rows from the frame returned by read_csv, so
# the column assignments below write to an independent DataFrame instead of
# raising pandas' SettingWithCopyWarning.
df = df[df["language"] == 1].copy()

# Parse the date column; values that cannot be parsed become NaT and those
# rows are dropped. The second .copy() keeps later column assignments
# (year, quarter, and any added afterwards) warning-free as well.
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=["date"]).copy()

# Calendar year and quarter derived from the parsed date.
df["year"] = df["date"].dt.year
df["quarter"] = df["date"].dt.quarter

# Assigning class
def assign_class(row):
    """Pick the class label for one annotated row by majority vote.

    The row is indexed with the keys "label_0", "label_1" and "label_2",
    which hold the votes for Normal, Offensive and Hate respectively. The
    class with the highest vote wins; ties go to the more severe class
    (Hate > Offensive > Normal).

    Returns one of the strings "Hate", "Offensive" or "Normal".
    """
    normal_votes = row["label_0"]
    offensive_votes = row["label_1"]
    hate_votes = row["label_2"]
    top_vote = max([normal_votes, offensive_votes, hate_votes])

    # Check from most to least severe so that ties resolve toward severity.
    if hate_votes == top_vote:
        return "Hate"
    if offensive_votes == top_vote:
        return "Offensive"
    return "Normal"

# Label every row with its vote-based class; assign_class is called once per row.
df["class"] = df.apply(assign_class, axis="columns")

  exec(code_obj, self.user_global_ns, self.user_ns)


### Case 1: PMI
For this case, we will apply some preprocessing to obtain better results.

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
import numpy as np
import spacy
import re

# spaCy pipeline used by preprocess() below for POS tags, lemmas and stop-word flags.
nlp = spacy.load("en_core_web_sm")


# Container for the per-year chi2 rankings.
# NOTE(review): this name is bound to a fresh dict again just before the
# per-year loop later in this cell, so this first assignment looks redundant.
chi2_results = {}

def preprocess(text):
    """Reduce a text to the lemmas of its short nouns and adjectives.

    The text is lower-cased and every character other than a-z or
    whitespace is removed before it is run through the spaCy pipeline
    ``nlp``. A token is kept only if it is tagged NOUN or ADJ, is not a stop
    word, has a length of 3 to 6 and contains no digit. The lemmas of the
    kept tokens are returned joined by single spaces.
    """
    cleaned = re.sub(r'[^a-z\s]', '', text.lower())

    def is_content_token(token):
        # Guard clauses evaluated in the same order as the conditions they replace.
        if token.pos_ not in ("NOUN", "ADJ"):
            return False
        if token.is_stop:
            return False
        if not 2 < len(token) < 7:
            return False
        # The token does not contain digits
        return re.search(r"\d", token.text) is None

    lemmas = []
    for token in nlp(cleaned):
        if is_content_token(token):
            lemmas.append(token.lemma_)
    return " ".join(lemmas)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
import pandas as pd
import numpy as np

chi2_results = {}  # year -> DataFrame of vocabulary terms ranked by chi2 score

# NOTE(review): the raw "text" column is vectorized here; the preprocess()
# helper defined earlier in this cell is never applied. Confirm that is intended.
for year, df_y in df.groupby("year"):
    texts = df_y["text"].astype(str).tolist()
    labels = (df_y["class"] == "Hate").astype(int).values  # 1 = Hate, 0 = Non-Hate

    # Skip if only one class is present
    if len(set(labels)) < 2:
        print(f"Skipping year {year}: only one class present")
        continue

    # Vectorize with adaptive min_df: a term must occur in at least 10% of the
    # year's documents (integer division), clamped to the range 1..5.
    min_df_val = min(5, max(1, len(df_y)//10))
    count_vec = CountVectorizer(min_df=min_df_val, ngram_range=(1,1))
    try:
        X_count = count_vec.fit_transform(texts)
    except ValueError:
        # CountVectorizer does not return a zero-column matrix when no term
        # survives tokenisation / min_df pruning; it raises ValueError.
        # Catching it lets a sparse year be skipped instead of aborting the
        # whole loop.
        print(f"Skipping year {year}: no features after vectorization")
        continue

    vocab = np.array(count_vec.get_feature_names_out())

    # Chi² test of each term's counts against the binary Hate / Non-Hate label
    chi2_scores, pvals = chi2(X_count, labels)

    # Compute per-class counts (total occurrences of each term per class)
    counts_hate = np.asarray(X_count[labels == 1].sum(axis=0)).ravel()
    counts_nonhate = np.asarray(X_count[labels == 0].sum(axis=0)).ravel()
    # NOTE(review): this compares raw totals, not rates normalised by the
    # number of documents in each class, so a term is tagged "Hate" only when
    # it occurs more often in Hate texts in absolute terms.
    preferred_class = np.where(counts_hate > counts_nonhate, "Hate", "Non-Hate")

    # Build dataframe, strongest association first
    chi2_df = pd.DataFrame({
        "word": vocab,
        "chi2": chi2_scores,
        "pval": pvals,
        "counts_hate": counts_hate,
        "counts_nonhate": counts_nonhate,
        "preferred_class": preferred_class
    }).sort_values("chi2", ascending=False)

    chi2_results[year] = chi2_df

# Example: the 20 highest-chi2 words for 2020 whose preferred class is Hate
year_example = 2020
top_hate_words = (
    chi2_results[year_example]
    .loc[lambda ranked: ranked["preferred_class"] == "Hate"]
    .head(20)
)

# Print the ranking without the constant preferred_class column.
print(top_hate_words[["word","chi2","pval","counts_hate","counts_nonhate"]])



                        word        chi2          pval  counts_hate  \
2329                   kurds  128.647972  8.097924e-30           33   
4170                 traitor   50.529520  1.173858e-12           12   
1425       erdogankillskurds   49.786561  1.714140e-12            6   
4217  turkeyinvadeskurdistan   48.852592  2.759405e-12           10   
2326                    kurd   44.012844  3.262281e-11            8   
3475                     rss   41.795103  1.013570e-10            9   
460                 bastards   26.846580  2.202637e-07            5   
4475                    wipe   25.148367  5.308459e-07            7   
2776                   nazis   25.148367  5.308459e-07            7   
3583            senatemajldr   24.976936  5.802025e-07            4   
1477                executed   22.316079  2.312620e-06            5   
2735               murdering   19.541037  9.846162e-06            4   
2101                increase   18.819137  1.437178e-05            5   
1560  

In [24]:
# All 2018 terms whose preferred class is Hate, in the table's chi2 order.
chi2_results[2018].loc[lambda ranked: ranked["preferred_class"] == "Hate"]

Unnamed: 0,word,chi2,pval,counts_hate,counts_nonhate,preferred_class
1416,kurds,67.920732,1.702021e-16,14,13,Hate
1861,pkk,55.680713,8.525057e-14,5,0,Hate
2537,treason,44.748894,2.239934e-11,5,1,Hate
2560,turkeyinvadeskurdistan,26.648925,2.439864e-07,5,4,Hate
2528,traitor,26.648925,2.439864e-07,5,4,Hate
1833,pedophile,22.140923,2.53351e-06,4,3,Hate
220,asset,22.140923,2.53351e-06,4,3,Hate
2704,whoever,17.716895,2.563398e-05,3,2,Hate
2686,welcome,17.716895,2.563398e-05,3,2,Hate
63,abusing,17.716895,2.563398e-05,3,2,Hate
