In [1]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Read variables from a local .env file (if one is present) into the process
# environment so the access token does not have to be hardcoded here.
load_dotenv()

# The Hugging Face access token is expected under the TOKEN variable.
token = os.environ.get("TOKEN")

if token:
    # add_to_git_credential=True additionally asks huggingface_hub to hand the
    # token to the git credential helper (see the huggingface_hub login docs).
    login(token=token, add_to_git_credential=True)
    print("Logged in successfully.")
else:
    # Covers both an unset variable and an empty string.
    raise ValueError("Token not found in .env")


Logged in successfully.


In [None]:
import pandas as pd

from pathlib import Path

# Remote TSV source and the local Parquet copy used by the rest of the notebook.
source_url = "hf://datasets/irlab-udc/metahate/available_metahate.tsv"
parquet_path = Path("Data/available_metahate.parquet")

if parquet_path.exists():
    # Re-running the notebook should not download the full TSV again;
    # delete the Parquet file to force a fresh download.
    df = pd.read_parquet(parquet_path)
    print(f"Parquet already present at {parquet_path}; skipped download.")
else:
    df = pd.read_csv(source_url, sep="\t")

    # Save as fast Parquet file. The target directory is created first because
    # to_parquet does not create missing parent directories.
    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(parquet_path)

    print("Done. Parquet saved at Data/available_metahate.parquet")




Done. Parquet saved at Data/available_metahate.parquet


In [5]:
from datasets import load_dataset

# Load the locally saved Parquet file through the generic "parquet" builder.
# The recorded output shows this yields a DatasetDict with one "train" split.
parquet_file = "Data/available_metahate.parquet"
dataset = load_dataset("parquet", data_files=parquet_file)

print(dataset)



Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1101165
    })
})


1. Dataset Overview

In [6]:
# Work with the "train" split from here on.
ds = dataset["train"]

# Size and schema of the split.
n_rows, schema = len(ds), ds.features
print("Rows:", n_rows)
print("Features:", schema)


Rows: 1101165
Features: {'label': Value('int64'), 'text': Value('string')}


2. Missing value analysis (text-only)

In [7]:
import pandas as pd
# Convert the split to a pandas DataFrame for the exploratory steps below.
df = ds.to_pandas()

# Number of missing values in each column.
missing_per_column = df.isnull().sum()
print(missing_per_column)


label    0
text     1
dtype: int64


3. Label distribution (numbers + percentages)

In [8]:
# Absolute and relative frequency of each label value.
labels = df['label']
counts = labels.value_counts()
percent = labels.value_counts(normalize=True).mul(100)

print("Counts:\n", counts)
print("\nPercentage (%):\n", percent.round(2))


Counts:
 label
0    867876
1    233289
Name: count, dtype: int64

Percentage (%):
 label
0    78.81
1    21.19
Name: proportion, dtype: float64


4. Text Length Analysis (summary stats)

In [9]:
# Character count per text. Rows with a missing text get NaN here, which is
# why describe() reports one fewer observation than the total row count.
text_lengths = df["text"].str.len()
df["text_length"] = text_lengths

print(df["text_length"].describe())


count    1.101164e+06
mean     2.738461e+02
std      5.131642e+02
min      1.000000e+00
25%      7.500000e+01
50%      1.340000e+02
75%      2.710000e+02
max      2.003000e+04
Name: text_length, dtype: float64


5. Common words (text-only frequency list)

In [10]:
from collections import Counter
import re

def tokenize(text):
    """Split a text into lowercase word tokens.

    Tokens are maximal runs of word characters (``\\w``), so punctuation is
    dropped and contractions split apart (``"don't"`` -> ``["don", "t"]``).

    Args:
        text: The string to tokenize. Non-string values such as ``None`` or
            ``NaN`` (how pandas represents a missing text) are accepted.

    Returns:
        A list of lowercase tokens; an empty list for non-string input.
    """
    # The dataset contains a missing text value; without this guard a sample
    # that happens to draw it would fail on ``.lower()``.
    if not isinstance(text, str):
        return []
    return re.findall(r"\b\w+\b", text.lower())

# sample 50k rows for speed; the fixed seed keeps the sample reproducible
sampled_texts = df['text'].sample(50000, random_state=42)

# Count every token across the sampled texts.
counter = Counter(
    token
    for sampled_text in sampled_texts
    for token in tokenize(sampled_text)
)

print(counter.most_common(50))  # top 50 words


[('the', 93805), ('to', 58366), ('i', 48886), ('a', 46947), ('and', 45362), ('of', 43056), ('you', 42888), ('is', 35536), ('that', 31614), ('it', 29818), ('in', 29072), ('for', 20189), ('this', 18816), ('t', 18218), ('not', 17706), ('on', 17379), ('s', 16666), ('be', 16312), ('are', 15678), ('as', 14113), ('have', 13788), ('your', 12678), ('with', 12434), ('if', 11057), ('was', 10765), ('but', 10548), ('or', 10012), ('my', 9388), ('article', 8964), ('an', 8669), ('they', 8230), ('so', 7950), ('can', 7939), ('do', 7874), ('from', 7796), ('by', 7762), ('me', 7760), ('at', 7730), ('what', 7582), ('like', 7456), ('all', 7455), ('about', 7448), ('he', 7187), ('wikipedia', 7082), ('user', 6774), ('there', 6772), ('page', 6668), ('just', 6610), ('will', 6516), ('no', 6202)]


6. Compare words used in hate vs. non-hate texts

In [12]:
# Text column split by label (1 = hate, 0 = non-hate). Despite the `_df`
# suffix, both results are Series of texts.
is_hate = df['label'] == 1
is_nonhate = df['label'] == 0
hate_df = df.loc[is_hate, "text"]
nonhate_df = df.loc[is_nonhate, "text"]

def build_counter(text_series, sample_size=50000, random_state=42):
    """Count token frequencies over a random sample of texts.

    Args:
        text_series: pandas Series of texts to sample from.
        sample_size: Maximum number of texts to sample; the whole series is
            used when it holds fewer entries. Defaults to 50000.
        random_state: Seed passed to ``Series.sample`` so the same texts are
            drawn on every run. Defaults to 42.

    Returns:
        A ``Counter`` mapping each token (as produced by ``tokenize``) to its
        number of occurrences in the sampled texts.
    """
    n_draw = min(sample_size, len(text_series))
    sampled = text_series.sample(n_draw, random_state=random_state)

    c = Counter()
    for t in sampled:
        # Missing texts show up as None/NaN; skip them rather than failing
        # inside the tokenizer. Skipping here (instead of dropping before
        # sampling) keeps the drawn sample unchanged.
        if isinstance(t, str):
            c.update(tokenize(t))
    return c

# Token frequencies for each class, each from its own sample.
hate_words = build_counter(hate_df)
nonhate_words = build_counter(nonhate_df)

# Show the 30 most frequent tokens of each class for comparison.
top_hate = hate_words.most_common(30)
top_nonhate = nonhate_words.most_common(30)
print("Top hate words:", top_hate)
print("\nTop non-hate words:", top_nonhate)


Top hate words: [('you', 48507), ('the', 45339), ('a', 39708), ('i', 39045), ('to', 32943), ('and', 30072), ('is', 24364), ('of', 22539), ('that', 17330), ('it', 16083), ('are', 15965), ('t', 15298), ('in', 14713), ('fuck', 12006), ('your', 11824), ('s', 11510), ('this', 11288), ('for', 10736), ('not', 9911), ('on', 8856), ('my', 8676), ('with', 8523), ('be', 8311), ('have', 8231), ('they', 8077), ('me', 7665), ('nigger', 7579), ('like', 7557), ('user', 7543), ('fucking', 7084)]

Top non-hate words: [('the', 110572), ('to', 66529), ('i', 53351), ('and', 50306), ('a', 49605), ('of', 49531), ('you', 42876), ('is', 39245), ('that', 36199), ('it', 33870), ('in', 33282), ('for', 23501), ('this', 21353), ('not', 20453), ('on', 19958), ('be', 18837), ('t', 18805), ('s', 18291), ('as', 16705), ('are', 16364), ('have', 15959), ('with', 13689), ('if', 12862), ('was', 12519), ('your', 12229), ('but', 12099), ('or', 11684), ('article', 10844), ('my', 10359), ('an', 9209)]


In [23]:
# Peek at the first five records of the train split; slicing returns a dict
# of column name -> list of values.
dataset["train"][:5]

{'label': [0, 0, 0, 0, 0],
 'text': ["!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...",
  '!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!',
  '!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit',
  '!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny',
  '!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;']}