In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers.pipelines import pipeline
from huggingface_hub import snapshot_download

# Resolve a local snapshot directory for the classifier repo and show where it landed.
local_dir = snapshot_download(repo_id="tomh/toxigen_hatebert", local_files_only=False)
print(local_dir)

# The tokenizer comes from a separate repo; once its snapshot is on disk it is
# loaded with local_files_only=True, i.e. from the downloaded files only.
tok_dir = snapshot_download("google-bert/bert-base-uncased", local_files_only=False)
tok = AutoTokenizer.from_pretrained(tok_dir, use_fast=True, local_files_only=True)

# Build the text-classification pipeline that the later cells call as `pipe`.
mdl = AutoModelForSequenceClassification.from_pretrained(local_dir)
pipe = pipeline(task="text-classification", model=mdl, tokenizer=tok)

# Smoke test: run one sentence through the pipeline and print the raw output.
smoke_test_output = pipe("This is a test.")
print(f'The result of testing: {smoke_test_output}')

In [None]:
import numpy as np
import pandas as pd

# Score the "latent hatred" dataset: one score per original query (`temp_q`),
# one score per counterfactual substitution (`counter_sub`), then the per-row
# score shifts (`sigma_q_e`) and their variance (`theta_cf`).
# NOTE(review): this cell repeats the same steps as the other dataset cells in
# this notebook; consider factoring them into one function with parameters.

# NOTE(security): unpickling can execute arbitrary code embedded in the file;
# only load pickles from a dataset you trust.
df = pd.read_pickle('/kaggle/input/implicit-hate-speech-on-latent-hatred/ready_data.pkl')

# `truncation=True` is forwarded to the tokenizer so over-long inputs are
# clipped to its maximum length.
# NOTE(review): with the pipeline's default settings, "score" is the confidence
# of the *predicted* label, not the probability of one fixed class — confirm
# this is the intended quantity before interpreting score differences.
temp_outs = pipe(df["temp_q"].tolist(), truncation=True)
df["temp_q_prob"] = [o["score"] for o in temp_outs]

# counter_sub (list[str] per row): flatten → score → regroup
lens = df["counter_sub"].apply(len).tolist()
flat = [t for lst in df["counter_sub"] for t in lst]
outs = pipe(flat, truncation=True) if flat else []
scores = [o["score"] for o in outs]
# Regrouping below consumes exactly sum(lens) scores; fail loudly on a mismatch.
assert len(scores) == sum(lens), "pipeline returned an unexpected number of scores"

it = iter(scores)
df["counter_sub_probs"] = [[next(it) for _ in range(n)] for n in lens]

cols = ["post", "target", "implied_statement", "temp_q", "counter_sub", "temp_q_prob", "counter_sub_probs"]
df_small = df.loc[:, [c for c in cols if c in df.columns]].copy()
df_small = df_small.rename(columns={
    "temp_q_prob": "enr_parsed",
    "counter_sub_probs": "cs_q_e_parsed"
})

# Per-row score shift of each substitution relative to the original query.
# Rows whose scores are not a list of plain numbers get None.
df_small["sigma_q_e"] = [
    (
        [float(x) - float(base) for x in subs]
        if isinstance(subs, list)
        and all(isinstance(x, (int, float)) for x in subs)
        else None
    )
    for base, subs in zip(df_small["enr_parsed"], df_small["cs_q_e_parsed"])
]

# Variance of the shifts. Rows with no usable shifts (None or an empty list)
# get NaN: np.var(None) would raise and np.var([]) would only warn.
df_small["theta_cf"] = [
    np.var(xs) if xs else np.nan for xs in df_small["sigma_q_e"]
]

df_small.to_pickle('ready_data_for_latent_hatred.pkl')
df_small.head(5)

In [None]:
# Score the "offensive slang" dataset: one score per original query (`temp_q`),
# one score per counterfactual substitution (`counter_sub`), then the per-row
# score shifts (`sigma_q_e`) and their variance (`theta_cf`).
# NOTE(review): this cell repeats the same steps as the other dataset cells in
# this notebook; consider factoring them into one function with parameters.

# NOTE(security): unpickling can execute arbitrary code embedded in the file;
# only load pickles from a dataset you trust.
df = pd.read_pickle('/kaggle/input/implicit-speech-on-offensive-slang/ready_data.pkl')

# `truncation=True` is forwarded to the tokenizer so over-long inputs are
# clipped to its maximum length.
# NOTE(review): with the pipeline's default settings, "score" is the confidence
# of the *predicted* label, not the probability of one fixed class — confirm
# this is the intended quantity before interpreting score differences.
temp_outs = pipe(df["temp_q"].tolist(), truncation=True)
df["temp_q_prob"] = [o["score"] for o in temp_outs]

# counter_sub (list[str] per row): flatten → score → regroup
lens = df["counter_sub"].apply(len).tolist()
flat = [t for lst in df["counter_sub"] for t in lst]
outs = pipe(flat, truncation=True) if flat else []
scores = [o["score"] for o in outs]
# Regrouping below consumes exactly sum(lens) scores; fail loudly on a mismatch.
assert len(scores) == sum(lens), "pipeline returned an unexpected number of scores"

it = iter(scores)
df["counter_sub_probs"] = [[next(it) for _ in range(n)] for n in lens]

cols = ["text", "target_group", "temp_q", "counter_sub", "temp_q_prob", "counter_sub_probs"]
df_small = df.loc[:, [c for c in cols if c in df.columns]].copy()
df_small = df_small.rename(columns={
    "temp_q_prob": "enr_parsed",
    "counter_sub_probs": "cs_q_e_parsed"
})

# Per-row score shift of each substitution relative to the original query.
# Rows whose scores are not a list of plain numbers get None.
df_small["sigma_q_e"] = [
    (
        [float(x) - float(base) for x in subs]
        if isinstance(subs, list)
        and all(isinstance(x, (int, float)) for x in subs)
        else None
    )
    for base, subs in zip(df_small["enr_parsed"], df_small["cs_q_e_parsed"])
]

# Variance of the shifts. Rows with no usable shifts (None or an empty list)
# get NaN: np.var(None) would raise and np.var([]) would only warn.
df_small["theta_cf"] = [
    np.var(xs) if xs else np.nan for xs in df_small["sigma_q_e"]
]

df_small.to_pickle('ready_data_for_offensive_slang.pkl')
df_small.head(5)

In [None]:
# Score the "toxigen" dataset: one score per original query (`temp_q`),
# one score per counterfactual substitution (`counter_sub`), then the per-row
# score shifts (`sigma_q_e`) and their variance (`theta_cf`).
# NOTE(review): this cell repeats the same steps as the other dataset cells in
# this notebook; consider factoring them into one function with parameters.

# NOTE(security): unpickling can execute arbitrary code embedded in the file;
# only load pickles from a dataset you trust.
df = pd.read_pickle('/kaggle/input/implicit-hate-speech-on-toxigen/ready_data.pkl')

# `truncation=True` is forwarded to the tokenizer so over-long inputs are
# clipped to its maximum length.
# NOTE(review): with the pipeline's default settings, "score" is the confidence
# of the *predicted* label, not the probability of one fixed class — confirm
# this is the intended quantity before interpreting score differences.
temp_outs = pipe(df["temp_q"].tolist(), truncation=True)
df["temp_q_prob"] = [o["score"] for o in temp_outs]

# counter_sub (list[str] per row): flatten → score → regroup
lens = df["counter_sub"].apply(len).tolist()
flat = [t for lst in df["counter_sub"] for t in lst]
outs = pipe(flat, truncation=True) if flat else []
scores = [o["score"] for o in outs]
# Regrouping below consumes exactly sum(lens) scores; fail loudly on a mismatch.
assert len(scores) == sum(lens), "pipeline returned an unexpected number of scores"

it = iter(scores)
df["counter_sub_probs"] = [[next(it) for _ in range(n)] for n in lens]

cols = ["text", "target_group", "temp_q", "counter_sub", "temp_q_prob", "counter_sub_probs"]
df_small = df.loc[:, [c for c in cols if c in df.columns]].copy()
df_small = df_small.rename(columns={
    "temp_q_prob": "enr_parsed",
    "counter_sub_probs": "cs_q_e_parsed"
})

# Per-row score shift of each substitution relative to the original query.
# Rows whose scores are not a list of plain numbers get None.
df_small["sigma_q_e"] = [
    (
        [float(x) - float(base) for x in subs]
        if isinstance(subs, list)
        and all(isinstance(x, (int, float)) for x in subs)
        else None
    )
    for base, subs in zip(df_small["enr_parsed"], df_small["cs_q_e_parsed"])
]

# Variance of the shifts. Rows with no usable shifts (None or an empty list)
# get NaN: np.var(None) would raise and np.var([]) would only warn.
df_small["theta_cf"] = [
    np.var(xs) if xs else np.nan for xs in df_small["sigma_q_e"]
]

df_small.to_pickle('ready_data_for_toxigen.pkl')
df_small.head(5)