In [27]:
import os, re, pickle, joblib
import numpy as np
import pandas as pd
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import openpyxl

# --- nltk resources (local) ---

# NLTK data this notebook needs, as (lookup path under nltk_data, download package id).
needed = [
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),   # <-- added this one
    ("corpora/stopwords", "stopwords"),
]

# Download a package only when its lookup path cannot be found locally.
for path, pkg in needed:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(pkg)

# English stopword set built from the NLTK stopwords corpus.
STOPWORDS = set(stopwords.words("english"))



In [28]:

# --- file input ---
TEST_FILE = r"C:\Users\PC\CS221\Inference_3_pipeline\test_100_stratified_from_test.csv"
SHEET_NAME = None  # e.g. "Sheet1" when a specific worksheet is wanted

if not TEST_FILE.lower().endswith(".xlsx"):
    # Anything that is not an .xlsx workbook is read as CSV.
    df = pd.read_csv(TEST_FILE)
else:
    # SHEET_NAME may be None, in which case read_excel returns
    # a dict {sheet name: DataFrame} instead of a single frame.
    tmp = pd.read_excel(TEST_FILE, sheet_name=SHEET_NAME)
    if not isinstance(tmp, dict):
        df = tmp
    else:
        # Fall back to the first worksheet.
        first_sheet = list(tmp)[0]
        print("Detected multiple sheets. Using sheet:", first_sheet)
        df = tmp[first_sheet]

print("df type:", type(df))
print("columns:", list(df.columns))

# --- columns ---
TEXT_COL = "Tweet"
LABEL_COL = "Final Votes"  # set to None when the file has no label column

# The text column is mandatory for everything that follows.
assert TEXT_COL in df.columns, f"Thiếu cột '{TEXT_COL}' trong file test!"

# --- nltk resources (local) ---
# nltk.data.find() expects a resource path relative to an nltk_data directory
# (e.g. "tokenizers/punkt"), not a bare package id. Looking up the bare id
# always raises LookupError, which triggered a download attempt on every run.
# punkt_tab is included because newer NLTK releases load it for word_tokenize.
for path, pkg in [
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
    ("corpora/stopwords", "stopwords"),
]:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(pkg)

# English stopword set used by clean_text.
STOPWORDS = set(stopwords.words("english"))

def clean_text(text):
    """Turn one raw tweet into a lower-cased, space-joined string of content words.

    Processing order: strip URLs, @mentions and #hashtags, lower-case,
    tokenize with ``word_tokenize``, then keep only purely alphabetic tokens
    that are not in ``STOPWORDS``.

    Args:
        text: the raw tweet. Non-string values are stringified; missing
            values (None / NaN / pd.NA) are treated as an empty tweet.

    Returns:
        str: the cleaned tokens joined by single spaces ("" if nothing is left).
    """
    # Missing values must not be stringified: str(None) / str(nan) would give
    # the alphabetic tokens "none" / "nan", which pass the filter below and
    # would be fed to the models as if they were real words.
    if text is None or (
        not isinstance(text, str) and pd.api.types.is_scalar(text) and pd.isna(text)
    ):
        return ""

    text = str(text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)                   # Remove mentions
    text = re.sub(r'#\w+', '', text)                   # Remove hashtags
    text = text.lower()                                # Lowercase
    tokens = word_tokenize(text)                       # Tokenize
    filtered_tokens = [
        w for w in tokens
        if w.isalpha() and w not in STOPWORDS          # Remove stopwords + non-alpha
    ]
    return " ".join(filtered_tokens)

# --- apply cleaning ---
df["tweet_clean"] = df[TEXT_COL].apply(clean_text)

print("Data after cleaning:")
print(df[[TEXT_COL, "tweet_clean"]].head())

# List of texts fed to every model.
# fillna must come before astype(str): once a missing value has been cast to
# the string "nan" there is nothing left for fillna("") to replace.
texts = df["tweet_clean"].fillna("").astype(str).tolist()


df type: <class 'pandas.core.frame.DataFrame'>
columns: ['Tweet', 'Final Votes']
Data after cleaning:
                                               Tweet  \
0  RT @BirdGang316: If you a bird throw it up #bi...   
1  RT @TheRaceDraft: Karate bitch https://t.co/Ut...   
2  When ratchet bitches find out the club ain't 1...   
3  RT @trevso_electric: You want a hot body? You ...   
4                    We don't pop out bitch we slide   

                                         tweet_clean  
0                                      rt bird throw  
1                                    rt karate bitch  
2                   ratchet bitches find club ai get  
3  rt want hot body want bugatti better born weal...  
4                                    pop bitch slide  


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# ---------- 1) Paths to the 3 pipeline artifacts ----------
# ML artifacts (from notebook: tfidf.pkl + *_tuned.pkl)
# NOTE(review): ML_DIR is not referenced anywhere in the visible cells.
ML_DIR   = "random_forest_models"  
# Pickled ML pipeline; used as the default pipe_path of predict_ml.
PIPE_ML_PKL = r"C:\Users\PC\CS221\outputs_ML\pipe_rf.pkl"

# RNN artifacts (from notebook: tokenizer_rnn.pkl + *.keras)
# NOTE(review): RNN_DIR, TOKENIZER_PKL and RNN_MODEL_KERAS are not referenced
# in the visible cells; the RNN predictions are read from a CSV instead.
RNN_DIR = "rnn_lstm_models"  
TOKENIZER_PKL   = r"C:\Users\PC\CS221\output_RNN_LSTM\LSTM1D_model\tokenizer_lstm1d.pkl"
RNN_MODEL_KERAS = r"C:\Users\PC\CS221\output_RNN_LSTM\LSTM1D_model\lstm1d_hate_speech.keras"

# BERT artifacts (from notebook: checkpoints/bert_best)
# Directory passed to from_pretrained() for both tokenizer and model in predict_bert.
BERT_DIR = r"C:\Users\PC\CS221\outputs_bert\bert_best"

In [30]:
# Accepted spellings (already stripped and lower-cased) -> canonical report label.
_LABEL_ALIASES = {
    **dict.fromkeys(("hate", "hate_speech", "hate speech", "hatespeech"), "Hate"),
    **dict.fromkeys(
        ("offensive", "offensive_language", "offensive language", "offensivelanguage"),
        "Offensive",
    ),
    **dict.fromkeys(("neither", "neutral", "none", "clean"), "Neither"),
}


def normalize_label(x: str) -> str:
    """Map a raw label onto one of "Hate" / "Offensive" / "Neither".

    The input is stringified, stripped and lower-cased before the lookup.
    A value that matches no known spelling is returned in that stripped,
    lower-cased form.
    """
    key = str(x).strip().lower()
    return _LABEL_ALIASES.get(key, key)


# Class id -> label for the ML model (not referenced in the visible cells).
ID2LABEL_ML = {0: "Hate", 1: "Offensive", 2: "Neither"}

In [31]:
def predict_ml(texts, pipe_path=PIPE_ML_PKL):
    """
    Run the classical-ML model stored at ``pipe_path`` on ``texts``.

    The file is loaded with joblib and is expected to be a single fitted
    sklearn Pipeline that already bundles the tf-idf step and the classifier.

    Return:
      - preds: list[str], every predicted label passed through normalize_label
               (Hate/Offensive/Neither, or the model's own label if unknown)
      - proba: output of predict_proba(texts), or None when the loaded object
               does not expose predict_proba
    """
    model = joblib.load(pipe_path)

    # The pipeline may emit string labels directly (it was trained on string
    # targets), so they are normalized to the spelling used in the report.
    raw_labels = model.predict(texts)

    probabilities = model.predict_proba(texts) if hasattr(model, "predict_proba") else None
    return list(map(normalize_label, raw_labels)), probabilities

In [32]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

def predict_bert(texts, batch_size=32, max_len=256):
    """Classify ``texts`` with the fine-tuned BERT checkpoint stored in BERT_DIR.

    Args:
        texts: sequence of input strings (any iterable; it is copied to a list).
        batch_size: number of texts tokenized and scored per forward pass.
        max_len: maximum token length; longer inputs are truncated.

    Returns:
        (preds, probs):
          - preds: list[str] with one of "Hate" / "Offensive" / "Neither" per text
          - probs: np.ndarray of softmax probabilities, one row per text and
                   one column per class id
    """
    # Class-id order follows the BERT training notebook:
    # label2id = {'hate_speech':0,'offensive_language':1,'neither':2}
    # Built once here instead of once per batch.
    ID2LABEL_BERT = {0: "Hate", 1: "Offensive", 2: "Neither"}

    # A concrete list keeps slicing well-defined for Series / arrays / generators.
    texts = list(texts)
    if not texts:
        # np.vstack([]) raises on an empty list, so return an empty result
        # up front (and skip loading the checkpoint for nothing).
        return [], np.empty((0, len(ID2LABEL_BERT)), dtype=np.float32)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = BertTokenizer.from_pretrained(BERT_DIR)
    model = BertForSequenceClassification.from_pretrained(BERT_DIR)
    model.to(device)
    model.eval()

    all_preds, all_probs = [], []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            batch,
            truncation=True,
            padding=True,
            max_length=max_len,
            return_tensors="pt"
        )
        # Move every encoded tensor to the same device as the model.
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            logits = model(**enc).logits
            probs = torch.softmax(logits, dim=-1).cpu().numpy()
            pred_ids = probs.argmax(axis=1)

        all_preds.extend([ID2LABEL_BERT[int(j)] for j in pred_ids])
        all_probs.append(probs)

    return all_preds, np.vstack(all_probs)

In [34]:
# Sanity check: each prediction list should have one entry per row of df.
# NOTE(review): in the visible cells pred_ml, pred_rnn and pred_bert are only
# assigned in the cell after this one (In [35]), so this cell appears to rely
# on leftover kernel state and would raise NameError on a top-to-bottom run.
print("len(df) =", len(df))
print("len(pred_ml)  =", len(pred_ml))
print("len(pred_rnn) =", len(pred_rnn))
print("len(pred_bert)=", len(pred_bert))


len(df) = 100
len(pred_ml)  = 100
len(pred_rnn) = 99
len(pred_bert)= 100


In [35]:
# %% 
from sklearn.metrics import classification_report, accuracy_score, f1_score
import numpy as np
import pandas as pd

RNN_PREDS_CSV = r"C:\Users\PC\CS221\rnn_preds.csv"   # previously exported RNN predictions

# 1) ML + BERT inference
pred_ml, proba_ml     = predict_ml(texts)
pred_bert, proba_bert = predict_bert(texts)

# 2) Load the RNN predictions from CSV; rows are matched to df by position
rnn_df = pd.read_csv(RNN_PREDS_CSV)

# Try to guess the name of the prediction column in rnn_preds.csv
cand_cols = [c for c in ["pred_RNN", "rnn_pred", "pred", "prediction", "label"] if c in rnn_df.columns]
if len(cand_cols) == 0:
    # Fallback: take the first column
    rnn_col = rnn_df.columns[0]
else:
    rnn_col = cand_cols[0]

# Canonicalize the RNN labels with normalize_label, the same way the ML
# predictions and y_true are. Without this, spellings such as
# "hate_speech" or "neither" never equal "Hate" / "Neither" and are counted
# as errors (and as disagreements) even when the prediction is right.
pred_rnn = [normalize_label(p) for p in rnn_df[rnn_col]]

# Row counts must match because the merge is purely positional
assert len(pred_rnn) == len(df), f"RNN preds rows ({len(pred_rnn)}) != df rows ({len(df)})"

# 3) Build output table: a copy of df plus one prediction column per pipeline
out = df.copy()
for column, values in (
    ("pred_ML", pred_ml),
    ("pred_RNN", pred_rnn),
    ("pred_BERT", pred_bert),
):
    out[column] = values

# Add a normalized ground-truth column when a label column is configured and present
if LABEL_COL and LABEL_COL in out.columns:
    out["y_true"] = out[LABEL_COL].map(normalize_label)

# 4) Save merged predictions
out_path = r"C:\Users\PC\CS221\preds_3pipelines_dataset.csv"
out.to_csv(out_path, index=False)
print("Saved:", out_path)

# 5) Evaluation (only when ground-truth labels are available)
if "y_true" in out.columns:
    y_true = out["y_true"].tolist()
    labels = ["Hate", "Offensive", "Neither"]

    def eval_one(name, y_pred):
        """Print accuracy, macro-F1 and the per-class report for one model.

        Args:
            name: heading printed above the metrics.
            y_pred: predicted labels, aligned row by row with ``y_true``.
        """
        print(f"\n=== {name} ===")
        print("Accuracy:", round(accuracy_score(y_true, y_pred), 4))
        # labels=labels keeps the macro average over the same three classes
        # as the report below; without it, any stray predicted label would
        # be averaged in as an extra class and the two numbers would differ.
        print("Macro-F1:", round(f1_score(y_true, y_pred, labels=labels, average="macro"), 4))
        print(classification_report(y_true, y_pred, labels=labels, digits=4))

    eval_one("ML",   out["pred_ML"].tolist())
    eval_one("RNN",  out["pred_RNN"].tolist())
    eval_one("BERT", out["pred_BERT"].tolist())

# 6) Disagreement analysis (does not need y_true)
# A row counts as agreeing only when all three pipelines predict the same label.
out["all_agree"] = out["pred_ML"].eq(out["pred_RNN"]) & out["pred_RNN"].eq(out["pred_BERT"])
disagree = out.loc[~out["all_agree"]].copy()

disagree_path = r"C:\Users\PC\CS221\disagree_cases_dataset.csv"
disagree.to_csv(disagree_path, index=False)
print("Saved:", disagree_path, "| n_disagree =", len(disagree))


Saved: C:\Users\PC\CS221\preds_3pipelines_dataset.csv

=== ML ===
Accuracy: 0.72
Macro-F1: 0.5725
              precision    recall  f1-score   support

        Hate     0.2857    0.3333    0.3077         6
   Offensive     0.9474    0.7013    0.8060        77
     Neither     0.4444    0.9412    0.6038        17

    accuracy                         0.7200       100
   macro avg     0.5592    0.6586    0.5725       100
weighted avg     0.8222    0.7200    0.7417       100


=== RNN ===
Accuracy: 0.5
Macro-F1: 0.3075
              precision    recall  f1-score   support

        Hate     0.0000    0.0000    0.0000         6
   Offensive     0.7857    0.5714    0.6617        77
     Neither     0.2069    0.3529    0.2609        17

    accuracy                         0.5000       100
   macro avg     0.3309    0.3081    0.3075       100
weighted avg     0.6402    0.5000    0.5538       100


=== BERT ===
Accuracy: 0.95
Macro-F1: 0.8377
              precision    recall  f1-score   supp

In [36]:
from sklearn.metrics import classification_report, accuracy_score, f1_score

# 1) Check that the ground-truth label column and the BERT predictions exist
print("Columns:", list(out.columns))
assert LABEL_COL in out.columns, f"File test không có cột label thật '{LABEL_COL}'"
assert "pred_BERT" in out.columns, "Chưa có cột pred_BERT (chưa chạy predict_bert hoặc chưa merge)"

# 2) Build the normalized y_true column
out["y_true"] = out[LABEL_COL].apply(normalize_label)

# 3) Evaluate BERT on its own (ML/RNN can be printed as well if wanted)
labels = ["Hate", "Offensive", "Neither"]
y_true = out["y_true"].tolist()

def eval_one(name, y_pred):
    """Print accuracy, macro-F1 and the per-class report for one model.

    Args:
        name: heading printed above the metrics.
        y_pred: predicted labels, aligned row by row with ``y_true``.
    """
    print(f"\n=== {name} ===")
    print("Accuracy:", round(accuracy_score(y_true, y_pred), 4))
    # labels=labels keeps the macro average over the same three classes as
    # the report below; without it, any stray predicted label would be
    # averaged in as an extra class and the two numbers would differ.
    print("Macro-F1:", round(f1_score(y_true, y_pred, labels=labels, average="macro"), 4))
    print(classification_report(y_true, y_pred, labels=labels, digits=4))

eval_one("BERT", out["pred_BERT"].tolist())
# eval_one("ML", out["pred_ML"].tolist())
# eval_one("RNN", out["pred_RNN"].tolist())


Columns: ['Tweet', 'Final Votes', 'tweet_clean', 'pred_ML', 'pred_RNN', 'pred_BERT', 'y_true', 'all_agree']

=== BERT ===
Accuracy: 0.95
Macro-F1: 0.8377
              precision    recall  f1-score   support

        Hate     0.6000    0.5000    0.5455         6
   Offensive     0.9615    0.9740    0.9677        77
     Neither     1.0000    1.0000    1.0000        17

    accuracy                         0.9500       100
   macro avg     0.8538    0.8247    0.8377       100
weighted avg     0.9464    0.9500    0.9479       100

