<a href="https://colab.research.google.com/github/DQYisHangry/NegativeChatsDetection/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install -U transformers



In [None]:
import os
import re
import jieba
import numpy as np
import pandas as pd

import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

In [None]:
# Preprocessing the dataset
df = pd.read_csv('/content/data/train.csv', encoding="utf-8")

# Dropping duplicate
df = df.drop_duplicates(subset="text").reset_index(drop=True)

# Printing out some basic information
print(f"Total: {len(df)}，negative data ratio: {df.label.mean():.3f}")
print(df.head(10))

# Adding keywords. Some common words in negative game chat.
keywords = [
    # Behavioral行为类
    "挂机", "送人头", "投降", "演员", "掉线", "挂机中", "挂机了", "投",
    "摆烂", "开摆", "练英雄", "秒了", "20投", "别送了",

    # Hatespeech侮辱类
    "蠢", "菜", "尼玛", "你妹", "tm", "nm", "废物", "垃圾", "nt", "sb",
    "脑残", "手残", "眼瞎",

    # Gaming words游戏术语类
    "举报了", "不会玩", "坑货", "演员",

    # Discourse marker语气词
    "呵", "喂", "崩", "?"]

def contains_keyword(text):
    return int(any(word in text for word in keywords))
df["keyword_flag"] = df["text"].apply(contains_keyword)
df["text_len"] = df["text"].apply(len)
df["excl_count"] = df["text"].apply(lambda x: x.count('!'))
df["ques_count"] = df["text"].apply(lambda x: x.count('?'))

# Stratified Train/Validation Split (based on label + keyword)
# We want to ensure both `label` (0/1) and `keyword_flag` (presence of keywords)
# are evenly distributed in training and validation sets.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df[["label", "keyword_flag"]],
    random_state=42
)


columns_to_keep = ["text", "label", "keyword_flag"]

# Convert to HuggingFace Dataset from pandas
train_ds = Dataset.from_pandas(train_df[columns_to_keep])
val_ds   = Dataset.from_pandas(val_df[columns_to_keep])

# Rename label column as required by HuggingFace Trainer API
train_ds = train_ds.rename_column("label", "labels")
val_ds   = val_ds.rename_column("label", "labels")

Total: 59951，negative data ratio: 0.378
      qid                text  label
0  100001    我去送了个人头，结果啥也没那到。      1
1  100002         我送人头给你们发育发育      1
2  100003            我送你爷爷们多好      1
3  100004         我送你一个黄金分割率。      1
4  100005          我现在非常想送人头。      1
5  100006           吕布你皮肤送给我吧      0
6  100007          一个个去送我能跟上？      1
7  100008             讼我鲁班的皮肤      0
8  100009  我觉得我能够送后羿，你越比我都能送。      1
9  100010         你这都能死你让我怎么演      1


In [None]:
# Load pre-trained BERT tokenizer (Chinese base model)
MODEL_NAME = "bert-base-chinese"
tokenizer  = AutoTokenizer.from_pretrained(MODEL_NAME)

# 定义 jieba 分词 + 清洗.
# Define preprocessing function using Jieba
# This helps segment Chinese words properly before tokenization

def jieba_preprocess(text):
    # Keep only Chinese characters, replace others with space
    cleaned = re.sub(r"[^\u4e00-\u9fa5]", " ", str(text)).lower()
    # Apply jieba tokenization and return space-separated string
    return " ".join(jieba.lcut(cleaned))

# This function will be applied to HuggingFace Datasets using .map()
def preprocess_fn(examples):
  # Applying jieba on each text in the batch
    processed_texts = [jieba_preprocess(t) for t in examples["text"]]
    return tokenizer(
        processed_texts,
        padding="max_length",
        truncation=True,
        max_length=20 #The length for each data is short. The average is about 15
    )

# Map preprocessing function to dataset (parallelized)
# This will automatically add input_ids, attention_mask, etc.
train_ds = train_ds.map(preprocess_fn, batched=True, batch_size=512, num_proc=4)
val_ds   = val_ds.map(preprocess_fn,   batched=True, batch_size=512, num_proc=4)

# Set dataset format to PyTorch tensors
# HuggingFace Trainer expects these keys as torch.Tensor
train_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
val_ds.set_format(  type="torch", columns=["input_ids","attention_mask","labels"])


Map (num_proc=4):   0%|          | 0/53955 [00:00<?, ? examples/s]

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.830 seconds.
DEBUG:jieba:Loading model cost 0.830 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict

Map (num_proc=4):   0%|          | 0/5996 [00:00<?, ? examples/s]

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.843 seconds.
DEBUG:jieba:Loading model cost 0.843 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict

In [None]:

training_args = TrainingArguments(
    output_dir="/kaggle/working/bert_out",
    run_name="game_chat_bert",

    # Training evaluation
    do_train=True,
    do_eval=True,
    num_train_epochs=5, #I'm setting epoch to five.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,

    learning_rate=2e-5,
    weight_decay=0.01, #Weight decay for regularization

    logging_steps=200,
    save_steps=500,
    save_total_limit=2,

    fp16=True
)

In [None]:
# Define Evaluation Metrics and Load Pretrained Model

from sklearn.metrics import f1_score, precision_score, recall_score

# Define the evaluation metric function for the Trainer
# This will be automatically called during evaluation after each eval_steps
# We use F1 score as the primary metric since the dataset is slightly imbalanced
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "f1":      f1_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall":    recall_score(labels, preds),
    }

# Load pretrained BERT model for sequence classification (binary)
# This loads "bert-base-chinese" and adds a classification head for 2 classes
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2   # Binary classification (label 0 or 1)
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mqianyundeng719[0m ([33mqianyundeng719-university-of-arizona[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
200,0.6054
400,0.5238
600,0.5279
800,0.5041
1000,0.5023
1200,0.4996
1400,0.493
1600,0.4813
1800,0.4823
2000,0.4791


TrainOutput(global_step=8435, training_loss=0.43846812268015756, metrics={'train_runtime': 444.0494, 'train_samples_per_second': 607.534, 'train_steps_per_second': 18.996, 'total_flos': 2772686912490000.0, 'train_loss': 0.43846812268015756, 'epoch': 5.0})

In [None]:
# Checking metrics
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.577152669429779, 'eval_f1': 0.6187299234000494, 'eval_precision': 0.7033707865168539, 'eval_recall': 0.5522717247463609, 'eval_runtime': 1.6375, 'eval_samples_per_second': 3661.589, 'eval_steps_per_second': 57.403, 'epoch': 5.0}


In [None]:
# Post-training Optimization: LightGBM Meta-classifier

from scipy.special import softmax
import lightgbm as lgb
from sklearn.metrics import f1_score, classification_report


# Predict on validation set using fine-tuned BERT
# This gives us raw logits (before softmax)
outputs = trainer.predict(val_ds)
logits = outputs.predictions
labels = outputs.label_ids

# Convert logits to probabilities (we take the second column: prob of label=1)
probs = softmax(logits, axis=1)[:, 1]  # shape (N,)

# Extract keyword_flag from earlier. (original val_df. Used as meta-feature)
val_keyword_flag = val_df["keyword_flag"].values.reshape(-1, 1)  # shape (N, 1)

# Concatenate features: [BERT probability, keyword_flag]
X_meta = np.hstack([probs.reshape(-1, 1), val_keyword_flag])  # shape (N, 2)

# Train a LightGBM model as meta-classifier
# This helps combine the BERT prediction with our rule-based keyword feature
clf = lgb.LGBMClassifier(
    class_weight='balanced',   # handle class imbalance automatically
    random_state=42
)
clf.fit(X_meta, labels)

#  Evaluate meta-classifier performance
val_preds = clf.predict(X_meta)
f1 = f1_score(labels, val_preds)

print("\n LightGBM Meta-classifier Evaluation:")
print(classification_report(labels, val_preds, digits=4))

[LightGBM] [Info] Number of positive: 2267, number of negative: 3729
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 257
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

 LightGBM Meta-classifier Evaluation:
              precision    recall  f1-score   support

           0     0.7789    0.8860    0.8290      3729
           1     0.7577    0.5862    0.6610      2267

    accuracy                         0.7727      5996
   macro avg     0.7683    0.7361    0.7450      5996
weighted avg     0.7709    0.7727    0.7655      5996



