In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
import shutil
import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [20]:
# --- CONFIGURATION ---
# Checkpoint the tokenizer and the Stage 1 classifier are initialised from.
MODEL_ID = "huawei-noah/TinyBERT_General_4L_312D"
# Directory the trained model and tokenizer are saved to (and zipped from).
OUTPUT_DIR = "/kaggle/working/model_tinybert"

# Balanced HC3 CSV: this is the file the "LOAD DATA" step actually reads, so the
# constant now names the real training data instead of an unrelated, unused CSV.
HC3_PATH = "/kaggle/input/newhc3/hc3_flattened_balanced.csv"
MAX_LEN = 128     # max_length handed to the tokenizer (sequences are truncated to it)
BATCH_SIZE = 32   # batch size of the tf.data training pipeline
EPOCHS = 2        # number of epochs passed to model.fit


In [17]:
# --- 1. LOAD DATA ---
# Reads the balanced HC3 CSV and produces two parallel lists, `texts` and `labels`,
# that the tokenization step consumes.
print("‚è≥ Loading HC3 (balanced)...")

df = pd.read_csv("/kaggle/input/newhc3/hc3_flattened_balanced.csv")

# Safety checks
df.columns = df.columns.str.strip()  # tolerate stray whitespace around header names
assert {"text", "label"}.issubset(df.columns), df.columns

# Drop incomplete rows before converting: astype(str) would otherwise turn a
# missing text into the literal string "nan" and train on it silently.
df = df.dropna(subset=["text", "label"]).reset_index(drop=True)
assert len(df) > 0, "no rows with both text and label"

print("Dataset size:", len(df))
print(df["label"].value_counts())

texts = df["text"].astype(str).tolist()
# Integer class ids are what the sparse cross-entropy loss expects as targets.
labels = df["label"].astype(int).tolist()

‚è≥ Loading HC3 (balanced)...
Dataset size: 820
label
1    410
0    410
Name: count, dtype: int64


In [18]:
# --- 2. TOKENIZATION ---
print("‚è≥ Tokenizing...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Encode every text in one call: truncation on, padding on, capped at MAX_LEN.
tokenizer_options = {"truncation": True, "padding": True, "max_length": MAX_LEN}
encodings = tokenizer(texts, **tokenizer_options)

# Pair the encoded features with their labels, then shuffle, batch and prefetch.
feature_label_pairs = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
dataset = (
    feature_label_pairs
    .shuffle(1000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

‚è≥ Tokenizing...


config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

I0000 00:00:1769245100.099229      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [25]:
# --- 3. MODEL ---
print("‚è≥ Initializing TinyBERT...")

# Two-label sequence classifier built from the MODEL_ID checkpoint
# (from_pt=True is forwarded to the loader).
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_ID, num_labels=2, from_pt=True
)

# The loss is computed on raw logits; Adam runs with a 2e-5 learning rate.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

model.compile(loss=loss_fn, optimizer=optimizer, metrics=["accuracy"])


‚è≥ Initializing TinyBERT...


In [26]:
# --- 4. TRAIN ---
print("üöÄ Training on HC3...")
# Fit for EPOCHS epochs on the batched dataset; the History object returned by
# fit is the cell's last expression and therefore its displayed value.
model.fit(x=dataset, epochs=EPOCHS)

üöÄ Training on HC3...
Epoch 1/2


I0000 00:00:1769245467.892907     249 service.cc:152] XLA service 0x7c0f140d7d70 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1769245467.892948     249 service.cc:160]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1769245468.054441     249 cuda_dnn.cc:529] Loaded cuDNN version 91002
I0000 00:00:1769245468.343730     249 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/2


<tf_keras.src.callbacks.History at 0x7c0f8015d4c0>

In [27]:
# --- 5. SAVE ---
print("üíæ Saving model...")
# Write the model first, then the tokenizer, into the same OUTPUT_DIR.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

# Zip the contents of OUTPUT_DIR into /kaggle/working/tinybert_hc3_detector.zip.
shutil.make_archive(
    base_name="/kaggle/working/tinybert_hc3_detector",
    format="zip",
    root_dir=OUTPUT_DIR,
)

print("‚úÖ DONE")
print("Download: tinybert_hc3_detector.zip")

üíæ Saving model...
‚úÖ DONE
Download: tinybert_hc3_detector.zip


In [32]:
import os
import shutil
import glob
import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [37]:
# --- CONFIGURATION ---
# Root folder that is searched recursively for the saved Stage 1 model.
INPUT_ROOT = "/kaggle/input"
# Folder the final model and tokenizer are saved to before being zipped.
OUTPUT_DIR = "/kaggle/working/tinybert_final_model"

In [38]:
# --- 2. SMART SEARCH FOR STAGE 1 MODEL ---
# Locates the directory of the Stage 1 model under INPUT_ROOT and stores it in
# STAGE1_MODEL_DIR. Raises FileNotFoundError when no config.json exists at all.
print("‚è≥ Searching for Stage 1 Model (config.json)...")

# We look for 'config.json' which EVERY Transformer model has.
# This works even if Kaggle unzipped it into a weird subfolder.
# glob returns matches in arbitrary order, so they are sorted: when several
# config.json files exist, the same directory is then chosen on every run.
config_files = sorted(glob.glob(f"{INPUT_ROOT}/**/config.json", recursive=True))

if not config_files:
    # Fallback: List directories so you can debug
    print(f"‚ùå Could not find model! Listing {INPUT_ROOT} content:")
    for root, dirs, files in os.walk(INPUT_ROOT):
        print(root, files)
    raise FileNotFoundError("Could not find 'config.json' in your input files.")

# A config.json alone does not prove a model is present (other attached datasets
# may ship one too). Prefer directories that also contain a weights file; the
# names below are assumed to be what save_pretrained writes -- if none match we
# fall back to the first config.json directory, as before.
weight_file_names = ("tf_model.h5", "model.safetensors")
candidate_dirs = [os.path.dirname(path) for path in config_files]
dirs_with_weights = [
    folder
    for folder in candidate_dirs
    if any(os.path.isfile(os.path.join(folder, name)) for name in weight_file_names)
]
preferred_dirs = dirs_with_weights or candidate_dirs

if len(preferred_dirs) > 1:
    # Make an ambiguous choice visible instead of silently picking one.
    print(f"Warning: {len(preferred_dirs)} candidate model directories found: {preferred_dirs}")

# The model directory is wherever config.json lives
STAGE1_MODEL_DIR = preferred_dirs[0]
print(f"‚úÖ Found Stage 1 Model at: {STAGE1_MODEL_DIR}")


‚è≥ Searching for Stage 1 Model (config.json)...
‚úÖ Found Stage 1 Model at: /kaggle/input/tinybert-stage1


In [39]:
# --- 3. LOAD THE STAGE 1 MODEL ---
print("‚è≥ Loading the 'AI-Smart' TinyBERT...")
try:
    # Tokenizer first, then the classifier; both are read from STAGE1_MODEL_DIR.
    tokenizer = AutoTokenizer.from_pretrained(STAGE1_MODEL_DIR)
    model = TFAutoModelForSequenceClassification.from_pretrained(STAGE1_MODEL_DIR)
except Exception as load_error:
    # Report which directory failed, then propagate the same exception.
    print(f"‚ùå Failed to load model from {STAGE1_MODEL_DIR}")
    raise load_error

‚è≥ Loading the 'AI-Smart' TinyBERT...


In [40]:
# --- 4. LOAD STAGE 2 DATA (Spam/Bot Patterns) ---
# Downloads the sms_spam training split, tokenizes it with the Stage 1 tokenizer
# and builds `dataset_tf`, the batched tf.data pipeline used for fine-tuning.
print("‚è≥ Downloading Spam Dataset (The 'Dumb Bot' patterns)...")
dataset = load_dataset("sms_spam", split="train")

# Convert to pandas
df = pd.DataFrame(dataset)
# label 0 = human, 1 = spam/bot
print(f"‚úÖ Loaded {len(df)} rows of training data.")

# Prepare Data
texts = df['sms'].astype(str).tolist()
labels = df['label'].tolist()

print("‚è≥ Tokenizing...")
encodings = tokenizer(
    texts,
    truncation=True,
    padding=True,
    max_length=128
)

# Create TF Dataset
# The shuffle buffer spans the whole dataset: a fixed buffer of 1000 is smaller
# than this split and would only mix examples within a sliding 1000-element
# window instead of shuffling uniformly. max(..., 1) keeps the size valid.
shuffle_buffer_size = max(len(texts), 1)
dataset_tf = tf.data.Dataset.from_tensor_slices((
    dict(encodings),
    labels
)).shuffle(shuffle_buffer_size).batch(32).prefetch(tf.data.AUTOTUNE)

‚è≥ Downloading Spam Dataset (The 'Dumb Bot' patterns)...


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/359k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5574 [00:00<?, ? examples/s]

‚úÖ Loaded 5574 rows of training data.
‚è≥ Tokenizing...


In [41]:
# --- 5. FINE-TUNE (Training) ---
print("üöÄ Starting Stage 2 Training...")

# Same sparse cross-entropy on logits as a loss; Adam gets a low learning rate
# (1e-5) so the Stage 1 knowledge is preserved.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

model.compile(loss=loss_fn, optimizer=optimizer, metrics=["accuracy"])

# A single epoch over the spam pipeline; the returned History is displayed.
model.fit(x=dataset_tf, epochs=1)

üöÄ Starting Stage 2 Training...


<tf_keras.src.callbacks.History at 0x7c0f010740b0>

In [42]:
# --- 6. EXPORT FINAL MODEL ---
print(f"üíæ Saving FINAL model to {OUTPUT_DIR}...")
# Model first, then tokenizer, both written into OUTPUT_DIR.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

# Archive base name (make_archive is asked for the 'zip' format).
output_filename = "/kaggle/working/tinybert_bot_hunter_final"
shutil.make_archive(base_name=output_filename, format='zip', root_dir=OUTPUT_DIR)

print(f"‚úÖ DONE! Download '{os.path.basename(output_filename)}.zip' from the Output tab.")

üíæ Saving FINAL model to /kaggle/working/tinybert_final_model...
‚úÖ DONE! Download 'tinybert_bot_hunter_final.zip' from the Output tab.
