In [1]:
# simple_train.py
"""
Simplified TextCNN training script for Jigsaw Toxic Comment dataset.
- No main() function
- Clean, minimal, beginner-friendly
- NOW includes inline text cleaning (no functions)
- Saves: model.h5 and tokenizer.pkl
"""

import os
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models, optimizers

# ---------------- CONFIG ----------------
DATA_PATH = r"C:\Users\AGASTYA\Downloads\ToxicCommentApp\train.csv"
MODEL_PATH = "model.h5"
TOKENIZER_PATH = "tokenizer.pkl"

NUM_WORDS = 20000
MAX_LEN = 150
EMBED_DIM = 100
BATCH_SIZE = 64
EPOCHS = 4
LABELS = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
# ----------------------------------------

print("Loading dataset...")
df = pd.read_csv(DATA_PATH)

# ------------------------------------------------------
# INLINE CLEANING (NO FUNCTIONS — you requested this)
# ------------------------------------------------------
df["comment_text"] = df["comment_text"].fillna("empty").astype(str)

# lowercase
df["comment_text"] = df["comment_text"].str.lower()

# remove URLs
df["comment_text"] = df["comment_text"].str.replace(r"http\S+|www\.\S+", " ", regex=True)

# replace @mentions and hashtags
df["comment_text"] = df["comment_text"].str.replace(r"@\w+", " @user ", regex=True)
df["comment_text"] = df["comment_text"].str.replace(r"#\w+", " #tag ", regex=True)

# expand simple contractions
df["comment_text"] = df["comment_text"].str.replace("can't", "cannot")
df["comment_text"] = df["comment_text"].str.replace("won't", "will not")
df["comment_text"] = df["comment_text"].str.replace("n't", " not")
df["comment_text"] = df["comment_text"].str.replace("'re", " are")
df["comment_text"] = df["comment_text"].str.replace("'s", " is")
df["comment_text"] = df["comment_text"].str.replace("'d", " would")
df["comment_text"] = df["comment_text"].str.replace("'ll", " will")
df["comment_text"] = df["comment_text"].str.replace("'ve", " have")

# remove punctuation except ! ? .
df["comment_text"] = df["comment_text"].str.replace(r"[^a-z0-9\s\!\?\.]", " ", regex=True)

# reduce multiple spaces
df["comment_text"] = df["comment_text"].str.replace(r"\s+", " ", regex=True).str.strip()
# ------------------------------------------------------

# Ensure all labels exist
for col in LABELS:
    if col not in df.columns:
        df[col] = 0

texts = df["comment_text"].values
labels = df[LABELS].values.astype("float32")

# Train/validation split
train_texts, val_texts, y_train, y_val = train_test_split(
    texts, labels, test_size=0.1, random_state=42, shuffle=True
)

print("Fitting tokenizer...")
tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Convert text → sequences → padded arrays
X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=MAX_LEN)
X_val = pad_sequences(tokenizer.texts_to_sequences(val_texts), maxlen=MAX_LEN)

print("Training data shape:", X_train.shape)

# ----- Build TextCNN Model -----
def build_textcnn():
    inp = layers.Input(shape=(MAX_LEN,))
    x = layers.Embedding(NUM_WORDS, EMBED_DIM)(inp)
    x = layers.Conv1D(128, 5, activation="relu")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dense(64, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(len(LABELS), activation="sigmoid")(x)
    return models.Model(inp, out)

model = build_textcnn()
model.compile(optimizer=optimizers.Adam(1e-3),
              loss="binary_crossentropy",
              metrics=["accuracy"])

model.summary()

print("Training model...")
model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS
)

# Save model + tokenizer
print("Saving model and tokenizer...")
model.save(MODEL_PATH)
with open(TOKENIZER_PATH, "wb") as f:
    pickle.dump(tokenizer, f)

# Evaluate with F1-score
print("Evaluating...")
y_pred_prob = model.predict(X_val)
y_pred = (y_pred_prob >= 0.5).astype(int)

for i, col in enumerate(LABELS):
    f1 = f1_score(y_val[:, i], y_pred[:, i], zero_division=0)
    print(f"{col}: F1 = {f1:.4f}")

print("Macro F1:",
      np.mean([f1_score(y_val[:, i], y_pred[:, i], zero_division=0)
               for i in range(len(LABELS))]))

print("Done.")


Loading dataset...
Fitting tokenizer...
Training data shape: (143613, 150)


Training model...
Epoch 1/4
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 32ms/step - accuracy: 0.9322 - loss: 0.0663 - val_accuracy: 0.9940 - val_loss: 0.0487
Epoch 2/4
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 36ms/step - accuracy: 0.9847 - loss: 0.0456 - val_accuracy: 0.9940 - val_loss: 0.0466
Epoch 3/4
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 36ms/step - accuracy: 0.9660 - loss: 0.0376 - val_accuracy: 0.9940 - val_loss: 0.0481
Epoch 4/4
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 37ms/step - accuracy: 0.8777 - loss: 0.0304 - val_accuracy: 0.9454 - val_loss: 0.0544




Saving model and tokenizer...
Evaluating...
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
toxic: F1 = 0.7681
severe_toxic: F1 = 0.1358
obscene: F1 = 0.8016
threat: F1 = 0.1905
insult: F1 = 0.7327
identity_hate: F1 = 0.0513
Macro F1: 0.44667146427849175
Done.
