In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.regularizers import l2

import optuna

---

# Read file


In [None]:
# Both splits are stored as UTF-8 CSV files in the processed-data folder.
train_csv = "../data/processed/train.csv"
val_csv = "../data/processed/val.csv"

df_train = pd.read_csv(train_csv, encoding="utf-8")
df_val = pd.read_csv(val_csv, encoding="utf-8")

In [None]:
# Preview the first five training rows (five is the pandas default).
df_train.head(n=5)

In [None]:
# Preview the first five validation rows (five is the pandas default).
df_val.head(n=5)

---

# Labels

## Train

In [None]:
# Target column of the training split.
matrix_labels_train = df_train.loc[:, "label"]

In [None]:
# Preview the first five training labels.
matrix_labels_train.head(n=5)

In [None]:
# Report how many distinct labels the training split contains, and which ones.
print(
    f"Number of labels: {len(matrix_labels_train.unique())}",
    f"Labels: {matrix_labels_train.unique()}",
    sep="\n",
)

## Val

In [None]:
# Target column of the validation split.
matrix_labels_val = df_val.loc[:, "label"]

In [None]:
# Preview the first five validation labels.
matrix_labels_val.head(n=5)

In [None]:
# Report how many distinct labels the validation split contains, and which ones.
print(
    f"Number of labels: {len(matrix_labels_val.unique())}",
    f"Labels: {matrix_labels_val.unique()}",
    sep="\n",
)

---

# Train, val split


In [None]:
# The only model input is the raw comment text; keep it as a one-column frame.
feature_columns = ["comment"]

# Pair each split's features with the label series extracted earlier.
X_train, y_train = df_train[feature_columns], matrix_labels_train
X_val, y_val = df_val[feature_columns], matrix_labels_val

In [None]:
# Encode labels
# Fit from the original label columns rather than from y_train / y_val, which
# this cell overwrites. On a first run the result is identical (y_train is
# matrix_labels_train at this point), but re-running the cell no longer refits
# the encoder on already-encoded integers, which would replace the label names
# in `le.classes_` with numbers and break inverse_transform / target_names.
le = LabelEncoder()
y_train = le.fit_transform(matrix_labels_train).astype(np.int64)
# transform() raises ValueError if validation contains a label unseen in training.
y_val = le.transform(matrix_labels_val).astype(np.int64)

---

# Vectorize


In [None]:
# Character n-gram TF-IDF settings, gathered in one place.
tfidf_options = {
    "analyzer": "char",  # build features from characters rather than words
    "ngram_range": (3, 5),  # n-grams of 3 to 5 characters
    "min_df": 3,  # ignore n-grams found in fewer than 3 documents
    "max_df": 0.95,  # ignore n-grams found in more than 95% of documents
    "max_features": 30000,  # cap on the vocabulary size
}
vec = TfidfVectorizer(**tfidf_options)

In [None]:
# Learn the vocabulary and IDF weights from the training comments only, then
# apply that fitted vectorizer unchanged to the validation comments.
train_comments, val_comments = X_train["comment"], X_val["comment"]

X_train_vec = vec.fit_transform(train_comments)
X_val_vec = vec.transform(val_comments)

In [None]:
# Replace each sparse TF-IDF matrix with a dense float32 array.
# NOTE: this rebinds the same names, so the cell can only be run once per
# vectorisation (a dense ndarray has no .toarray()).
X_train_vec, X_val_vec = (
    sparse_matrix.toarray().astype(np.float32)
    for sparse_matrix in (X_train_vec, X_val_vec)
)

In [None]:
# Sanity-check the vectorised matrices. The second matrix is the validation
# split (no test set has been loaded at this point), so it is labelled as such.
print(f"Train shape: {X_train_vec.shape}")
print(f"Val shape: {X_val_vec.shape}")
print(f"Vocabulary size: {len(vec.get_feature_names_out())}")

In [None]:
# `le` is fitted on the training labels, so its class list is the training
# class set. For validation, count the distinct encoded labels actually present
# in y_val; printing len(le.classes_) twice could never reveal a class that is
# missing from the validation split.
print("Number of classes (train): ", len(le.classes_))
print("Number of classes (val): ", len(set(y_val)))

In [None]:
# Network dimensions: input width from the TF-IDF matrix, output width from
# the number of classes known to the label encoder.
_, n_features = X_train_vec.shape
n_classes = le.classes_.shape[0]

---

# Feedforward neural network (FNN)


## Model


In [None]:
# Build model function
def build_model(input_dim, output_dim, params):
    """Assemble and compile a feed-forward softmax classifier.

    The network is a stack of one to four hidden blocks, each made of
    Dense (ReLU, L2 kernel penalty) -> BatchNormalization -> Dropout,
    followed by a softmax output layer.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of units in the softmax output layer.
        params: Hyperparameter mapping. ``hidden1``, ``dropout1``,
            ``n_layers`` and ``lr`` are always read; ``hiddenK`` and
            ``dropoutK`` are read for each K in 2..4 with ``n_layers >= K``;
            ``l2_reg`` is optional and falls back to 0.01.

    Returns:
        A compiled ``Sequential`` model using Adam, sparse categorical
        cross-entropy and the accuracy metric.
    """

    def add_hidden_block(net, units_key, dropout_key, **dense_kwargs):
        # One hidden block: regularised ReLU layer, batch norm, then dropout.
        net.add(
            Dense(
                params[units_key],
                activation="relu",
                kernel_regularizer=l2(params.get("l2_reg", 0.01)),
                **dense_kwargs,
            )
        )
        net.add(BatchNormalization())
        net.add(Dropout(params[dropout_key]))

    network = Sequential()

    # The first block is always present and declares the input width.
    add_hidden_block(network, "hidden1", "dropout1", input_shape=(input_dim,))

    # Blocks 2-4 are only added when the requested depth reaches them.
    optional_blocks = (
        (2, "hidden2", "dropout2"),
        (3, "hidden3", "dropout3"),
        (4, "hidden4", "dropout4"),
    )
    for min_depth, units_key, dropout_key in optional_blocks:
        if params["n_layers"] >= min_depth:
            add_hidden_block(network, units_key, dropout_key)

    # Single-label multiclass head: softmax paired with the sparse
    # (integer-label) categorical cross-entropy loss.
    network.add(Dense(output_dim, activation="softmax"))

    network.compile(
        optimizer=Adam(learning_rate=params["lr"]),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

    return network

In [None]:
# Objective function for Optuna
def objective(trial):
    """Optuna objective: train one candidate network and score it on validation.

    Samples a hyperparameter set from ``trial``, builds the network with
    ``build_model``, trains it on the notebook-level ``X_train_vec`` /
    ``y_train`` with early stopping on validation loss, and returns the
    macro-averaged F1 on ``X_val_vec`` / ``y_val``.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        Macro F1 score on the validation split (higher is better).
    """
    # Imported locally so this cell does not depend on an extra top-level import.
    from tensorflow.keras.backend import clear_session

    # Each trial builds a brand-new model; release the Keras state left behind
    # by earlier trials so memory does not grow over the course of the study.
    clear_session()

    params = {
        "n_layers": trial.suggest_int("n_layers", 2, 4),
        "hidden1": trial.suggest_categorical("hidden1", [512, 768, 1024]),
        "hidden2": trial.suggest_categorical("hidden2", [256, 384, 512]),
        "hidden3": trial.suggest_categorical("hidden3", [128, 192, 256]),
        "hidden4": trial.suggest_categorical("hidden4", [64, 96, 128]),
        "dropout1": trial.suggest_float("dropout1", 0.3, 0.6),
        "dropout2": trial.suggest_float("dropout2", 0.3, 0.5),
        "dropout3": trial.suggest_float("dropout3", 0.2, 0.5),
        "dropout4": trial.suggest_float("dropout4", 0.2, 0.4),
        "l2_reg": trial.suggest_float("l2_reg", 1e-5, 1e-2, log=True),
        "lr": trial.suggest_float("lr", 1e-4, 5e-3, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [8, 16, 32, 64]),
    }

    model = build_model(input_dim=n_features, output_dim=n_classes, params=params)

    # Stop once validation loss has not improved for 3 epochs and roll back to
    # the best weights seen.
    early_stop = EarlyStopping(
        monitor="val_loss", patience=3, restore_best_weights=True
    )

    # Use precomputed numeric vectors for training/validation
    model.fit(
        X_train_vec,
        y_train,
        validation_data=(X_val_vec, y_val),
        epochs=20,
        batch_size=params["batch_size"],
        callbacks=[early_stop],
        verbose=0,
    )

    # verbose=0 keeps the per-trial output quiet, matching the silent fit above.
    y_val_prob = model.predict(X_val_vec, verbose=0)
    y_val_pred = np.argmax(y_val_prob, axis=1)

    # zero_division=0 matches the evaluation cell: a class that is never
    # predicted contributes 0 to the macro average.
    return f1_score(y_val, y_val_pred, average="macro", zero_division=0)

In [None]:
# Optuna study
# Seed the sampler so the hyperparameter proposals can be reproduced between
# runs. The scores themselves can still vary, because network initialisation
# and batch shuffling are not seeded in this notebook.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

In [None]:
# Summarise the best trial of the study: its score and its index.
winning_trial = study.best_trial
print("Best f1-macro:", winning_trial.value)
print("Best trial:", winning_trial.number)

In [None]:
# Hyperparameters of the best-scoring trial.
best_params = study.best_params

# Build the whole report first, then print it in one go.
report_lines = ["Best params:"]
report_lines.extend(f"  {k}: {v}" for k, v in best_params.items())
print("\n".join(report_lines))

In [None]:
# Rebuild a fresh, untrained network from the best trial's hyperparameters.
model = build_model(
    input_dim=n_features,
    output_dim=n_classes,
    params=best_params,
)

In [None]:
# Fit final model
# Train under the same regime the tuning trials used (validation monitoring,
# early stopping with patience 3, best weights restored). A fixed 20-epoch run
# without validation would not reproduce the configuration that earned the
# best score and could overfit.
final_early_stop = EarlyStopping(
    monitor="val_loss", patience=3, restore_best_weights=True
)

history = model.fit(
    X_train_vec,
    y_train,
    validation_data=(X_val_vec, y_val),
    epochs=20,
    batch_size=best_params["batch_size"],
    callbacks=[final_early_stop],
    verbose=1,
)

## Eval


In [None]:
# Class probabilities for every validation sample, then the index of the most
# probable class per row.
y_prob = model.predict(X_val_vec)
y_pred = y_prob.argmax(axis=1)

In [None]:
# Validation metrics. The macro-averaged scores share the same settings:
# unweighted mean over classes, and 0 for a class with no predictions/support.
macro_kwargs = {"average": "macro", "zero_division": 0}

metrics = {
    "accuracy_score": accuracy_score(y_val, y_pred),
    "precision_macro": precision_score(y_val, y_pred, **macro_kwargs),
    "recall_macro": recall_score(y_val, y_pred, **macro_kwargs),
    "f1_macro": f1_score(y_val, y_pred, **macro_kwargs),
}

# One row per metric, with a single "score" column.
matrix_metrics = pd.DataFrame.from_dict(
    metrics,
    orient="index",
    columns=["score"],
)

In [None]:
# Show the metric table rounded to four decimals (display only; the stored
# frame keeps full precision).
matrix_metrics.round(decimals=4)

In [None]:
# Per-class precision/recall/F1, with rows named after the original labels.
val_report = classification_report(
    y_val,
    y_pred,
    target_names=le.classes_,
    zero_division=0,
)
print(val_report)

## Test on raw validation samples


In [None]:
# Take a handful of rows near the end of the raw validation file.
# The slice -6:-1 stops before the final row, so the last row is not included.
raw_val = pd.read_csv("../data/raw/val.csv")
df_test = raw_val.iloc[-6:-1, :]
df_test

In [None]:
# Vectorise the sample comments with the already-fitted TF-IDF vectorizer.
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples).toarray()

# Score the samples with the trained final model.
probs = model.predict(samples_vec)
preds = probs.argmax(axis=1)

# Decode every predicted index in one call instead of once per sample.
label_names = le.inverse_transform(preds)

for i, (text, pred_idx, label_name) in enumerate(zip(samples, preds, label_names)):
    # Probability the model assigned to the class it picked.
    confidence = probs[i, pred_idx]
    print(f"Sample {i+1}:")
    print(f"\tText: {text}")
    print(f"\tPredicted label: {label_name}")
    print(f"\tConfidence: {confidence:.4f}\n")