---

# Libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

from xgboost import XGBClassifier
import optuna

---

# Read file


In [None]:
# Load the raw training and validation splits.
# Paths are relative to the notebook's working directory (the same
# "../data/raw" layout the test-set load uses) instead of a machine-specific
# absolute drive path, so the notebook is not tied to one workstation.
df_train = pd.read_csv("../data/raw/train.csv", encoding="utf-8")
df_val = pd.read_csv("../data/raw/val.csv", encoding="utf-8")

In [None]:
# Summarise the training frame (columns, dtypes, non-null counts).
df_train.info()

In [None]:
# Summarise the validation frame (columns, dtypes, non-null counts).
df_val.info()

---

# Labels


## Train


In [None]:
# Target column of the training split.
matrix_labels_train = df_train["label"]

In [None]:
# Peek at the first few training labels.
matrix_labels_train.head()

In [None]:
# Distinct label values present in the training split.
train_label_values = matrix_labels_train.unique()
print(f"Number of labels: {len(train_label_values)}")
print(f"Labels: {train_label_values}")

## Val


In [None]:
# Target column of the validation split.
matrix_labels_val = df_val["label"]

In [None]:
# Peek at the first few validation labels.
matrix_labels_val.head()

In [None]:
# Distinct label values present in the validation split.
val_label_values = matrix_labels_val.unique()
print(f"Number of labels: {len(val_label_values)}")
print(f"Labels: {val_label_values}")

---

# Train, val split


In [None]:
# Features stay as single-column frames holding the raw comment text;
# targets are the label series extracted above.
X_train, y_train = df_train[["comment"]], matrix_labels_train
X_val, y_val = df_val[["comment"]], matrix_labels_val

In [None]:
# Encode labels
# Fit on the original label series rather than on `y_train` itself. At this
# point both hold the same values, but reading from `matrix_labels_train`
# keeps the cell idempotent: re-running it no longer re-fits the encoder on
# the already-encoded integers, which would turn `le.classes_` into ints and
# break every later `inverse_transform` / `target_names` use.
le = LabelEncoder()
y_train = le.fit_transform(matrix_labels_train).astype(np.int64)

---

# Vectorize


In [None]:
# Character-level TF-IDF over 3- to 5-character n-grams.
vec = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    min_df=3,  # drop n-grams seen in fewer than 3 documents
    max_df=0.95,  # drop n-grams seen in more than 95% of documents
    max_features=30000,  # cap the vocabulary size
)

In [None]:
# Fit TF-IDF on the training comments only, then reuse the fitted vectorizer
# to transform the validation comments (no refit on validation data).
X_train_vec = vec.fit_transform(X_train["comment"])
X_val_vec = vec.transform(X_val["comment"])

In [None]:
# Report the matrix shapes and the learned vocabulary size.
# The second matrix is the validation split, so it is labelled "Val"
# (it was previously printed as "Test").
print(f"Train shape: {X_train_vec.shape}")
print(f"Val shape: {X_val_vec.shape}")
print(f"Vocabulary size: {len(vec.get_feature_names_out())}")

---

# Load Model


In [None]:
# Stratified 5-fold splitter (shuffled, fixed seed) reused by the model
# searches in this notebook.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

---

## SVM


### Model


In [None]:
# Base support-vector classifier; C, kernel and gamma are chosen by the grid
# search below.
svc = SVC(random_state=42)

In [None]:
# hyperparameters
# `gamma` is the kernel coefficient for "rbf", "poly" and "sigmoid" only; the
# linear kernel ignores it. Splitting the grid avoids fitting every linear
# candidate twice (once per gamma value) for identical models.
param_grid = [
    {
        "C": np.linspace(1, 10, 20),
        "kernel": ["linear"],
    },
    {
        "C": np.linspace(1, 10, 20),
        "kernel": ["rbf", "poly", "sigmoid"],
        "gamma": ["scale", "auto"],
    },
]

In [None]:
# Model-selection criterion: macro-averaged F1.
scoring = "f1_macro"

# Exhaustive search over `param_grid`, cross-validated with the shared splitter.
grid_svm = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    cv=cv_splitter,
    scoring=scoring,
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated SVM search on the vectorized training data.
grid_svm.fit(X_train_vec, y_train)

In [None]:
# Report the winning SVM hyperparameters and their cross-validated score.
print("Best params:")
for param_name, param_value in grid_svm.best_params_.items():
    print(f"\t{param_name}: {param_value}")
print()
print(f"Best CV score (f1): {grid_svm.best_score_:.2f}")

### Eval


In [None]:
# Predict on the validation set and map the integer classes back to the
# original label values so they can be compared with `y_val`.
y_pred = le.inverse_transform(grid_svm.predict(X_val_vec))

In [None]:
# Validation metrics: accuracy plus macro-averaged precision / recall / F1.
# zero_division=0 scores an undefined precision/recall as 0.
metrics = dict(
    accuracy_score=accuracy_score(y_val, y_pred),
    precision_macro=precision_score(y_val, y_pred, average="macro", zero_division=0),
    recall_macro=recall_score(y_val, y_pred, average="macro", zero_division=0),
    f1_macro=f1_score(y_val, y_pred, average="macro", zero_division=0),
)

# One row per metric, single "score" column.
matrix_metrics = pd.Series(metrics, name="score").to_frame()

In [None]:
# Display the scores rounded to 4 decimal places.
matrix_metrics.round(4)

In [None]:
# Classification report
# Pin the label set and order to the encoder's classes so `target_names`
# always lines up with the report rows, even if a class is missing from both
# the validation labels and the predictions.
print(
    classification_report(
        y_val.values,
        y_pred,
        labels=le.classes_,
        target_names=le.classes_,
        zero_division=0,
    )
)

In [None]:
# Confusion matrix
# `labels=le.classes_` fixes the row/column order so it is guaranteed to match
# the tick labels below, even if a class never appears in y_val / y_pred.
cm = confusion_matrix(y_val, y_pred, labels=le.classes_)

# Plot confusion matrix (rows = true labels, columns = predicted labels)
plt.figure(figsize=(8, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="coolwarm",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    cbar_kws={"label": "Count"},
)
plt.title("Confusion matrix", pad=20)
plt.ylabel("True", fontsize=12)
plt.xlabel("Predicted", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

### Test


In [None]:
# Load the test set and keep a separate copy to receive predictions, leaving
# `df_test` itself unmodified.
df_test = pd.read_csv("../data/raw/test.csv")
df_test_predict = df_test.copy()
df_test_predict.head()

In [None]:
# Predict a label for every test comment with the tuned SVM.
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples)
preds = grid_svm.best_estimator_.predict(samples_vec)
preds = le.inverse_transform(preds)

# Assign the whole column at once. `preds` is in row order, so this is
# positional and does not rely on the frame having a 0..n-1 index, unlike
# per-row `.loc[i - 1, ...]` writes (which are also much slower).
df_test_predict["predicted_label"] = preds

In [None]:
# Count test rows per (segment, label, predicted_label) combination, to compare
# true and predicted labels within each segment.
# NOTE(review): assumes test.csv provides "segment" and "label" columns — confirm.
df_test_predict[["segment", "label", "predicted_label"]].groupby(
    ["segment", "label", "predicted_label"]
).value_counts()

---

## Logistic regression


### Model


In [None]:
# Base logistic-regression classifier; C, penalty and solver are chosen by the
# grid search below.
log = LogisticRegression(random_state=42, max_iter=1000)

In [None]:
# hyperparameters
# Not every solver supports every penalty, so the grid is split into the valid
# combinations only. The full cross-product contained invalid pairs (e.g. "l1"
# with "lbfgs", or "elasticnet" without `l1_ratio`); those fits fail and are
# scored as NaN by GridSearchCV, silently here because warnings are suppressed.
param_grid = [
    # L2 is supported by every listed solver.
    {
        "C": np.linspace(1, 10, 20),
        "penalty": ["l2"],
        "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"],
    },
    # L1 is only supported by liblinear and saga.
    {
        "C": np.linspace(1, 10, 20),
        "penalty": ["l1"],
        "solver": ["liblinear", "saga"],
    },
    # Elastic-net is saga-only and needs an explicit L1/L2 mixing ratio.
    {
        "C": np.linspace(1, 10, 20),
        "penalty": ["elasticnet"],
        "solver": ["saga"],
        "l1_ratio": [0.5],
    },
]

In [None]:
# Model-selection criterion: macro-averaged F1.
scoring = "f1_macro"

# Exhaustive search over `param_grid`, cross-validated with the shared splitter.
grid_log = GridSearchCV(
    estimator=log,
    param_grid=param_grid,
    cv=cv_splitter,
    scoring=scoring,
    verbose=0,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated logistic-regression search on the vectorized training data.
grid_log.fit(X_train_vec, y_train)

In [None]:
# Report the winning logistic-regression hyperparameters and their CV score.
print("Best params:")
for param_name, param_value in grid_log.best_params_.items():
    print(f"\t{param_name}: {param_value}")
print()
print(f"Best CV score (f1): {grid_log.best_score_:.2f}")

### Eval


In [None]:
# Predict
y_pred = grid_log.predict(X_val_vec)
y_pred = le.inverse_transform(y_pred)

In [None]:
# Validation metrics: accuracy plus macro-averaged precision / recall / F1.
# zero_division=0 scores an undefined precision/recall as 0.
metrics = dict(
    accuracy_score=accuracy_score(y_val, y_pred),
    precision_macro=precision_score(y_val, y_pred, average="macro", zero_division=0),
    recall_macro=recall_score(y_val, y_pred, average="macro", zero_division=0),
    f1_macro=f1_score(y_val, y_pred, average="macro", zero_division=0),
)

# One row per metric, single "score" column.
matrix_metrics = pd.Series(metrics, name="score").to_frame()

In [None]:
# Display the scores rounded to 4 decimal places.
matrix_metrics.round(4)

In [None]:
# Classification report
# Pin the label set and order to the encoder's classes so `target_names`
# always lines up with the report rows, even if a class is missing from both
# the validation labels and the predictions.
print(
    classification_report(
        y_val.values,
        y_pred,
        labels=le.classes_,
        target_names=le.classes_,
        zero_division=0,
    )
)

In [None]:
# Confusion matrix
# `labels=le.classes_` fixes the row/column order so it is guaranteed to match
# the tick labels below, even if a class never appears in y_val / y_pred.
cm = confusion_matrix(y_val, y_pred, labels=le.classes_)

# Plot confusion matrix (rows = true labels, columns = predicted labels)
plt.figure(figsize=(8, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="coolwarm",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    cbar_kws={"label": "Count"},
)
plt.title("Confusion matrix", pad=20)
plt.ylabel("True", fontsize=12)
plt.xlabel("Predicted", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

### Test


In [None]:
# Reload the test set and keep a separate copy to receive predictions.
# Uses the project-relative path that the SVM test cell reads from; the
# previous "/content/drive/..." location looks like a Colab Drive mount and
# would not resolve next to the local data files.
df_test = pd.read_csv("../data/raw/test.csv")
df_test_predict = df_test.copy()
df_test_predict.head()

In [None]:
# Predict a label for every test comment with the tuned logistic regression.
# (This section previously predicted with `grid_svm`, so it just repeated the
# SVM results instead of evaluating the logistic-regression model.)
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples)
preds = grid_log.best_estimator_.predict(samples_vec)
preds = le.inverse_transform(preds)

# Assign the whole column at once. `preds` is in row order, so this is
# positional and does not rely on the frame having a 0..n-1 index.
df_test_predict["predicted_label"] = preds

In [None]:
# Count test rows per (segment, label, predicted_label) combination, to compare
# true and predicted labels within each segment.
# NOTE(review): assumes test.csv provides "segment" and "label" columns — confirm.
df_test_predict[["segment", "label", "predicted_label"]].groupby(
    ["segment", "label", "predicted_label"]
).value_counts()

---

## XGBoost


### Model


In [None]:
def objective(trial):
    """Optuna objective: mean macro-F1 of an XGBoost classifier across CV folds.

    Samples one hyperparameter configuration from ``trial``, trains a fresh
    ``XGBClassifier`` on each training fold produced by ``cv_splitter`` over
    ``X_train_vec`` / ``y_train``, and scores it on the matching held-out fold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the per-fold macro-averaged F1 scores.
    """
    # Settings that are the same for every trial.
    params = {
        "objective": "multi:softprob",
        "num_class": 3,
        "eval_metric": "mlogloss",
        "tree_method": "hist",
        "random_state": 42,
    }
    # Hyperparameters sampled for this trial.
    params.update(
        max_depth=trial.suggest_int("max_depth", 3, 10),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        n_estimators=trial.suggest_int("n_estimators", 200, 800),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
        gamma=trial.suggest_float("gamma", 0, 5),
        reg_alpha=trial.suggest_float("reg_alpha", 1e-8, 1.0),
        reg_lambda=trial.suggest_float("reg_lambda", 1e-8, 1.0),
    )

    fold_scores = []
    for fit_rows, holdout_rows in cv_splitter.split(X_train_vec, y_train):
        # A new model per fold so no state carries over between folds.
        fold_model = XGBClassifier(**params)
        fold_model.fit(X_train_vec[fit_rows], y_train[fit_rows])

        holdout_pred = fold_model.predict(X_train_vec[holdout_rows])
        fold_scores.append(
            f1_score(y_train[holdout_rows], holdout_pred, average="macro")
        )

    return np.mean(fold_scores)

In [None]:
# Seed the sampler so the hyperparameter search is reproducible across runs,
# in line with the fixed random_state=42 used elsewhere in this notebook.
# TPESampler is Optuna's default sampler, so the search strategy is unchanged.
study = optuna.create_study(
    direction="maximize",
    study_name="xgb_f1_macro",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

In [None]:
# Best objective value (mean CV macro-F1) and the index of the trial that reached it.
print("Best f1-macro:", study.best_value)
print("Best trial:", study.best_trial.number)

In [None]:
# Hyperparameters sampled by the best trial.
best_params = study.best_params

print("Best params:")
for param_name, param_value in best_params.items():
    print(f"  {param_name}: {param_value}")

In [None]:
# Build final model with best hyperparameters
# `study.best_params` only holds the sampled values, so the fixed settings
# used during the search are added back before constructing the model.
best_params.update(
    objective="multi:softprob",
    num_class=3,
    eval_metric="mlogloss",
    tree_method="hist",
    random_state=42,
)

xgb_model = XGBClassifier(**best_params)

In [None]:
# Train the final XGBoost model on the full vectorized training set.
xgb_model.fit(X_train_vec, y_train)

### Eval


In [None]:
# Predict on the validation set and map the integer classes back to the
# original label values so they can be compared with `y_val`.
y_pred = le.inverse_transform(xgb_model.predict(X_val_vec))

In [None]:
# Validation metrics: accuracy plus macro-averaged precision / recall / F1.
# zero_division=0 scores an undefined precision/recall as 0.
metrics = dict(
    accuracy_score=accuracy_score(y_val, y_pred),
    precision_macro=precision_score(y_val, y_pred, average="macro", zero_division=0),
    recall_macro=recall_score(y_val, y_pred, average="macro", zero_division=0),
    f1_macro=f1_score(y_val, y_pred, average="macro", zero_division=0),
)

# One row per metric, single "score" column.
matrix_metrics = pd.Series(metrics, name="score").to_frame()

In [None]:
# Display the scores rounded to 4 decimal places.
matrix_metrics.round(4)

In [None]:
# Classification report
# Pin the label set and order to the encoder's classes so `target_names`
# always lines up with the report rows, even if a class is missing from both
# the validation labels and the predictions.
print(
    classification_report(
        y_val,
        y_pred,
        labels=le.classes_,
        target_names=le.classes_,
        zero_division=0,
    )
)

In [None]:
# Confusion matrix
# `labels=le.classes_` fixes the row/column order so it is guaranteed to match
# the tick labels below, even if a class never appears in y_val / y_pred.
cm = confusion_matrix(y_val, y_pred, labels=le.classes_)

# Plot confusion matrix (rows = true labels, columns = predicted labels)
plt.figure(figsize=(8, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="coolwarm",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    cbar_kws={"label": "Count"},
)
plt.title("Confusion matrix", pad=20)
plt.ylabel("True", fontsize=12)
plt.xlabel("Predicted", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

### Test


In [None]:
# Reload the test set and keep a separate copy to receive predictions.
# Uses the project-relative path that the SVM test cell reads from; the
# previous "/content/drive/..." location looks like a Colab Drive mount and
# would not resolve next to the local data files.
df_test = pd.read_csv("../data/raw/test.csv")
df_test_predict = df_test.copy()
df_test_predict.head()

In [None]:
# Predict a label for every test comment with the final XGBoost model.
# (This section previously predicted with `grid_svm`, so it just repeated the
# SVM results instead of evaluating the XGBoost model.)
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples)
preds = xgb_model.predict(samples_vec)
preds = le.inverse_transform(preds)

# Assign the whole column at once. `preds` is in row order, so this is
# positional and does not rely on the frame having a 0..n-1 index.
df_test_predict["predicted_label"] = preds

In [None]:
# Count test rows per (segment, label, predicted_label) combination, to compare
# true and predicted labels within each segment.
# NOTE(review): assumes test.csv provides "segment" and "label" columns — confirm.
df_test_predict[["segment", "label", "predicted_label"]].groupby(
    ["segment", "label", "predicted_label"]
).value_counts()