In [1]:
import pandas as pd
import numpy as np
import warnings
from collections import Counter
warnings.filterwarnings("ignore")

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score


# ====== LOAD FILE ======
TRAIN_FILE = r"/kaggle/input/dep-kltn/Depression1040_train.xlsx"
VALID_FILE = r"/kaggle/input/dep-kltn/Depression1040_valid.xlsx"
TEST_FILE  = r"/kaggle/input/dep-kltn/Depression1040_test.xlsx"

train_df = pd.read_excel(TRAIN_FILE)
valid_df = pd.read_excel(VALID_FILE)
test_df  = pd.read_excel(TEST_FILE)

Xtrain_text = train_df["Content"]
Xvalid_text = valid_df["Content"]
Xtest_text  = test_df["Content"]

y_train = train_df["Label"].astype(str)
y_valid = valid_df["Label"].astype(str)
y_test  = test_df["Label"].astype(str)

# Map nhãn
label_order = ["1-Bình thường", "2-Nhẹ", "3-Vừa", "4-Nặng"]
label2id = {lbl: i for i, lbl in enumerate(label_order)}
id2label = {i: lbl for lbl, i in label2id.items()}

y_train = y_train.map(label2id)
y_valid = y_valid.map(label2id)
y_test  = y_test.map(label2id)

# === Hàm tính trọng số mẫu (ưu tiên lớp ít) ===
def make_sample_weight(y):
    cnt = Counter(y)
    w = np.array([1.0 / cnt[yi] for yi in y])
    w *= (len(y) / w.sum())
    return w

w_train = make_sample_weight(y_train)

# === Cấu hình TF-IDF cơ bản ===
BASE_TFIDF = dict(
    ngram_range=(1,2),
    sublinear_tf=True,
    lowercase=True,
    min_df=3,
    max_df=0.95
)

# === Cấu hình Cross-validation ===
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# === Hàm evaluate ===
def evaluate_and_log(name, model):
    preds = model.predict(Xtest_text)
    acc = accuracy_score(y_test, preds)
    f1_macro = f1_score(y_test, preds, average="macro", zero_division=0)
    print(f"\n===== {name} =====")
    print(classification_report(
        y_test, preds,
        labels=[0,1,2,3],
        target_names=label_order,
        digits=3,
        zero_division=0
    ))
    print(f"Accuracy: {acc:.4f} | Macro F1: {f1_macro:.4f}")

In [2]:
from sklearn.naive_bayes import MultinomialNB

pipe_nb = Pipeline([
    ("tfidf", TfidfVectorizer(**BASE_TFIDF)),
    ("clf", MultinomialNB())
])

param_grid_nb = {
    "tfidf__max_features": [5000, 10000, 15000],
    "clf__alpha": [0.1, 0.5, 1.0]
}

grid_nb = GridSearchCV(
    pipe_nb,
    param_grid=param_grid_nb,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid_nb.fit(Xtrain_text, y_train, **{"clf__sample_weight": w_train})
print("Best NB params:", grid_nb.best_params_, " | best F1:", grid_nb.best_score_)
evaluate_and_log("Naive Bayes", grid_nb.best_estimator_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best NB params: {'clf__alpha': 1.0, 'tfidf__max_features': 5000}  | best F1: 0.3719822429709992

===== Naive Bayes =====
               precision    recall  f1-score   support

1-Bình thường      0.875     0.583     0.700       144
        2-Nhẹ      0.324     0.500     0.393        44
        3-Vừa      0.125     0.294     0.175        17
       4-Nặng      0.000     0.000     0.000         4

     accuracy                          0.531       209
    macro avg      0.331     0.344     0.317       209
 weighted avg      0.681     0.531     0.579       209

Accuracy: 0.5311 | Macro F1: 0.3171


In [3]:
from sklearn.tree import DecisionTreeClassifier

pipe_dt = Pipeline([
    ("tfidf", TfidfVectorizer(**BASE_TFIDF)),
    ("clf", DecisionTreeClassifier(random_state=42))
])

param_grid_dt = {
    "tfidf__max_features": [5000, 10000],
    "clf__criterion": ["gini", "entropy"],
    "clf__max_depth": [None, 15, 30],
    "clf__class_weight": [None, "balanced"]
}

grid_dt = GridSearchCV(
    pipe_dt,
    param_grid=param_grid_dt,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid_dt.fit(Xtrain_text, y_train, **{"clf__sample_weight": w_train})
print("Best DT params:", grid_dt.best_params_, " | best F1:", grid_dt.best_score_)
evaluate_and_log("Decision Tree", grid_dt.best_estimator_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best DT params: {'clf__class_weight': None, 'clf__criterion': 'gini', 'clf__max_depth': None, 'tfidf__max_features': 10000}  | best F1: 0.34462911438953864

===== Decision Tree =====
               precision    recall  f1-score   support

1-Bình thường      0.769     0.785     0.777       144
        2-Nhẹ      0.368     0.318     0.341        44
        3-Vừa      0.000     0.000     0.000        17
       4-Nặng      0.250     0.250     0.250         4

     accuracy                          0.612       209
    macro avg      0.347     0.338     0.342       209
 weighted avg      0.612     0.612     0.612       209

Accuracy: 0.6124 | Macro F1: 0.3420


In [4]:
from sklearn.neighbors import KNeighborsClassifier

pipe_knn = Pipeline([
    ("tfidf", TfidfVectorizer(**BASE_TFIDF)),
    ("clf", KNeighborsClassifier())
])

param_grid_knn = {
    "tfidf__max_features": [5000, 8000],
    "clf__n_neighbors": [3, 5, 10],
    "clf__weights": ["uniform", "distance"]
}

grid_knn = GridSearchCV(
    pipe_knn,
    param_grid=param_grid_knn,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid_knn.fit(Xtrain_text, y_train)  # KNN không hỗ trợ sample_weight
print("Best KNN params:", grid_knn.best_params_, " | best F1:", grid_knn.best_score_)
evaluate_and_log("KNN", grid_knn.best_estimator_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best KNN params: {'clf__n_neighbors': 3, 'clf__weights': 'distance', 'tfidf__max_features': 5000}  | best F1: 0.3072455015567768

===== KNN =====
               precision    recall  f1-score   support

1-Bình thường      0.739     0.826     0.780       144
        2-Nhẹ      0.344     0.250     0.289        44
        3-Vừa      0.154     0.118     0.133        17
       4-Nặng      0.000     0.000     0.000         4

     accuracy                          0.632       209
    macro avg      0.309     0.299     0.301       209
 weighted avg      0.594     0.632     0.609       209

Accuracy: 0.6316 | Macro F1: 0.3008


In [5]:
from sklearn.linear_model import LogisticRegression

pipe_lr = Pipeline([
    ("tfidf", TfidfVectorizer(**BASE_TFIDF)),
    ("clf", LogisticRegression(max_iter=300, solver="lbfgs", random_state=42))
])

param_grid_lr = {
    "tfidf__max_features": [5000, 8000, 12000],
    "clf__C": [0.5, 1.0, 2.0],
    "clf__class_weight": [None, "balanced"]
}

grid_lr = GridSearchCV(
    pipe_lr,
    param_grid=param_grid_lr,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid_lr.fit(Xtrain_text, y_train, **{"clf__sample_weight": w_train})
print("Best LR params:", grid_lr.best_params_, " | best F1:", grid_lr.best_score_)
evaluate_and_log("Logistic Regression", grid_lr.best_estimator_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best LR params: {'clf__C': 1.0, 'clf__class_weight': None, 'tfidf__max_features': 8000}  | best F1: 0.3552243552751123

===== Logistic Regression =====
               precision    recall  f1-score   support

1-Bình thường      0.790     0.889     0.837       144
        2-Nhẹ      0.366     0.341     0.353        44
        3-Vừa      0.167     0.059     0.087        17
       4-Nặng      0.000     0.000     0.000         4

     accuracy                          0.689       209
    macro avg      0.331     0.322     0.319       209
 weighted avg      0.635     0.689     0.658       209

Accuracy: 0.6890 | Macro F1: 0.3191


In [6]:
from sklearn.svm import SVC

pipe_svm = Pipeline([
    ("tfidf", TfidfVectorizer(**BASE_TFIDF)),
    ("clf", SVC())
])

param_grid_svm = {
    "tfidf__max_features": [5000, 8000],
    "clf__kernel": ["linear", "rbf", "poly", "sigmoid"],
    "clf__C": [0.5, 1.0, 2.0],
    "clf__class_weight": [None, "balanced"]
}

grid_svm = GridSearchCV(
    pipe_svm,
    param_grid=param_grid_svm,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid_svm.fit(Xtrain_text, y_train, **{"clf__sample_weight": w_train})
print("Best SVM params:", grid_svm.best_params_, " | best F1:", grid_svm.best_score_)
evaluate_and_log("SVM", grid_svm.best_estimator_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best SVM params: {'clf__C': 1.0, 'clf__class_weight': None, 'clf__kernel': 'sigmoid', 'tfidf__max_features': 5000}  | best F1: 0.3621118509114746

===== SVM =====
               precision    recall  f1-score   support

1-Bình thường      0.838     0.757     0.796       144
        2-Nhẹ      0.338     0.500     0.404        44
        3-Vừa      0.214     0.176     0.194        17
       4-Nặng      0.000     0.000     0.000         4

     accuracy                          0.641       209
    macro avg      0.348     0.358     0.348       209
 weighted avg      0.666     0.641     0.649       209

Accuracy: 0.6411 | Macro F1: 0.3482


In [7]:
from sklearn.ensemble import RandomForestClassifier

pipe_rf = Pipeline([
    ("tfidf", TfidfVectorizer(**BASE_TFIDF)),
    ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
])

param_grid_rf = {
    "tfidf__max_features": [5000, 8000, 12000],
    "clf__n_estimators": [300, 500, 800, 1200],
    "clf__max_depth": [None, 20, 40],
    "clf__min_samples_leaf": [1, 2, 5],
    "clf__min_samples_split": [2, 5, 10],
    "clf__max_features": ["sqrt", "log2"],
    "clf__class_weight": [None, "balanced", "balanced_subsample"],
}

grid_rf = GridSearchCV(
    pipe_rf,
    param_grid=param_grid_rf,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid_rf.fit(Xtrain_text, y_train, **{"clf__sample_weight": w_train})
print("Best RF params:", grid_rf.best_params_, " | best F1:", grid_rf.best_score_)
evaluate_and_log("Random Forest", grid_rf.best_estimator_)

Fitting 5 folds for each of 1944 candidates, totalling 9720 fits
Best RF params: {'clf__class_weight': 'balanced_subsample', 'clf__max_depth': 20, 'clf__max_features': 'log2', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 5, 'clf__n_estimators': 300, 'tfidf__max_features': 12000}  | best F1: 0.2871241790465128

===== Random Forest =====
               precision    recall  f1-score   support

1-Bình thường      0.714     0.729     0.722       144
        2-Nhẹ      0.226     0.273     0.247        44
        3-Vừa      0.000     0.000     0.000        17
       4-Nặng      0.000     0.000     0.000         4

     accuracy                          0.560       209
    macro avg      0.235     0.250     0.242       209
 weighted avg      0.540     0.560     0.549       209

Accuracy: 0.5598 | Macro F1: 0.2423


In [8]:
from xgboost import XGBClassifier

pipe_xgb = Pipeline([
    ("tfidf", TfidfVectorizer(**BASE_TFIDF)),
    ("clf", XGBClassifier(
        objective="multi:softprob",
        num_class=4,
        random_state=42,
        eval_metric="mlogloss",
        tree_method="hist"
    ))
])

param_grid_xgb = {
    "tfidf__max_features": [5000, 8000],
    "clf__n_estimators": [200, 400, 600],
    "clf__max_depth": [4, 6, 8],
    "clf__learning_rate": [0.05, 0.1, 0.2],
    "clf__subsample": [0.7, 0.9],
    "clf__colsample_bytree": [0.7, 0.9]
}

grid_xgb = GridSearchCV(
    pipe_xgb,
    param_grid=param_grid_xgb,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid_xgb.fit(Xtrain_text, y_train, **{"clf__sample_weight": w_train})
print("Best XGB params:", grid_xgb.best_params_, " | best F1:", grid_xgb.best_score_)
evaluate_and_log("XGBoost", grid_xgb.best_estimator_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best XGB params: {'clf__colsample_bytree': 0.9, 'clf__learning_rate': 0.2, 'clf__max_depth': 4, 'clf__n_estimators': 600, 'clf__subsample': 0.7, 'tfidf__max_features': 5000}  | best F1: 0.31626311074972346

===== XGBoost =====
               precision    recall  f1-score   support

1-Bình thường      0.723     0.868     0.789       144
        2-Nhẹ      0.273     0.205     0.234        44
        3-Vừa      0.000     0.000     0.000        17
       4-Nặng      0.000     0.000     0.000         4

     accuracy                          0.641       209
    macro avg      0.249     0.268     0.256       209
 weighted avg      0.555     0.641     0.593       209

Accuracy: 0.6411 | Macro F1: 0.2556
