In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import dagshub
import os
from dotenv import load_dotenv
load_dotenv()

os.environ['MLFLOW_TRACKING_URI'] = os.getenv('MLFLOW_TRACKING_URI')
os.environ['MLFLOW_TRACKING_USERNAME'] = os.getenv('MLFLOW_TRACKING_USERNAME')
os.environ['MLFLOW_TRACKING_PASSWORD'] = os.getenv('MLFLOW_TRACKING_PASSWORD')

dagshub.init(repo_owner='ArpitKadam', repo_name='SentimentalAI', mlflow=True)
mlflow.set_experiment("Stacking Model")

<Experiment: artifact_location='mlflow-artifacts:/30a0908252434983abe6d9151b73747c', creation_time=1758407201870, experiment_id='7', last_update_time=1758407201870, lifecycle_stage='active', name='Stacking Model', tags={}>

In [3]:
dataset = pd.read_csv('cleaned_dataset.csv')

# Map categories if needed
dataset['category'] = dataset['category'].map({-1: 2, 0: 0, 1: 1})

# Drop rows where either feature or label is missing
dataset = dataset.dropna(subset=['clean_comment', 'category'])

x = dataset['clean_comment']
y = dataset['category']

# Train/Test split with stratification
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF vectorization
ngram_range = (1, 2)
max_features = 2000
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)


In [4]:
lightgbm_model = LGBMClassifier(
    objective='multiclass',
    num_class=3,
    metric="multi_logloss",
    is_unbalance=True,
    class_weight="balanced",
    reg_alpha=0.1,
    reg_lambda=0.1,
    learning_rate=0.08081298097796712,
    n_estimators=367,
    max_depth=20,
    random_state=42
)

logreg_model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    solver='lbfgs',
    multi_class='multinomial',
    random_state=42
)

rf_model = RandomForestClassifier(
    n_estimators=200, max_depth=15, class_weight="balanced", random_state=42
)

xgb_model = XGBClassifier(
    n_estimators=300, learning_rate=0.05, max_depth=10,
    eval_metric="mlogloss", use_label_encoder=False, random_state=42
)

dt_model = DecisionTreeClassifier(
    criterion="gini", max_depth=15, class_weight="balanced", random_state=42
)

nb_model = MultinomialNB()

# Meta-learner
knn_meta_learner = KNeighborsClassifier(n_neighbors=5)

# Stacking Classifier
stacking_model = StackingClassifier(
    estimators=[
        ('lightgbm', lightgbm_model),
        ('logistic_regression', logreg_model),
        ('random_forest', rf_model),
        ('xgboost', xgb_model),
        ('decision_tree', dt_model),
        ('naive_bayes', nb_model)
    ],
    final_estimator=knn_meta_learner,
    cv=5
)


In [5]:
models = {
    "lightgbm": lightgbm_model,
    "logistic_regression": logreg_model,
    "random_forest": rf_model,
    "xgboost": xgb_model,
    "decision_tree": dt_model,
    "naive_bayes": nb_model,
    "stacking": stacking_model
}

results = {}

for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_run") as run:
        # Tags
        mlflow.set_tag("model_type", name)

        # Log vectorizer params (same for all)
        mlflow.log_param("vectorizer", "TfidfVectorizer")
        mlflow.log_param("ngram_range", ngram_range)
        mlflow.log_param("vectorizer_max_features", max_features)

        # Train model
        model.fit(x_train, y_train)
        preds = model.predict(x_test)

        # Metrics
        acc = accuracy_score(y_test, preds)
        results[name] = acc
        mlflow.log_metric("accuracy", acc)

        # Log classification report
        cls_report = classification_report(y_test, preds, output_dict=True)
        with open(f"{name}_classification_report.txt", "w") as f:
            f.write(str(cls_report))
        mlflow.log_artifact(f"{name}_classification_report.txt")

        # Confusion Matrix
        conf_matrix = confusion_matrix(y_test, preds)
        plt.figure(figsize=(6, 5))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title(f"Confusion Matrix - {name}")
        plt.savefig(f"{name}_confusion_matrix.png")
        mlflow.log_artifact(f"{name}_confusion_matrix.png")
        plt.close()

        # Log model params
        params = model.get_params()
        for param_name, param_value in params.items():
            mlflow.log_param(param_name, param_value)

        # Log model itself
        signature = infer_signature(x_train[:10].toarray(), model.predict(x_train[:10]))
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path=name,
            signature=signature,
            input_example=x_test[:1].toarray()
        )

# =========================
# Find best model
# =========================
best_model_name = max(results, key=results.get)
print("Best model for production:", best_model_name, "with accuracy:", results[best_model_name])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040579 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 81985
[LightGBM] [Info] Number of data points in the train set: 25232, number of used features: 1938
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



🏃 View run lightgbm_run at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7/runs/2fc87d6494fe45708c973ac20e835cac
🧪 View experiment at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

🏃 View run logistic_regression_run at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7/runs/4c17fa1683ce450fa332205c46f6d33b
🧪 View experiment at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

🏃 View run random_forest_run at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7/runs/9dba87f58bd84556acc334f964b54bef
🧪 View experiment at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

🏃 View run xgboost_run at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7/runs/1209686d88694a1f920603b22e536208
🧪 View experiment at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

🏃 View run decision_tree_run at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7/runs/a0e8d432033b449b962a232afc1236a0
🧪 View experiment at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

🏃 View run naive_bayes_run at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7/runs/d106383a90254b88a48de838ae91b920
🧪 View experiment at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 81985
[LightGBM] [Info] Number of data points in the train set: 25232, number of used features: 1938
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035562 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67796
[LightGBM] [Info] Number of data points in the train set: 20185, number of used features: 1925
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041069 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 67698
[LightGBM] [Info] Number of data points in the train set: 20185, number of used features: 1923
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67924
[LightGBM] [Info] Number of data points in the train set: 20186, number of used features: 1921
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054072 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 67252
[LightGBM] [Info] Number of data points in the train set: 20186, number of used features: 1920
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050432 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 67565
[LightGBM] [Info] Number of data points in the train set: 20186, number of used features: 1925
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



🏃 View run stacking_run at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7/runs/a4ea2a92f77549dcbabdc243b7041d7c
🧪 View experiment at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7
Best model for production: lightgbm with accuracy: 0.9317262009502023


In [6]:
results

{'lightgbm': 0.9317262009502023,
 'logistic_regression': 0.9241597747668485,
 'random_forest': 0.8140066866091853,
 'xgboost': 0.8945979236318846,
 'decision_tree': 0.7513637163469998,
 'naive_bayes': 0.8407531233503431,
 'stacking': 0.9296146401548477}

In [7]:
with open("model_comparison.json", "w") as f:
    import json
    json.dump(results, f)