In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import dagshub
import os
from dotenv import load_dotenv
load_dotenv()

os.environ['MLFLOW_TRACKING_URI'] = os.getenv('MLFLOW_TRACKING_URI')
os.environ['MLFLOW_TRACKING_USERNAME'] = os.getenv('MLFLOW_TRACKING_USERNAME')
os.environ['MLFLOW_TRACKING_PASSWORD'] = os.getenv('MLFLOW_TRACKING_PASSWORD')

dagshub.init(repo_owner='ArpitKadam', repo_name='SentimentalAI', mlflow=True)
mlflow.set_experiment("Stacking Model")

<Experiment: artifact_location='mlflow-artifacts:/30a0908252434983abe6d9151b73747c', creation_time=1758407201870, experiment_id='7', last_update_time=1758407201870, lifecycle_stage='active', name='Stacking Model', tags={}>

In [44]:
dataset = pd.read_csv('dataset.csv')

# Map categories if needed
dataset['category'] = dataset['category'].map({-1: 2, 0: 0, 1: 1})

# Drop rows where either feature or label is missing
dataset = dataset.dropna(subset=['clean_comment', 'category'])

x = dataset['clean_comment']
y = dataset['category']

# Train/Test split with stratification
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF vectorization
ngram_range = (1, 2)
max_features = 2000
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)


In [45]:
lightgbm_model = LGBMClassifier(
    objective='multiclass',
    num_class=3,
    metric="multi_logloss",
    is_unbalance=True,
    class_weight="balanced",
    reg_alpha=0.1,
    reg_lambda=0.1,
    learning_rate=0.08081298097796712,
    n_estimators=367,
    max_depth=20,
    random_state=42
)

logreg_model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    solver='lbfgs',
    multi_class='multinomial',
    random_state=42
)

rf_model = RandomForestClassifier(
    n_estimators=200, max_depth=15, class_weight="balanced", random_state=42
)

xgb_model = XGBClassifier(
    n_estimators=300, learning_rate=0.05, max_depth=10,
    eval_metric="mlogloss", use_label_encoder=False, random_state=42
)

dt_model = DecisionTreeClassifier(
    criterion="gini", max_depth=15, class_weight="balanced", random_state=42
)

nb_model = MultinomialNB()

# Meta-learner
knn_meta_learner = KNeighborsClassifier(n_neighbors=5)

# Stacking Classifier
stacking_model = StackingClassifier(
    estimators=[
        ('lightgbm', lightgbm_model),
        ('logistic_regression', logreg_model),
        ('random_forest', rf_model),
        ('xgboost', xgb_model),
        ('decision_tree', dt_model),
        ('naive_bayes', nb_model)
    ],
    final_estimator=knn_meta_learner,
    cv=5
)


In [46]:
with mlflow.start_run() as run:
    mlflow.set_tag("mlflow.runName", "StackingClassifier_MultiModels")
    mlflow.set_tag("experiment_type", "Stacking Classifier")
    mlflow.set_tag("model_type", "StackingClassifier")
    mlflow.set_tag("description", "StackingClassifier with multiple base models and KNN meta-learner")

    # Log vectorizer params
    mlflow.log_param("vectorizer", "TfidfVectorizer")
    mlflow.log_param("ngram_range", ngram_range)
    mlflow.log_param("vectorizer_max_features", max_features)

    # Train stacking model
    stacking_model.fit(x_train, y_train)
    y_pred = stacking_model.predict(x_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", acc)

    # Log classification report
    cls_report = classification_report(y_test, y_pred, output_dict=True)
    with open("classification_report.txt", "w") as f:
        f.write(str(cls_report))
    mlflow.log_artifact("classification_report.txt")

    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix - StackingClassifier")
    plt.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png")
    plt.close()

    # Log final model parameters
    for name, model in stacking_model.estimators:
        params = model.get_params()
        for param_name, param_value in params.items():
            mlflow.log_param(f"{name}_{param_name}", param_value)

    # Meta-learner params
    meta_params = stacking_model.final_estimator.get_params()
    for param_name, param_value in meta_params.items():
        mlflow.log_param(f"meta_{param_name}", param_value)

    # Log model with input signature
    signature = infer_signature(x_train[:10].toarray(), stacking_model.predict(x_train[:10].toarray()))
    mlflow.sklearn.log_model(
        sk_model=stacking_model,
        artifact_path="stacking_model",
        signature=signature,
        input_example=x_test[:1].toarray()
    )

print(f"Stacking model accuracy: {acc}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038854 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 81985
[LightGBM] [Info] Number of data points in the train set: 25232, number of used features: 1938
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67796
[LightGBM] [Info] Number of data points in the train set: 20185, number of used features: 1925
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036655 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67698
[LightGBM] [Info] Number of data points in the train set: 20185, number of used features: 1923
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033922 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67924
[LightGBM] [Info] Number of data points in the train set: 20186, number of used features: 1921
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036303 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67252
[LightGBM] [Info] Number of data points in the train set: 20186, number of used features: 1920
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67565
[LightGBM] [Info] Number of data points in the train set: 20186, number of used features: 1925
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



🏃 View run StackingClassifier_MultiModels at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7/runs/34698cd14bbc421ba16872d6294d8e37
🧪 View experiment at: https://dagshub.com/ArpitKadam/SentimentalAI.mlflow/#/experiments/7
Stacking model accuracy: 0.9296146401548477
