In [1]:
# Connect this notebook to the DagsHub repository that hosts the experiments.
# NOTE(review): mlflow=True presumably lets dagshub configure MLflow access
# (tracking URI / credentials) for this repo — confirm against the dagshub docs.
import dagshub
dagshub.init(repo_owner='Anmol25', repo_name='youtube-sentiment-analysis', mlflow=True)

import mlflow
# Explicitly point MLflow at the repository's tracking server.
mlflow.set_tracking_uri("https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow")

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the preprocessed comments; the preview below shows the text column
# `clean_comment` and the sentiment label column `category`.
df = pd.read_csv("data/preprocessed/sentiments_preprocessed.csv")
df.head()

Unnamed: 0,clean_comment,category
0,cant believe modi,0
1,karachi total blackout,0
2,couldnt done year modi year increasing unemplo...,0
3,modi talk world tallest statue talk world larg...,-1
4,major announcement modi everyone waiting game ...,-1


In [4]:
# Sanity check: count missing values per column before splitting/vectorizing.
df.isna().sum()

clean_comment    0
category         0
dtype: int64

In [None]:
# Select the MLflow experiment that the run below is logged under.
# The name has to match the existing experiment exactly; a stray trailing
# space would be treated as a different experiment name and split the runs.
mlflow.set_experiment("Exp - 6 Stacking Models")

<Experiment: artifact_location='mlflow-artifacts:/2a316a77d683478baeb37ec5d412213b', creation_time=1735206252054, experiment_id='13', last_update_time=1735206252054, lifecycle_stage='active', name='Exp - 6 Stacking Models', tags={}>

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from mlflow.models.signature import infer_signature
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
## Remapping outputs
# Move the negative class from -1 to 2 so the labels are the ids 0, 1, 2.
# The identity entry 2 -> 2 keeps this cell idempotent: without it, running
# the cell a second time would turn every already-remapped 2 into NaN.
remapped = df['category'].map({-1: 2, 0: 0, 1: 1, 2: 2})

# Validate before writing back so an unexpected label never silently becomes NaN.
assert remapped.notna().all(), "Unexpected label found in 'category' (expected -1, 0, 1 or 2)"
df['category'] = remapped

In [8]:
import json
# Tuned hyperparameters for the base learners, stored as JSON under best_params/.
# (CatBoost is currently disabled, so only two models are loaded.)

# Logistic Regression
with open("best_params/LoR.json", "r") as file:
    lor_params = json.loads(file.read())

# LightGBM
with open("best_params/lightgbm.json", "r") as file:
    lgbm_params = json.loads(file.read())

# # CatBoost
# with open("best_params/catboost.json", "r") as file:
#     catboost_params = json.load(file)

### Vectorizer

In [9]:
# TF-IDF settings: unigrams only, vocabulary capped via max_features.
max_features = 9000
ngram_range = (1, 1)

# Stratified 80/20 split on the raw text so each split keeps the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'],
    df['category'],
    test_size=0.2,
    random_state=42,
    stratify=df['category'],
)

vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)

# Fit the vocabulary on the training text only, then reuse it for the test
# text. Note that X_train / X_test are rebound from raw text to TF-IDF matrices.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

### Experiment

In [10]:
# Train a stacking ensemble (Logistic Regression + LightGBM, with a Logistic
# Regression meta-learner), evaluate it on the test split, and record metrics,
# the confusion matrix, hyperparameters and the fitted model in one MLflow run.
# The `with` block ends the run on exit, so no explicit mlflow.end_run() is needed.
with mlflow.start_run():
    mlflow.set_tag("mlflow.runName", "Stacking Classifier")
    mlflow.set_tag("Stacking", "LoR + LGBM")

    # Logistic Regression (tuned params loaded from best_params/LoR.json)
    lor = LogisticRegression(**lor_params, n_jobs=-1)

    # LightGBM (tuned params loaded from best_params/lightgbm.json)
    lgbm = LGBMClassifier(**lgbm_params, boosting_type="gbdt",
                          n_jobs=-1, objective="multiclass", num_class=3,
                          verbosity=-1)

    # CatBoost (currently disabled)
    #catboost = CatBoostClassifier(**catboost_params, verbose=0,task_type="GPU",devices="0",early_stopping_rounds=50)

    # Stacking Classifier: base learners are combined by a default
    # LogisticRegression trained on their 5-fold cross-validated predictions.
    estimators = [
        ('lor', lor),
        ('lgbm', lgbm)
        #('catboost', catboost)
    ]
    stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(n_jobs=-1),
                               n_jobs=-1, cv=5)

    stack.fit(X_train, y_train)
    y_pred = stack.predict(X_test)

    # Log accuracy
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)

    # Log classification report: flatten the nested per-label dicts into
    # "<label>_<metric>" entries (the scalar "accuracy" entry is skipped because
    # it is logged above) and send them in one batch instead of one request each.
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    report_metrics = {
        f"{label}_{metric}": value
        for label, metrics in classification_rep.items()
        if isinstance(metrics, dict)
        for metric, value in metrics.items()
    }
    mlflow.log_metrics(report_metrics)

    # Log confusion matrix as an image artifact; the figure is closed afterwards
    # so it is neither rendered inline nor kept in memory.
    conf_matrix = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(f"Confusion Matrix: TF-IDF Unigrams, max_features={max_features}")
    fig.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png")
    plt.close(fig)

    ## Create model_signature from a single training row and its label
    signature = infer_signature(X_train[:1], [y_train.iloc[0]])

    # Log hyperparameters of the whole ensemble (including nested estimator
    # params) in one batch; every scikit-learn estimator provides get_params().
    mlflow.log_params(stack.get_params())

    # Log model
    mlflow.sklearn.log_model(stack, "Stacked Models", signature=signature)



🏃 View run Stacking Classifier at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/13/runs/b0f182a911d44a469dfe3d93e85fcfe3
🧪 View experiment at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/13


In [11]:
# Text report of per-class precision/recall/F1 on the test split, using
# y_test / y_pred from the experiment cell (label 2 is the original -1 class).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.95      0.92      7979
           1       0.91      0.87      0.89      8000
           2       0.90      0.87      0.89      8000

    accuracy                           0.90     23979
   macro avg       0.90      0.90      0.90     23979
weighted avg       0.90      0.90      0.90     23979

