In [1]:
import dagshub
dagshub.init(repo_owner='Anmol25', repo_name='youtube-sentiment-analysis', mlflow=True)

import mlflow
mlflow.set_tracking_uri("https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow")

In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("data/preprocessed/sentiments_preprocessed.csv")
df.head()

Unnamed: 0,clean_comment,category
0,cant believe modi,0
1,karachi total blackout,0
2,couldnt done year modi year increasing unemplo...,0
3,modi talk world tallest statue talk world larg...,-1
4,major announcement modi everyone waiting game ...,-1


In [4]:
df.isna().sum()

clean_comment    0
category         0
dtype: int64

In [5]:
mlflow.set_experiment("Exp - 5 LightGBM Hyperparameter tuning")

<Experiment: artifact_location='mlflow-artifacts:/bae3345420a94e8685b29c3c3bfeb22d', creation_time=1735132803870, experiment_id='10', last_update_time=1735132803870, lifecycle_stage='active', name='Exp - 5 LightGBM Hyperparameter tuning', tags={}>

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from mlflow.models.signature import infer_signature
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [7]:
import optuna

In [8]:
## Remaping outputs
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})

In [9]:
ngram_range = (1, 1)  # Unigram setting
max_features = 9000

# Vectorization using TF-IDF with 9000 max features
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)

X_train, X_test, y_train, y_test = train_test_split(df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category'])

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

### Optuna LightGBM Hyperparameter tuning

In [10]:
def objective(trial):
    params = {
        "boosting_type": "gbdt",
        "n_jobs": -1,
        "max_depth": trial.suggest_int("max_depth", 2, 30),
        "n_estimators": trial.suggest_int("n_estimators", 100,1000),
        "objective": "multiclass",
        "num_class": 3,
        "learning_rate": trial.suggest_float("learning_rate", 0.1,1),
        "subsample": trial.suggest_float("subsample", 0.1, 1),
        "verbosity": -1
    }

    model = LGBMClassifier(**params)

    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)

    return scores.mean()

In [11]:
study = optuna.create_study(direction="maximize")
study.optimize(objective,n_trials=10)

[I 2024-12-26 12:11:29,188] A new study created in memory with name: no-name-839323ea-cc8c-431c-9a88-15d6cad0d0c5
[I 2024-12-26 12:14:16,890] Trial 0 finished with value: 0.8563179123367217 and parameters: {'max_depth': 22, 'n_estimators': 407, 'learning_rate': 0.9943785933466827, 'subsample': 0.9274566443588061}. Best is trial 0 with value: 0.8563179123367217.
[I 2024-12-26 12:17:04,808] Trial 1 finished with value: 0.885448295647506 and parameters: {'max_depth': 10, 'n_estimators': 579, 'learning_rate': 0.26086410951144073, 'subsample': 0.16308448134125247}. Best is trial 1 with value: 0.885448295647506.
[I 2024-12-26 12:19:53,448] Trial 2 finished with value: 0.8673068571291573 and parameters: {'max_depth': 7, 'n_estimators': 807, 'learning_rate': 0.9288129702349207, 'subsample': 0.772645336851455}. Best is trial 1 with value: 0.885448295647506.
[I 2024-12-26 12:20:15,853] Trial 3 finished with value: 0.8202433831352856 and parameters: {'max_depth': 2, 'n_estimators': 136, 'learning

In [12]:
# Extract the best hyperparameters
best_params = study.best_params
best_params

{'max_depth': 10,
 'n_estimators': 579,
 'learning_rate': 0.26086410951144073,
 'subsample': 0.16308448134125247}

In [13]:
import json
with open("best_params.json", "w") as file:
    json.dump(best_params, file)

In [14]:
with mlflow.start_run():
    mlflow.set_tag("mlflow.runName", "LightGBM HP Tuning using Optuna 10 Trials")
    mlflow.set_tag("experiment_type", "LightGBM HP Tuning")
    mlflow.set_tag("model_type", "LGBMClassifier")

    # Add a description
    mlflow.set_tag("description", "LightGBM HP Tuning with Optuna 10 trials")

    model = LGBMClassifier(**best_params,boosting_type="gbdt",n_jobs=-1,objective="multiclass",num_class=3,verbosity=-1)
    # Log vectorizer parameters
    mlflow.log_param("vectorizer_type", "TF-IDF")
    mlflow.log_param("ngram_range", ngram_range)
    mlflow.log_param("vectorizer_max_features", max_features)
    mlflow.log_param("Algo_name", "LightGBM")

    # Train model
    model.fit(X_train, y_train)

    # Make predictions and log metrics
    y_pred = model.predict(X_test)

    # Log accuracy
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)

    # Log classification report
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    for label, metrics in classification_rep.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", value)

    # Log confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"Confusion Matrix: LightGBM HP Tuning, max_features={max_features}")
    plt.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png")
    plt.close()

    ## Create model_signature
    signature = infer_signature(X_train[:1], [y_train.iloc[0]])

    # Log hyperparameters
    try:
        hyperparameters = model.get_params()
        for param_name, param_value in hyperparameters.items():
            mlflow.log_param(param_name, param_value)
    except AttributeError:
        mlflow.log_param("hyperparameters", "Not available for this model")

    # Log best params
    mlflow.log_artifact("best_params.json")

    mlflow.lightgbm.log_model(lgb_model=model, artifact_path=f"LightGBM best params", signature=signature)
    mlflow.end_run()



🏃 View run LightGBM HP Tuning using Optuna 10 Trials at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/10/runs/81039fc58226401786c962f242fdb7a9
🧪 View experiment at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/10


In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.96      0.91      7979
           1       0.91      0.86      0.88      8000
           2       0.90      0.86      0.88      8000

    accuracy                           0.89     23979
   macro avg       0.89      0.89      0.89     23979
weighted avg       0.89      0.89      0.89     23979

