In [1]:
import dagshub
import mlflow

# Connect this notebook to the DagsHub repository with MLflow integration enabled.
dagshub.init(
    repo_owner='Anmol25',
    repo_name='youtube-sentiment-analysis',
    mlflow=True,
)

# Point the MLflow client at the repository's tracking endpoint.
mlflow.set_tracking_uri("https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow")

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the preprocessed comments and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="data/preprocessed/sentiments_preprocessed.csv")
df.head(n=5)

Unnamed: 0,clean_comment,category
0,cant believe modi,0
1,karachi total blackout,0
2,couldnt done year modi year increasing unemplo...,0
3,modi talk world tallest statue talk world larg...,-1
4,major announcement modi everyone waiting game ...,-1


In [4]:
# Count missing values per column before modelling.
null_counts = df.isna().sum()
null_counts

clean_comment    0
category         0
dtype: int64

In [5]:
# Select the experiment that the runs started below will be recorded under.
mlflow.set_experiment(
    experiment_name="Exp - 5 Logistic Regression Hyperparameter tuning"
)

<Experiment: artifact_location='mlflow-artifacts:/73cbb7ad86244a1fbdbf498c038a84bf', creation_time=1735124722438, experiment_id='9', last_update_time=1735124722438, lifecycle_stage='active', name='Exp - 5 Logistic Regression Hyperparameter tuning', tags={}>

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from mlflow.models.signature import infer_signature
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [7]:
import optuna

In [8]:
## Remapping outputs: -1 -> 2, while 0 and 1 stay as they are.
# `replace` (rather than `map`) leaves values without an entry untouched, so
# re-running this cell cannot turn the already-remapped 2s into NaN.
df['category'] = df['category'].replace({-1: 2})

In [9]:
# TF-IDF configuration: unigrams only, vocabulary capped at 9000 terms.
ngram_range = (1, 1)
max_features = 9000

# Stratified 80/20 split of the raw text, done before any vectorization.
train_text, test_text, y_train, y_test = train_test_split(
    df['clean_comment'],
    df['category'],
    test_size=0.2,
    random_state=42,
    stratify=df['category'],
)

# Fit the vectorizer on the training text only, then apply it to both splits.
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

### Optuna hyperparameter tuning

In [12]:
def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of an L2 logistic regression.

    Samples ``max_iter``, ``C`` and ``solver`` from the trial, builds a
    ``LogisticRegression`` with them and scores it with ``cross_val_score``
    on the notebook-level ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean accuracy over the 5 cross-validation folds.
    """
    params = {
        "penalty" : 'l2',
        "max_iter" : trial.suggest_int("max_iter", 100, 1000),
        # NOTE(review): C is sampled on a linear scale; consider log=True so
        # small values are explored as densely as large ones.
        "C" : trial.suggest_float("C", 0.01, 10),
        "solver" : trial.suggest_categorical("solver", ["lbfgs", "newton-cg", "sag", "saga"]),
        "class_weight" : None,
        "n_jobs" : -1,
        # sag/saga shuffle the data; a fixed seed makes a trial's score
        # reproducible for a given set of hyperparameters.
        "random_state" : 42,
    }

    # Define model
    model = LogisticRegression(**params)

    # Cross-validated accuracy on the training split only
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")

    return scores.mean()

In [13]:
# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every run. TPE is Optuna's default single-objective sampler, so only the
# seed changes here.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

[I 2024-12-25 17:43:24,302] A new study created in memory with name: no-name-fb694836-a29c-4034-82f3-912b3fe2827a
[I 2024-12-25 17:43:31,321] Trial 0 finished with value: 0.8759813895493489 and parameters: {'max_iter': 191, 'C': 6.489567088634855, 'solver': 'saga'}. Best is trial 0 with value: 0.8759813895493489.
[I 2024-12-25 17:43:44,177] Trial 1 finished with value: 0.8784419596181199 and parameters: {'max_iter': 404, 'C': 3.8912807881962688, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8784419596181199.
[I 2024-12-25 17:43:50,056] Trial 2 finished with value: 0.875480941013436 and parameters: {'max_iter': 338, 'C': 7.013393830152466, 'solver': 'sag'}. Best is trial 1 with value: 0.8784419596181199.
[I 2024-12-25 17:43:56,060] Trial 3 finished with value: 0.8756373338305936 and parameters: {'max_iter': 194, 'C': 6.906998425353369, 'solver': 'sag'}. Best is trial 1 with value: 0.8784419596181199.
[I 2024-12-25 17:44:01,835] Trial 4 finished with value: 0.874021278886992 and para

In [14]:
# Hyperparameters of the highest-scoring trial in the study
best_params = study.best_trial.params
best_params

{'max_iter': 545, 'C': 4.525772411740373, 'solver': 'lbfgs'}

In [24]:
# Train the final model with the tuned hyperparameters and record the run
# (tags, parameters, test metrics, confusion matrix and model) in MLflow.
with mlflow.start_run():
    # Tags that name and describe the run; sent in a single call.
    mlflow.set_tags({
        "mlflow.runName": "LoR HP with l2",
        "experiment_type": "LoR HP Tuning",
        "model_type": "LoR",
        "description": "LoR HP Tuning with only L2 Reg",
    })

    # random_state only matters for the shuffling solvers (sag/saga); fixing
    # it keeps the final fit reproducible whichever solver was selected.
    model = LogisticRegression(**best_params, n_jobs=-1, random_state=42)

    # Log vectorizer parameters
    mlflow.log_params({
        "vectorizer_type": "TF-IDF",
        "ngram_range": ngram_range,
        "vectorizer_max_features": max_features,
        "Algo_name": "LoR",
    })

    model.fit(X_train, y_train)

    # Make predictions on the held-out split
    y_pred = model.predict(X_test)

    # Accuracy plus every per-class / averaged entry of the classification
    # report, flattened to "<label>_<metric>" and logged in one batch instead
    # of one remote call per value.
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    report_metrics = {
        f"{label}_{metric}": value
        for label, metrics in classification_rep.items()
        if isinstance(metrics, dict)  # skips the scalar "accuracy" entry
        for metric, value in metrics.items()
    }
    mlflow.log_metrics({"accuracy": accuracy, **report_metrics})

    # Log confusion matrix as an image artifact
    conf_matrix = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(f"Confusion Matrix: TF-IDF Unigrams, max_features={max_features}")
    fig.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png")
    plt.close(fig)

    # Infer the signature from a real input row and the model's own output
    # for it, so the output schema reflects what predict() returns.
    signature_input = X_train[:1]
    signature = infer_signature(signature_input, model.predict(signature_input))

    # Log all estimator hyperparameters in one batch
    mlflow.log_params(model.get_params())

    # Log Model
    mlflow.sklearn.log_model(sk_model=model, artifact_path="LoR with Best params with l2 Reg", signature=signature)




🏃 View run LoR HP with l2 at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/9/runs/17d1e39f47de4e44a4b2bbb68d3bbba1
🧪 View experiment at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/9


In [25]:
# Plain-text per-class precision / recall / F1 on the test split
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.87      0.95      0.91      7979
           1       0.90      0.85      0.88      8000
           2       0.89      0.85      0.87      8000

    accuracy                           0.88     23979
   macro avg       0.89      0.88      0.88     23979
weighted avg       0.89      0.88      0.88     23979

