In [1]:
import dagshub
import mlflow

# Initialise the DagsHub client for this repository with its MLflow integration enabled.
dagshub.init(
    repo_owner="Anmol25",
    repo_name="youtube-sentiment-analysis",
    mlflow=True,
)

# Point MLflow at the repository's remote tracking server so every run below is recorded there.
mlflow.set_tracking_uri(
    "https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow"
)

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the preprocessed comments dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/preprocessed/sentiments_preprocessed.csv")
df.head(n=5)

Unnamed: 0,clean_comment,category
0,cant believe modi,0
1,karachi total blackout,0
2,couldnt done year modi year increasing unemplo...,0
3,modi talk world tallest statue talk world larg...,-1
4,major announcement modi everyone waiting game ...,-1


In [4]:
# Number of rows (comments) in the dataset.
df.shape[0]

119892

In [5]:
# Count missing values per column.
df.isnull().sum()

clean_comment    0
category         0
dtype: int64

In [6]:
# Select the MLflow experiment that all runs in this notebook are logged under.
mlflow.set_experiment(experiment_name="Exp - 4 Model Selection")

<Experiment: artifact_location='mlflow-artifacts:/8abd5b97755b418e9ba19f56ccae2efd', creation_time=1735110913308, experiment_id='8', last_update_time=1735110913308, lifecycle_stage='active', name='Exp - 4 Model Selection', tags={}>

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [8]:
## Import model libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [9]:
## Remapping outputs: relabel the -1 class as 2 so the classes are 0, 1 and 2
# `replace` only rewrites -1, so re-running this cell leaves already-remapped labels
# untouched. The earlier `map({-1: 2, 0: 0, 1: 1})` was not idempotent: on a second
# run the value 2 is not a key of the mapping and would silently become NaN.
df['category'] = df['category'].replace({-1: 2})

# Fail fast if the column holds anything other than the three expected class ids.
assert df['category'].isin([0, 1, 2]).all(), "unexpected labels in 'category'"

### Experiment

In [10]:
# Model-selection experiment: train each candidate classifier on the same TF-IDF
# unigram features and record its parameters, metrics, confusion matrix and fitted
# model as a separate MLflow run.

ngram_range = (1, 1)  # Unigram setting
max_features = 9000
random_state = 42  # shared seed for the split and for the unseeded random forest

# TF-IDF vectorizer with the vocabulary capped at `max_features` terms
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)

# Stratified 80/20 split on the raw text so both partitions keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'],
    df['category'],
    test_size=0.2,
    random_state=random_state,
    stratify=df['category'],
)

# Fit the vectorizer on the training text only, then reuse it unchanged on the test text
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# models
models = {
    "XGBClassifier": XGBClassifier(),
    "LGBMClassifier": LGBMClassifier(n_jobs=-1),
    "CatBoostClassifier": CatBoostClassifier(verbose=0),
    # Seeded: an unseeded forest draws different bootstrap samples on every run,
    # so its score would not be repeatable. All other settings stay at their defaults.
    "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
    "SVC": SVC(),
    "LogisticRegression": LogisticRegression(),
    "MultinomialNB": MultinomialNB()
}

# Models with a dedicated MLflow flavor; anything not listed is logged via mlflow.sklearn
model_flavors = {
    "XGBClassifier": mlflow.xgboost,
    "LGBMClassifier": mlflow.lightgbm,
    "CatBoostClassifier": mlflow.catboost,
}

for model_name, model in models.items():
    with mlflow.start_run():
        mlflow.set_tag("mlflow.runName", f"{model_name}_TFidf_{max_features}_Unigram")
        mlflow.set_tag("experiment_type", "Model_Selection")
        mlflow.set_tag("model_type", model_name)

        # Add a description
        mlflow.set_tag("description", f"{model_name} with default settings with tfidf {max_features} features unigram")

        # Log vectorizer parameters in one batched request
        mlflow.log_params({
            "vectorizer_type": "TF-IDF",
            "ngram_range": ngram_range,
            "vectorizer_max_features": max_features,
            "Algo_name": model_name,
        })

        # Train model
        model.fit(X_train, y_train)

        # Make predictions on the held-out split
        y_pred = model.predict(X_test)

        # Collect accuracy plus every per-class / averaged entry of the classification
        # report, then send them in a single batched call instead of one request each
        accuracy = accuracy_score(y_test, y_pred)
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        run_metrics = {"accuracy": accuracy}
        for label, metrics in classification_rep.items():
            # Scalar entries (the report's overall accuracy) are skipped; it is logged above
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    run_metrics[f"{label}_{metric}"] = value
        mlflow.log_metrics(run_metrics)

        # Log confusion matrix as an image artifact
        conf_matrix = confusion_matrix(y_test, y_pred)
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", ax=ax)
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title(f"Confusion Matrix: TF-IDF Unigrams, max_features={max_features}")
            fig.savefig("confusion_matrix.png")
            mlflow.log_artifact("confusion_matrix.png")
        finally:
            # Always release the figure, even if saving or uploading fails
            plt.close(fig)

        ## Create model_signature
        # The output schema comes from an actual prediction rather than a raw label,
        # so it reflects what this particular model really returns
        signature = infer_signature(X_test[:1], y_pred[:1])

        # Log hyperparameters in one batched request. Only `get_params` is guarded, so
        # an AttributeError raised elsewhere is not mistaken for "no hyperparameters".
        try:
            hyperparameters = model.get_params()
        except AttributeError:
            mlflow.log_param("hyperparameters", "Not available for this model")
        else:
            mlflow.log_params(hyperparameters)

        # Log model with its dedicated flavor, falling back to the scikit-learn flavor
        flavor = model_flavors.get(model_name, mlflow.sklearn)
        flavor.log_model(
            model,
            artifact_path=f"{model_name}_tfidf_Unigrams_{max_features}",
            signature=signature,
        )



🏃 View run XGBClassifier_TFidf_9000_Unigram at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8/runs/0ed027869d314956b778bb996125a7fa
🧪 View experiment at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.454672 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 262538
[LightGBM] [Info] Number of data points in the train set: 95913, number of used features: 5872
[LightGBM] [Info] Start training from score -1.100271
[LightGBM] [Info] Start training from score -1.097799
[LightGBM] [Info] Start training from score -1.097768




🏃 View run LGBMClassifier_TFidf_9000_Unigram at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8/runs/de50fe3cb42c42569460270c9acb95fd
🧪 View experiment at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8




🏃 View run CatBoostClassifier_TFidf_9000_Unigram at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8/runs/e3081a6488e54920bcfae451aa9a88fb
🧪 View experiment at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8




🏃 View run RandomForestClassifier_TFidf_9000_Unigram at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8/runs/4e29d870db1f43c2b0c3d5c8c9f62023
🧪 View experiment at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8




🏃 View run SVC_TFidf_9000_Unigram at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8/runs/7586f9dfc9ec4f908715b4a1d8bc3705
🧪 View experiment at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8




🏃 View run LogisticRegression_TFidf_9000_Unigram at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8/runs/8be9476898c54cc395a62b2b86a3b273
🧪 View experiment at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8




🏃 View run MultinomialNB_TFidf_9000_Unigram at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8/runs/8d2f3ba6bd5f4bb3a475aa0126a460a5
🧪 View experiment at: https://dagshub.com/Anmol25/youtube-sentiment-analysis.mlflow/#/experiments/8


#### Best Performing models with time taken for training

<table>
    <tr>
    <th>Model Name</th>
    <th>Accuracy</th>
    <th>Train Duration</th>
    </tr>
    <tr>
        <td>SVC</td>
        <td>0.8783</td>
        <td>58.5min</td>
    </tr>
    <tr>
        <td>Logistic Regression</td>
        <td>0.8764</td>
        <td>1.2min</td>
    </tr>
    <tr>
        <td>CatBoostClassifier</td>
        <td>0.8614</td>
        <td>7.6min</td>
    </tr>
    <tr>
        <td>LightGBM</td>
        <td>0.8585</td>
        <td>1.0min</td>
    </tr>
    <tr>
        <td> RandomForestClassifier </td>
        <td> 0.8302 </td>
        <td> 4.3min </td>
    </tr>
    <tr>
        <td>XGBClassifier</td>
        <td>0.8194</td>
        <td>2.3min</td>
    </tr>
    <tr>
        <td>MultinomialNB</td>
        <td>0.7495</td>
        <td>38.5s</td>
    </tr>
</table>

SVC clearly took far longer to train than any other algorithm, but it is also the best-performing one. However, the accuracy gap between Logistic Regression and SVC is very small (0.8764 vs. 0.8783), and Logistic Regression trains in a small fraction of the time (1.2 min vs. 58.5 min).

Therefore, we will perform hyperparameter tuning on the following algorithms:
- Logistic Regression
- LightGBM
- CatBoostClassifier

Hyperparameter tuning on SVC is also possible, but it would be very computationally expensive, so we may try it only after tuning the algorithms above.