In [None]:
!pip install mlflow optuna dagshub

Collecting mlflow
  Downloading mlflow-2.17.0-py3-none-any.whl.metadata (29 kB)
Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting dagshub
  Downloading dagshub-0.3.40-py3-none-any.whl.metadata (11 kB)
Collecting mlflow-skinny==2.17.0 (from mlflow)
  Downloading mlflow_skinny-2.17.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.17.0->mlflow)
  Downloading databricks_sdk-0.36.0-py3-none-any.whl.metadata (38 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.met

In [None]:
import dagshub

# Initialise the DagsHub client for this repository with its MLflow integration switched on.
dagshub.init(
    repo_owner='dakshvandanarathi',
    repo_name='YT-Sentiment-Analyser',
    mlflow=True,
)

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=6c36bdaf-c950-47a0-a6f3-995ec7d3e6ca&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=cda0a74f7c9842732b96ff87c2ac1198cca63883d1ddf33be4dd1310edc40d8a




In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
from imblearn.over_sampling import ADASYN
import mlflow
import mlflow.sklearn
import optuna
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Select the MLflow experiment that all runs below are logged under (set or create it by name)
EXPERIMENT_NAME = "Exp 4 - ML Algos with HP Tuning"
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/498f75c0643c4aa2b0e490aca486b20a', creation_time=1729654450594, experiment_id='5', last_update_time=1729654450594, lifecycle_stage='active', name='Exp 4 - ML Algos with HP Tuning', tags={}>

In [None]:
# Load the preprocessed dataset and discard every row that contains a missing value
DATA_PATH = '/content/preprocessed_data.csv'
df = pd.read_csv(DATA_PATH).dropna()
df.shape

(36662, 5)

### Vectorization and Resampling

In [None]:
# Define a function to vectorize the data using TF-IDF
def vectorize_data(X_train, X_val, X_test, max_features, ngram_range,
                   text_col='comment',
                   extra_cols=('word_count', 'char_count', 'avg_word_length')):
    """Build dense TF-IDF + extra-feature matrices for the three data splits.

    The TF-IDF vectorizer is fitted on the training split only and then
    applied unchanged to the validation and test splits.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Splits containing ``text_col`` and every column named in ``extra_cols``.
    max_features : int
        Forwarded to ``TfidfVectorizer(max_features=...)``.
    ngram_range : tuple
        Forwarded to ``TfidfVectorizer(ngram_range=...)``.
    text_col : str, default 'comment'
        Column holding the raw text to vectorize.
    extra_cols : sequence of str, default ('word_count', 'char_count', 'avg_word_length')
        Columns appended, in this order, to the right of the TF-IDF columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train_combined, X_val_combined, X_test_combined)``; each is the
        dense TF-IDF matrix horizontally stacked with the extra columns.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    extra_cols = list(extra_cols)

    def _with_extras(frame, tfidf):
        # Densify the sparse TF-IDF output and append the extra columns on the right.
        return np.hstack([tfidf.toarray(), frame[extra_cols].values])

    # fit_transform on train only; val/test reuse the fitted vocabulary and IDF weights.
    train_tfidf = vectorizer.fit_transform(X_train[text_col])
    val_tfidf = vectorizer.transform(X_val[text_col])
    test_tfidf = vectorizer.transform(X_test[text_col])

    return (
        _with_extras(X_train, train_tfidf),
        _with_extras(X_val, val_tfidf),
        _with_extras(X_test, test_tfidf),
    )

In [None]:
# TF-IDF settings handed to vectorize_data below
max_features = 1006
ngram_range = (1, 2)

# Model inputs (comment text plus three extra columns) and the target column
feature_columns = ['comment', 'word_count', 'char_count', 'avg_word_length']
X = df[feature_columns]
y = df['category']

# First cut: 75% train / 25% held out, stratified on the target
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
# Second cut: 40% of the held-out part becomes the test set, the remaining 60% the validation set
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.4, random_state=42, stratify=y_temp
)

# Vectorize the three splits
X_train_combined, X_val_combined, X_test_combined = vectorize_data(
    X_train, X_val, X_test, max_features, ngram_range
)

In [None]:
# Oversample with ADASYN; only the training matrix and labels are passed to the sampler
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train_combined, y_train)

### Helper Functions

In [None]:
# Score a fitted model on held-out data
def evaluate_model(model, X_val, y_val):
    """Return ``(macro F1, accuracy)`` of ``model`` on ``(X_val, y_val)``.

    ``model`` only needs a ``predict`` method; note the order of the returned
    pair is F1 first, accuracy second.
    """
    predictions = model.predict(X_val)
    return (
        f1_score(y_val, predictions, average='macro'),
        accuracy_score(y_val, predictions),
    )

### Random Forest

In [None]:
# Define the Optuna objective function
def objective(trial):
    """Optuna objective for the Random Forest search.

    Samples one hyperparameter configuration from ``trial``, fits a
    ``RandomForestClassifier`` on the resampled training data and scores it
    on the validation split.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` on the validation set, in that order (the reverse
        of what ``evaluate_model`` returns).
    """
    # Hyperparameters to optimize
    n_estimators = trial.suggest_int("n_estimators", 50, 500, step=10)
    max_depth = trial.suggest_int("max_depth", 5, 51, step=2)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    # Local is deliberately NOT called `max_features`, so it cannot be confused with the
    # notebook-level `max_features` used for the TF-IDF vectorizer. The Optuna parameter
    # keeps its name because trial.params is later unpacked into the estimator.
    split_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])

    # Initialize the Random Forest model with the suggested hyperparameters
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=split_features,
        random_state=42,
        n_jobs=-1,  # use all available cores for fit/predict; random_state keeps the forest deterministic
    )

    # Fit the model on the resampled training data
    model.fit(X_resampled, y_resampled)

    # Evaluate the model on the validation set
    f1, accuracy = evaluate_model(model, X_val_combined, y_val)

    return accuracy, f1

In [None]:
# Run Optuna optimization
# Multi-objective study: both values returned by objective() (accuracy, F1) are maximized.
# NSGAIISampler is Optuna's default for multi-objective studies; it is passed explicitly
# only to seed it, so the sequence of suggested hyperparameters is repeatable.
study = optuna.create_study(
    directions=["maximize", "maximize"],
    study_name="Random_Forest_Optimization",
    sampler=optuna.samplers.NSGAIISampler(seed=42),
)
study.optimize(objective, n_trials=100)

[I 2024-10-23 06:06:06,894] A new study created in memory with name: Random_Forest_Optimization
[I 2024-10-23 06:06:44,424] Trial 0 finished with values: [0.7448627023095108, 0.720028018969344] and parameters: {'n_estimators': 100, 'max_depth': 51, 'min_samples_split': 18, 'min_samples_leaf': 4, 'max_features': 'sqrt'}.
[I 2024-10-23 06:07:30,683] Trial 1 finished with values: [0.7024913620658302, 0.6762297720572708] and parameters: {'n_estimators': 390, 'max_depth': 37, 'min_samples_split': 12, 'min_samples_leaf': 8, 'max_features': 'log2'}.
[I 2024-10-23 06:08:43,265] Trial 2 finished with values: [0.7041280232769594, 0.6760985467663584] and parameters: {'n_estimators': 310, 'max_depth': 21, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}.
[I 2024-10-23 06:09:05,402] Trial 3 finished with values: [0.7021276595744681, 0.6742923489673277] and parameters: {'n_estimators': 190, 'max_depth': 33, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'log2'}.

In [None]:
# Among the Pareto-optimal trials, keep the one with the highest first objective value
# (values[0] is the first element returned by the objective, i.e. validation accuracy).
best_trial = max(study.best_trials, key=lambda t: t.values[0])

with mlflow.start_run() as run:
    mlflow.set_tag("mlflow.runName", "Random Forest")
    mlflow.set_tag("resampling_technique", "Adasyn")
    mlflow.set_tag("vectorizer_type", "TF-IDF")

    # Log best trial parameters
    best_params = best_trial.params
    mlflow.log_params(best_params)

    # Log algorithm name as a parameter, as the other model runs in this notebook do
    mlflow.log_param("algo_name", "RandomForestClassifier")

    # Re-create the model from the best trial parameters and train it on the resampled training data
    model = RandomForestClassifier(random_state=42, **best_params)
    model.fit(X_resampled, y_resampled)

    # Predictions on the test set
    y_test_pred = model.predict(X_test_combined)

    # Log classification metrics
    classification_rep = classification_report(y_test, y_test_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_test_pred)

    # Log accuracy
    mlflow.log_metric("accuracy", accuracy)

    # Log each per-class / averaged metric from the classification report
    # (scalar entries such as the report's own "accuracy" are skipped)
    for label, metrics in classification_rep.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", value)

    # Generate the confusion matrix with an explicit label order so the heatmap axes
    # show the class labels instead of positional indices
    class_labels = model.classes_
    conf_matrix = confusion_matrix(y_test, y_test_pred, labels=class_labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix - Random Forest")

    # Save and log the confusion matrix plot
    confusion_matrix_filename = "confusion_matrix.png"
    plt.savefig(confusion_matrix_filename)
    mlflow.log_artifact(confusion_matrix_filename)
    plt.close()

2024/10/23 08:37:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: https://dagshub.com/dakshvandanarathi/YT-Sentiment-Analyser.mlflow/#/experiments/5/runs/9494a0dddc9b4b68819756b55e31d56c.
2024/10/23 08:37:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dakshvandanarathi/YT-Sentiment-Analyser.mlflow/#/experiments/5.


### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Optuna objective for the Logistic Regression search
def objective_logistic_regression(trial):
    """Fit one sampled Logistic Regression and score it on the validation split.

    Samples the penalty type and ``C`` (log-uniform between 1e-5 and 1e5),
    trains a liblinear-solved model on the resampled training data and
    returns ``(accuracy, f1)`` measured on the validation set.
    """
    # Sampled in the same order as the study records them: penalty first, then C
    sampled = {
        "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
        "C": trial.suggest_float("C", 1e-5, 1e5, log=True),
    }

    classifier = LogisticRegression(solver='liblinear', random_state=42, **sampled)
    classifier.fit(X_resampled, y_resampled)

    # evaluate_model yields (f1, accuracy); the study expects accuracy first
    f1, accuracy = evaluate_model(classifier, X_val_combined, y_val)
    return accuracy, f1

In [None]:
# Run Optuna optimization for Logistic Regression
# Multi-objective study: both values returned by the objective (accuracy, F1) are maximized.
# NSGAIISampler is Optuna's default for multi-objective studies; it is passed explicitly
# only to seed it, so the sequence of suggested hyperparameters is repeatable.
study_logistic = optuna.create_study(
    directions=["maximize", "maximize"],
    study_name="Logistic_Regression_Optimization",
    sampler=optuna.samplers.NSGAIISampler(seed=42),
)
study_logistic.optimize(objective_logistic_regression, n_trials=50)

[I 2024-10-23 09:47:24,588] A new study created in memory with name: Logistic_Regression_Optimization
[I 2024-10-23 09:47:45,452] Trial 0 finished with values: [0.7995999272595017, 0.7819231904644509] and parameters: {'penalty': 'l1', 'C': 35617.10143195622}.
[I 2024-10-23 09:47:47,397] Trial 1 finished with values: [0.7983269685397345, 0.7806265024162333] and parameters: {'penalty': 'l2', 'C': 4213.02938052691}.
[I 2024-10-23 09:47:49,842] Trial 2 finished with values: [0.7986906710310966, 0.7810631691581748] and parameters: {'penalty': 'l2', 'C': 30262.263613967545}.
[I 2024-10-23 09:48:01,532] Trial 3 finished with values: [0.7995999272595017, 0.7819231904644509] and parameters: {'penalty': 'l1', 'C': 5547.831160681699}.
[I 2024-10-23 09:48:13,988] Trial 4 finished with values: [0.8001454809965448, 0.7824172208549541] and parameters: {'penalty': 'l1', 'C': 40.64299647929848}.
[I 2024-10-23 09:48:22,790] Trial 5 finished with values: [0.8081469358065103, 0.7907890135676885] and param

In [None]:
# Among the Pareto-optimal trials, keep the one with the highest first objective value
# (values[0] is the first element returned by the objective, i.e. validation accuracy).
best_trial = max(study_logistic.best_trials, key=lambda t: t.values[0])

with mlflow.start_run() as run:
    mlflow.set_tag("mlflow.runName", "Logistic Regression")
    mlflow.set_tag("resampling_technique", "Adasyn")
    mlflow.set_tag("vectorizer_type", "TF-IDF")

    # Log best trial parameters
    best_params = best_trial.params
    mlflow.log_params(best_params)

    # Log algorithm name as a parameter
    mlflow.log_param("algo_name", "LogisticRegression")

    # Re-create the model from the best trial parameters and train it on the resampled training data
    model = LogisticRegression(solver='liblinear', random_state=42, **best_params)
    model.fit(X_resampled, y_resampled)

    # Predictions on the test set
    y_test_pred = model.predict(X_test_combined)

    # Log classification metrics
    classification_rep = classification_report(y_test, y_test_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_test_pred)

    # Log accuracy
    mlflow.log_metric("accuracy", accuracy)

    # Log each per-class / averaged metric from the classification report
    # (scalar entries such as the report's own "accuracy" are skipped)
    for label, metrics in classification_rep.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", value)

    # Generate the confusion matrix with an explicit label order so the heatmap axes
    # show the class labels instead of positional indices
    class_labels = model.classes_
    conf_matrix = confusion_matrix(y_test, y_test_pred, labels=class_labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix - Logistic Regression")

    # Save and log the confusion matrix plot
    confusion_matrix_filename = "confusion_matrix.png"
    plt.savefig(confusion_matrix_filename)
    mlflow.log_artifact(confusion_matrix_filename)
    plt.close()

2024/10/23 09:51:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: https://dagshub.com/dakshvandanarathi/YT-Sentiment-Analyser.mlflow/#/experiments/5/runs/5412eef1abdd4a4ea0f79fde0b99d8de.
2024/10/23 09:51:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dakshvandanarathi/YT-Sentiment-Analyser.mlflow/#/experiments/5.


### Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Define the Optuna objective function
# NOTE: this cell rebinds the notebook-level names `objective` and `study` that the
# Random Forest section defined earlier; run cells top to bottom to avoid mixing them up.
def objective(trial):
    """Optuna objective for the Multinomial Naive Bayes search.

    Samples ``alpha`` and ``fit_prior``, fits a ``MultinomialNB`` on the
    resampled training data and returns ``(accuracy, f1)`` measured on the
    validation split.
    """
    # Hyperparameters to optimize
    alpha = trial.suggest_float("alpha", 1e-4, 1.0)  # Additive smoothing parameter
    fit_prior = trial.suggest_categorical("fit_prior", [True, False])  # Whether to learn class prior probabilities

    # Initialize the Naive Bayes model with the suggested hyperparameters
    model = MultinomialNB(
        alpha=alpha,
        fit_prior=fit_prior,
    )

    # Fit the model on the resampled training data
    model.fit(X_resampled, y_resampled)

    # Evaluate the model on the validation set
    f1, accuracy = evaluate_model(model, X_val_combined, y_val)

    return accuracy, f1

# Run Optuna optimization
# NSGAIISampler is Optuna's default for multi-objective studies; it is passed explicitly
# only to seed it, so the sequence of suggested hyperparameters is repeatable.
study = optuna.create_study(
    directions=["maximize", "maximize"],
    study_name="Naive_Bayes_Optimization",
    sampler=optuna.samplers.NSGAIISampler(seed=42),
)
study.optimize(objective, n_trials=60)

[I 2024-10-24 08:25:29,765] A new study created in memory with name: Naive_Bayes_Optimization
[I 2024-10-24 08:25:30,214] Trial 0 finished with values: [0.557919621749409, 0.5191557622053091] and parameters: {'alpha': 0.7080491493292391, 'fit_prior': False}.
[I 2024-10-24 08:25:30,632] Trial 1 finished with values: [0.5664666302964175, 0.5298811868260508] and parameters: {'alpha': 0.4526476824633391, 'fit_prior': False}.
[I 2024-10-24 08:25:31,388] Trial 2 finished with values: [0.574286233860702, 0.53997252314588] and parameters: {'alpha': 0.22093838970238902, 'fit_prior': False}.
[I 2024-10-24 08:25:32,137] Trial 3 finished with values: [0.5686488452445899, 0.5323770101194776] and parameters: {'alpha': 0.4564962332724106, 'fit_prior': True}.
[I 2024-10-24 08:25:32,631] Trial 4 finished with values: [0.5686488452445899, 0.5326604776987353] and parameters: {'alpha': 0.37260180593768427, 'fit_prior': False}.
[I 2024-10-24 08:25:33,073] Trial 5 finished with values: [0.5702855064557192, 

In [None]:
# Among the Pareto-optimal trials, keep the one with the highest first objective value
# (values[0] is the first element returned by the objective, i.e. validation accuracy).
best_trial = max(study.best_trials, key=lambda t: t.values[0])

with mlflow.start_run() as run:
    # Run name, algo_name and plot title previously read "Mutlinomial"; spelled correctly
    # here so the run can be found and filtered by name in the MLflow UI.
    mlflow.set_tag("mlflow.runName", "Multinomial Naive Bayes")
    mlflow.set_tag("resampling_technique", "Adasyn")
    mlflow.set_tag("vectorizer_type", "TF-IDF")

    # Log best trial parameters
    best_params = best_trial.params
    mlflow.log_params(best_params)

    # Log algorithm name as a parameter
    mlflow.log_param("algo_name", "MultinomialNB")

    # Re-create the model from the best trial parameters and train it on the resampled training data
    model = MultinomialNB(**best_params)
    model.fit(X_resampled, y_resampled)

    # Predictions on the test set
    y_test_pred = model.predict(X_test_combined)

    # Log classification metrics
    classification_rep = classification_report(y_test, y_test_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_test_pred)

    # Log accuracy
    mlflow.log_metric("accuracy", accuracy)

    # Log each per-class / averaged metric from the classification report
    # (scalar entries such as the report's own "accuracy" are skipped)
    for label, metrics in classification_rep.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", value)

    # Generate the confusion matrix with an explicit label order so the heatmap axes
    # show the class labels instead of positional indices
    class_labels = model.classes_
    conf_matrix = confusion_matrix(y_test, y_test_pred, labels=class_labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix - Multinomial Naive Bayes")

    # Save and log the confusion matrix plot
    confusion_matrix_filename = "confusion_matrix.png"
    plt.savefig(confusion_matrix_filename)
    mlflow.log_artifact(confusion_matrix_filename)
    plt.close()

2024/10/24 08:27:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run Mutlinomial Naive Bayes at: https://dagshub.com/dakshvandanarathi/YT-Sentiment-Analyser.mlflow/#/experiments/5/runs/4c600b472b3c41a3822ba1719be414f1.
2024/10/24 08:27:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/dakshvandanarathi/YT-Sentiment-Analyser.mlflow/#/experiments/5.


### SVM

In [None]:
from sklearn.svm import SVC

# Optuna objective for the SVM search (rebinds the notebook-level name `objective`)
def objective(trial):
    """Fit one sampled SVC and score it on the validation split.

    Samples ``C`` (log-uniform between 1e-4 and 1e2) and the kernel type,
    trains the classifier on the resampled training data and returns
    ``(accuracy, f1)`` measured on the validation set.
    """
    # Sampled in the same order as the study records them: C first, then kernel
    svm_params = {
        "C": trial.suggest_float("C", 1e-4, 1e2, log=True),
        "kernel": trial.suggest_categorical("kernel", ["linear", "rbf", "poly"]),
    }

    classifier = SVC(random_state=42, **svm_params)
    classifier.fit(X_resampled, y_resampled)

    # evaluate_model yields (f1, accuracy); the study expects accuracy first
    f1, accuracy = evaluate_model(classifier, X_val_combined, y_val)
    return accuracy, f1

In [None]:
# Run Optuna optimization
# Multi-objective study: both values returned by objective() (accuracy, F1) are maximized.
# NSGAIISampler is Optuna's default for multi-objective studies; it is passed explicitly
# only to seed it, so the sequence of suggested hyperparameters is repeatable.
study_svm = optuna.create_study(
    directions=["maximize", "maximize"],
    study_name="SVM_Optimization",
    sampler=optuna.samplers.NSGAIISampler(seed=42),
)
study_svm.optimize(objective, n_trials=50)

[I 2024-10-24 08:28:46,148] A new study created in memory with name: SVM_Optimization
[I 2024-10-24 08:55:30,197] Trial 0 finished with values: [0.4302600472813239, 0.20055096418732785] and parameters: {'C': 0.0002264060589291917, 'kernel': 'rbf'}.
[I 2024-10-24 09:20:15,927] Trial 1 finished with values: [0.3800691034733588, 0.22711097228423652] and parameters: {'C': 0.0001540325763345921, 'kernel': 'poly'}.
[I 2024-10-24 09:52:30,732] Trial 2 finished with values: [0.4057101291143844, 0.2631982937338477] and parameters: {'C': 0.49191525462186, 'kernel': 'poly'}.
[I 2024-10-24 10:17:09,315] Trial 3 finished with values: [0.4813602473176941, 0.4464928778612219] and parameters: {'C': 11.538259929917475, 'kernel': 'rbf'}.
