In [2]:
!pip install mlflow  optuna  lightgbm dagshub

Collecting mlflow
  Downloading mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting dagshub
  Downloading dagshub-0.5.10-py3-none-any.whl.metadata (12 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading databricks_sdk-0.58.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Download

In [3]:
import dagshub
import mlflow
dagshub.init(repo_owner='AMR-ITH', repo_name='yt-comment-analyzer', mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=ff374176-088f-428f-8cc1-49d16ac115fe&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=cd5374497f63b9101541a77bbd736ffce78ff4bd72354bf7de960802897880de




Output()

In [12]:
# Set or create an experiment
mlflow.set_experiment("exp5 Best model HP Tuning")

<Experiment: artifact_location='mlflow-artifacts:/72b4a6680da6444f8672000d54583b16', creation_time=1752307675190, experiment_id='10', last_update_time=1752307675190, lifecycle_stage='active', name='exp5 Best model HP Tuning', tags={}>

In [13]:
import pandas as pd

df = pd.read_csv('/content/reddit_preprocessing.csv').dropna()
df.shape

(36662, 2)

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import mlflow
import mlflow.sklearn
import optuna
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [15]:
df = pd.read_csv("/content/reddit_preprocessing.csv").dropna()
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [16]:
df.shape

(36662, 2)

In [18]:
# Step 1: Clean data
df = df.dropna(subset=['category'])
y = df['category'].map({-1: 2, 0: 0, 1: 1})
X_raw = df['clean_comment']

# Step 2: Train-test split BEFORE vectorization or resampling
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y
)

# Function to log results in MLflow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test, params, trial_number, max_features):
    with mlflow.start_run():
        # Log model type and trial number with max_features
        mlflow.set_tag("mlflow.runName", f"Trial_{trial_number}_{model_name}_TFIDF_Trigrams_MaxFeatures_{max_features}")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)
        mlflow.log_param("max_features", max_features)

        # Log hyperparameters
        for key, value in params.items():
            mlflow.log_param(key, value)

        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # # Save and log the model manually using joblib
        # import joblib
        # model_filename = "LogisticRegression_trial_Best.pkl"
        # joblib.dump(model, model_filename)
        # mlflow.log_artifact(model_filename)

        return accuracy

# Step 6: Optuna objective function for Logistic Regression
def objective_logistic_regression(trial, X_train, X_test, max_features):
    # Hyperparameter space to explore
    C = trial.suggest_float('C', 1e-4, 10.0, log=True)  # Regularization strength
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
    max_iter = trial.suggest_int('max_iter', 100, 1000)

    # Handle elasticnet penalty constraints
    if penalty == 'elasticnet':
        solver = 'saga'  # Only saga supports elasticnet
        l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)
    else:
        l1_ratio = None

    # Handle solver constraints
    if penalty == 'l1' and solver not in ['liblinear', 'saga']:
        solver = 'liblinear'
    elif penalty == 'l2' and solver == 'liblinear':
        solver = 'saga'

    # Log trial parameters
    params = {
        'C': C,
        'penalty': penalty,
        'solver': solver,
        'max_iter': max_iter
    }

    if l1_ratio is not None:
        params['l1_ratio'] = l1_ratio

    # Create Logistic Regression model
    if penalty == 'elasticnet':
        model = LogisticRegression(
            C=C,
            penalty=penalty,
            solver=solver,
            max_iter=max_iter,
            l1_ratio=l1_ratio,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        )
    else:
        model = LogisticRegression(
            C=C,
            penalty=penalty,
            solver=solver,
            max_iter=max_iter,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        )

    # # Log each trial as a separate run in MLflow
    # accuracy = log_mlflow("LogisticRegression", model, X_train, X_test, y_train, y_test, params, trial.number, max_features)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)


# Step 7: Run Optuna for Logistic Regression with different max_features values
def run_optuna_experiment():
    max_features_list = [1000, 3000, 10000]

    for max_features in max_features_list:
        print(f"\n=== Running Logistic Regression experiment with max_features={max_features} ===")

        # Step 3: Vectorization only on training data with current max_features
        vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=max_features)
        X_train = vectorizer.fit_transform(X_train_raw)
        X_test = vectorizer.transform(X_test_raw)

        # Create study for current max_features
        study = optuna.create_study(direction="maximize")
        study.optimize(lambda trial: objective_logistic_regression(trial, X_train, X_test, max_features),
                      n_trials=40)

        # Get the best parameters
        best_params = study.best_params

        # Create best model with proper parameter handling
        if best_params['penalty'] == 'elasticnet':
            best_model = LogisticRegression(
                C=best_params['C'],
                penalty=best_params['penalty'],
                solver=best_params['solver'],
                max_iter=best_params['max_iter'],
                l1_ratio=best_params['l1_ratio'],
                class_weight='balanced',
                random_state=42,
                n_jobs=-1
            )
        else:
            best_model = LogisticRegression(
                C=best_params['C'],
                penalty=best_params['penalty'],
                solver=best_params['solver'],
                max_iter=best_params['max_iter'],
                class_weight='balanced',
                random_state=42,
                n_jobs=-1
            )

        # Log the best model with MLflow and print the classification report
        best_accuracy = log_mlflow("LogisticRegression", best_model, X_train, X_test, y_train, y_test,
                                  best_params, f"Best-hp-MaxFeatures-{max_features}", max_features)

        print(f"Best accuracy for max_features={max_features}: {best_accuracy:.4f}")
        print(f"Best parameters: {best_params}")

        # Plot parameter importance
        optuna.visualization.plot_param_importances(study).show()

        # Plot optimization history
        optuna.visualization.plot_optimization_history(study).show()

# Run the experiment for Logistic Regression with different max_features
run_optuna_experiment()


=== Running Logistic Regression experiment with max_features=1000 ===


[I 2025-07-12 11:32:31,687] A new study created in memory with name: no-name-f9f2a2c7-9c50-435e-9cc8-81454af41deb
[I 2025-07-12 11:32:33,739] Trial 0 finished with value: 0.7848084003818355 and parameters: {'C': 0.35569992856987565, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 696}. Best is trial 0 with value: 0.7848084003818355.
[I 2025-07-12 11:32:34,157] Trial 1 finished with value: 0.6890767762171007 and parameters: {'C': 0.00754565037666155, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 427}. Best is trial 0 with value: 0.7848084003818355.
[I 2025-07-12 11:32:34,561] Trial 2 finished with value: 0.7542615573435156 and parameters: {'C': 0.09358589853584216, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 379}. Best is trial 0 with value: 0.7848084003818355.

'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 2.

[I 2025-07-12 11:32:34,782] Trial 3 finished with value: 0.7965362061911905 and parameters: {'C': 1.918675972549966, 'penalty': 'l1

🏃 View run Trial_Best-hp-MaxFeatures-1000_LogisticRegression_TFIDF_Trigrams_MaxFeatures_1000 at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10/runs/906afbabd6b54d658fd8bd54ad2778c9
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10
Best accuracy for max_features=1000: 0.7978
Best parameters: {'C': 0.6382849685370715, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 737}



=== Running Logistic Regression experiment with max_features=3000 ===


[I 2025-07-12 11:38:22,426] A new study created in memory with name: no-name-b91a521f-dff6-4187-8ad7-bebc2f65eb55
[I 2025-07-12 11:38:22,999] Trial 0 finished with value: 0.8434474294286104 and parameters: {'C': 2.079861803711122, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 875}. Best is trial 0 with value: 0.8434474294286104.
[I 2025-07-12 11:38:37,533] Trial 1 finished with value: 0.8546297558979954 and parameters: {'C': 0.6576018845378592, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 179}. Best is trial 1 with value: 0.8546297558979954.
[I 2025-07-12 11:38:38,364] Trial 2 finished with value: 0.7693986090276831 and parameters: {'C': 0.08973004611530447, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 512}. Best is trial 1 with value: 0.8546297558979954.

'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 2.

[I 2025-07-12 11:38:38,477] Trial 3 finished with value: 0.4387017591708714 and parameters: {'C': 0.009525269321135447, 'penalty'

🏃 View run Trial_Best-hp-MaxFeatures-3000_LogisticRegression_TFIDF_Trigrams_MaxFeatures_3000 at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10/runs/c49742e7080541aa8254aba0c6a65c30
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10
Best accuracy for max_features=3000: 0.8664
Best parameters: {'C': 1.4905082648041583, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 507}



=== Running Logistic Regression experiment with max_features=10000 ===


[I 2025-07-12 12:04:24,915] A new study created in memory with name: no-name-be388179-ac4a-4d84-8199-c7fc3d6960e0

'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 2.

[I 2025-07-12 12:04:25,016] Trial 0 finished with value: 0.4301104595663439 and parameters: {'C': 0.0005042215744612928, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 592}. Best is trial 0 with value: 0.4301104595663439.
[I 2025-07-12 12:04:26,770] Trial 1 finished with value: 0.6695758898131733 and parameters: {'C': 0.00015474122354403627, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 630}. Best is trial 1 with value: 0.6695758898131733.
[I 2025-07-12 12:04:27,966] Trial 2 finished with value: 0.7921723714714305 and parameters: {'C': 0.20395935060202972, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 788}. Best is trial 2 with value: 0.7921723714714305.
[I 2025-07-12 12:04:56,512] Trial 3 finished with value: 0.8355379789990454 and parameters: {'C': 0.6373755437464698

🏃 View run Trial_Best-hp-MaxFeatures-10000_LogisticRegression_TFIDF_Trigrams_MaxFeatures_10000 at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10/runs/175c9c9d63de4b64a2adc93d6410c4c9
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10
Best accuracy for max_features=10000: 0.8793
Best parameters: {'C': 1.4611511931417576, 'penalty': 'elasticnet', 'solver': 'saga', 'max_iter': 999, 'l1_ratio': 0.9816964789196673}
