In [None]:
!pip install mlflow  optuna  lightgbm dagshub

Collecting mlflow
  Downloading mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting dagshub
  Downloading dagshub-0.5.10-py3-none-any.whl.metadata (12 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading databricks_sdk-0.58.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Download

In [None]:
import dagshub
import mlflow
dagshub.init(repo_owner='AMR-ITH', repo_name='yt-comment-analyzer', mlflow=True)

In [None]:
# Set or create an experiment
mlflow.set_experiment("exp5 Best model HP Tuning")

<Experiment: artifact_location='mlflow-artifacts:/72b4a6680da6444f8672000d54583b16', creation_time=1752307675190, experiment_id='10', last_update_time=1752307675190, lifecycle_stage='active', name='exp5 Best model HP Tuning', tags={}>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import mlflow
import mlflow.sklearn
import optuna
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import numpy as np

In [None]:
df = pd.read_csv("/content/reddit_preprocessing.csv").dropna()
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [None]:
df.shape

(36662, 2)

In [9]:
# Step 1: Clean data
df = df.dropna(subset=['category'])
y = df['category'].map({-1: 2, 0: 0, 1: 1})
X_raw = df['clean_comment']

# Step 2: Train-test split BEFORE vectorization or resampling
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y
)

# Function to log results in MLflow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test, params, trial_number, max_features):
    with mlflow.start_run():
        # Log model type and trial number with max_features
        mlflow.set_tag("mlflow.runName", f"Trial_{trial_number}_{model_name}_Trigrams_MaxFeatures_{max_features}")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)
        mlflow.log_param("max_features", max_features)

        # Log hyperparameters
        for key, value in params.items():
            mlflow.log_param(key, value)

        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # # Save and log the model manually using joblib
        # import joblib
        # model_filename = "XGBoost_trial_Best.pkl"
        # joblib.dump(model, model_filename)
        # mlflow.log_artifact(model_filename)

        return accuracy

# Step 6: Optuna objective function for XGBoost
def objective_xgboost(trial, X_train, X_test, max_features):
    # Hyperparameter space to explore
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    colsample_bylevel = trial.suggest_float('colsample_bylevel', 0.5, 1.0)
    colsample_bynode = trial.suggest_float('colsample_bynode', 0.5, 1.0)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True)  # L1 regularization
    reg_lambda = trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True)  # L2 regularization
    gamma = trial.suggest_float('gamma', 1e-4, 10.0, log=True)  # Minimum loss reduction

    # Scale position weight for class imbalance (similar to class_weight='balanced')
    class_counts = np.bincount(y_train)
    scale_pos_weight = class_counts[0] / class_counts[1] if len(class_counts) > 1 else 1.0

    # Log trial parameters
    params = {
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'min_child_weight': min_child_weight,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'colsample_bylevel': colsample_bylevel,
        'colsample_bynode': colsample_bynode,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'gamma': gamma,
        'scale_pos_weight': scale_pos_weight
    }

    # Create XGBoost model
    model = XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        colsample_bylevel=colsample_bylevel,
        colsample_bynode=colsample_bynode,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        gamma=gamma,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        n_jobs=-1,
        eval_metric='logloss'  # Suppress warnings
    )

    # # Log each trial as a separate run in MLflow
    # accuracy = log_mlflow("XGBoost", model, X_train, X_test, y_train, y_test, params, trial.number, max_features)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Step 7: Run Optuna for XGBoost with different max_features values
def run_optuna_experiment():
    max_features_list = [3000, 10000]

    for max_features in max_features_list:
        print(f"\n=== Running XGBoost experiment with max_features={max_features} ===")

        # Step 3: Vectorization only on training data with current max_features
        vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=max_features)
        X_train = vectorizer.fit_transform(X_train_raw)
        X_test = vectorizer.transform(X_test_raw)

        # Create study for current max_features
        study = optuna.create_study(direction="maximize")
        study.optimize(lambda trial: objective_xgboost(trial, X_train, X_test, max_features),
                      n_trials=40)

        # Get the best parameters
        best_params = study.best_params

        # Calculate scale_pos_weight for best model
        class_counts = np.bincount(y_train)
        scale_pos_weight = class_counts[0] / class_counts[1] if len(class_counts) > 1 else 1.0

        best_model = XGBClassifier(
            n_estimators=best_params['n_estimators'],
            learning_rate=best_params['learning_rate'],
            max_depth=best_params['max_depth'],
            min_child_weight=best_params['min_child_weight'],
            subsample=best_params['subsample'],
            colsample_bytree=best_params['colsample_bytree'],
            colsample_bylevel=best_params['colsample_bylevel'],
            colsample_bynode=best_params['colsample_bynode'],
            reg_alpha=best_params['reg_alpha'],
            reg_lambda=best_params['reg_lambda'],
            gamma=best_params['gamma'],
            scale_pos_weight=scale_pos_weight,
            random_state=42,
            n_jobs=-1,
            eval_metric='logloss'
        )

        # Log the best model with MLflow and print the classification report
        best_accuracy = log_mlflow("XGBoost", best_model, X_train, X_test, y_train, y_test,
                                  best_params, f"Best-hp-MaxFeatures-{max_features}", max_features)

        print(f"Best accuracy for max_features={max_features}: {best_accuracy:.4f}")
        print(f"Best parameters: {best_params}")

        # Plot parameter importance
        optuna.visualization.plot_param_importances(study).show()

        # Plot optimization history
        optuna.visualization.plot_optimization_history(study).show()

# Run the experiment for XGBoost with different max_features
run_optuna_experiment()


=== Running XGBoost experiment with max_features=3000 ===


[I 2025-07-13 01:34:42,588] A new study created in memory with name: no-name-91968eb7-2560-47e8-bc1a-86e9ee159eee
Parameters: { "scale_pos_weight" } are not used.

[I 2025-07-13 01:35:45,565] Trial 0 finished with value: 0.7057138960861857 and parameters: {'n_estimators': 490, 'learning_rate': 0.027243341648791514, 'max_depth': 3, 'min_child_weight': 8, 'subsample': 0.5763719294154306, 'colsample_bytree': 0.629745475740739, 'colsample_bylevel': 0.898454751570512, 'colsample_bynode': 0.9766281526346697, 'reg_alpha': 0.004213089923746137, 'reg_lambda': 5.668191317381823, 'gamma': 0.41895216876576064}. Best is trial 0 with value: 0.7057138960861857.
Parameters: { "scale_pos_weight" } are not used.

[I 2025-07-13 01:49:38,767] Trial 1 finished with value: 0.6998499931815082 and parameters: {'n_estimators': 767, 'learning_rate': 0.003991827882407467, 'max_depth': 10, 'min_child_weight': 1, 'subsample': 0.8110718787991018, 'colsample_bytree': 0.750550933416551, 'colsample_bylevel': 0.8163968

🏃 View run Trial_Best-hp-MaxFeatures-3000_XGBoost_Trigrams_MaxFeatures_3000 at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10/runs/4284600d5ac74f5c9bc4efabcf138fd6
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10
Best accuracy for max_features=3000: 0.8460
Best parameters: {'n_estimators': 391, 'learning_rate': 0.09284610673430381, 'max_depth': 14, 'min_child_weight': 1, 'subsample': 0.6223685991974504, 'colsample_bytree': 0.9145748533032563, 'colsample_bylevel': 0.5821543854551949, 'colsample_bynode': 0.6291459951230686, 'reg_alpha': 0.0013959750627057092, 'reg_lambda': 0.002982834215533974, 'gamma': 0.19733966862439623}



=== Running XGBoost experiment with max_features=10000 ===


[I 2025-07-13 04:54:51,684] A new study created in memory with name: no-name-b1680b5a-fdb5-46a1-83b1-59677759b21d

Parameters: { "scale_pos_weight" } are not used.


[I 2025-07-13 05:01:05,148] Trial 0 finished with value: 0.6065730260466384 and parameters: {'n_estimators': 858, 'learning_rate': 0.0003754954547468287, 'max_depth': 5, 'min_child_weight': 6, 'subsample': 0.8258849343625038, 'colsample_bytree': 0.9454102354443934, 'colsample_bylevel': 0.707011525661584, 'colsample_bynode': 0.7739622585095051, 'reg_alpha': 0.6747958972812315, 'reg_lambda': 0.01425478839622739, 'gamma': 0.015080387987167767}. Best is trial 0 with value: 0.6065730260466384.

Parameters: { "scale_pos_weight" } are not used.


[I 2025-07-13 05:03:02,552] Trial 1 finished with value: 0.7314877948997681 and parameters: {'n_estimators': 348, 'learning_rate': 0.038448599694801654, 'max_depth': 5, 'min_child_weight': 2, 'subsample': 0.7909060228686642, 'colsample_bytree': 0.9846945835853433, 'colsample_bylevel': 0.

🏃 View run Trial_Best-hp-MaxFeatures-10000_XGBoost_Trigrams_MaxFeatures_10000 at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10/runs/8b7a1f30315c4780957fe896bd294c5f
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10
Best accuracy for max_features=10000: 0.8444
Best parameters: {'n_estimators': 645, 'learning_rate': 0.0992856125849396, 'max_depth': 7, 'min_child_weight': 4, 'subsample': 0.5701166162393628, 'colsample_bytree': 0.9157799963299006, 'colsample_bylevel': 0.8616852672926257, 'colsample_bynode': 0.6794431506557485, 'reg_alpha': 0.9424389486460987, 'reg_lambda': 0.09160131384298396, 'gamma': 0.07834491853463521}
