In [None]:
!pip install mlflow  optuna  dagshub

Collecting mlflow
  Downloading mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting dagshub
  Downloading dagshub-0.5.10-py3-none-any.whl.metadata (12 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading databricks_sdk-0.58.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Download

In [None]:
import dagshub
import mlflow
dagshub.init(repo_owner='AMR-ITH', repo_name='yt-comment-analyzer', mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=940c825f-ed6f-4cc1-ba4b-039bba5aacc3&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=b81d6a611595ccbb1abbc63cadc6e115fdccf7f819c7debe44ae0b971dcc5838




Output()

In [None]:
# Set or create an experiment
mlflow.set_experiment("exp5 Best model HP Tuning")

<Experiment: artifact_location='mlflow-artifacts:/72b4a6680da6444f8672000d54583b16', creation_time=1752307675190, experiment_id='10', last_update_time=1752307675190, lifecycle_stage='active', name='exp5 Best model HP Tuning', tags={}>

In [None]:
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

import mlflow
import mlflow.sklearn
import optuna

In [None]:
df = pd.read_csv('/content/reddit_preprocessing.csv').dropna()
df.shape

(36662, 2)

In [8]:
# Step 1: Clean data
df = df.dropna(subset=['category'])
y = df['category'].map({-1: 2, 0: 0, 1: 1})
X_raw = df['clean_comment']

# Step 2: Train-test split BEFORE vectorization or resampling
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y
)

# Function to log results in MLflow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test, params, trial_number, max_features):
    with mlflow.start_run():
        # Log model type and trial number with max_features
        mlflow.set_tag("mlflow.runName", f"Trial_{trial_number}_{model_name}_TFIDF_Trigrams_MaxFeatures_{max_features}")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)
        mlflow.log_param("max_features", max_features)

        # Log hyperparameters
        for key, value in params.items():
            mlflow.log_param(key, value)

        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # # Save and log the model manually using joblib
        # import joblib
        # model_filename = "SVM_trial_Best.pkl"
        # joblib.dump(model, model_filename)
        # mlflow.log_artifact(model_filename)

        return accuracy

# Step 6: Optuna objective function for SVM
def objective_svm(trial, X_train, X_test, max_features):
    # Hyperparameter space to explore
    C = trial.suggest_float('C', 1e-3, 100.0, log=True)  # Regularization parameter
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly'])

    # Kernel-specific parameters
    if kernel == 'rbf':
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
        degree = None
    elif kernel == 'poly':
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
        degree = trial.suggest_int('degree', 2, 5)
    else:  # linear kernel
        gamma = None
        degree = None

    # Log trial parameters
    params = {
        'C': C,
        'kernel': kernel,
        'class_weight': 'balanced'
    }

    if gamma is not None:
        params['gamma'] = gamma
    if degree is not None:
        params['degree'] = degree

    # Create SVM model
    if kernel == 'poly':
        model = SVC(
            C=C,
            kernel=kernel,
            gamma=gamma,
            degree=degree,
            class_weight='balanced',
            random_state=42
        )
    elif kernel == 'linear':
        model = SVC(
            C=C,
            kernel=kernel,
            class_weight='balanced',
            random_state=42
        )
    else:  # rbf kernel
        model = SVC(
            C=C,
            kernel=kernel,
            gamma=gamma,
            class_weight='balanced',
            random_state=42
        )

    # Log each trial as a separate run in MLflow
    # accuracy = log_mlflow("SVM", model, X_train, X_test, y_train, y_test, params, trial.number, max_features)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Step 7: Run Optuna for SVM with different max_features values
def run_optuna_experiment():
    max_features_list = [1000, 3000, 10000]

    for max_features in max_features_list:
        print(f"\n=== Running SVM experiment with max_features={max_features} ===")

        # Step 3: Vectorization only on training data with current max_features
        vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=max_features)
        X_train = vectorizer.fit_transform(X_train_raw)
        X_test = vectorizer.transform(X_test_raw)

        # Create study for current max_features
        study = optuna.create_study(direction="maximize")
        study.optimize(lambda trial: objective_svm(trial, X_train, X_test, max_features),
                      n_trials=20)  # 60 trials

        # Get the best parameters
        best_params = study.best_params

        # Create best model with proper parameter handling
        if best_params['kernel'] == 'poly':
            best_model = SVC(
                C=best_params['C'],
                kernel=best_params['kernel'],
                gamma=best_params['gamma'],
                degree=best_params['degree'],
                class_weight='balanced',
                random_state=42
            )
        elif best_params['kernel'] == 'linear':
            best_model = SVC(
                C=best_params['C'],
                kernel=best_params['kernel'],
                class_weight='balanced',
                random_state=42
            )
        else:  # rbf kernel
            best_model = SVC(
                C=best_params['C'],
                kernel=best_params['kernel'],
                gamma=best_params['gamma'],
                class_weight='balanced',
                random_state=42
            )

        # Log the best model with MLflow and print the classification report
        best_accuracy = log_mlflow("SVM", best_model, X_train, X_test, y_train, y_test,
                                  best_params, f"Best-hp-MaxFeatures-{max_features}", max_features)

        print(f"Best accuracy for max_features={max_features}: {best_accuracy:.4f}")
        print(f"Best parameters: {best_params}")

        # Plot parameter importance
        optuna.visualization.plot_param_importances(study).show()

        # Plot optimization history
        optuna.visualization.plot_optimization_history(study).show()

# Run the experiment for SVM with different max_features
run_optuna_experiment()


=== Running SVM experiment with max_features=1000 ===


[I 2025-07-13 01:25:38,298] A new study created in memory with name: no-name-0bab418d-6913-43d9-863e-83cc89f8f507
[I 2025-07-13 01:29:41,729] Trial 0 finished with value: 0.37774444292922404 and parameters: {'C': 1.3024607846166805, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 0 with value: 0.37774444292922404.
[I 2025-07-13 01:31:56,463] Trial 1 finished with value: 0.7748534024273831 and parameters: {'C': 0.24219745658082037, 'kernel': 'linear'}. Best is trial 1 with value: 0.7748534024273831.
[I 2025-07-13 01:36:02,044] Trial 2 finished with value: 0.4301104595663439 and parameters: {'C': 0.0013424626418625841, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 1 with value: 0.7748534024273831.
[I 2025-07-13 01:43:21,964] Trial 3 finished with value: 0.7758079912723306 and parameters: {'C': 5.468642794854646, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 3 with value: 0.7758079912723306.
[I 2025-07-13 01:45:11,668] Trial 4 finished with value: 0.7801718259920906 and parameters

🏃 View run Trial_Best-hp-MaxFeatures-1000_SVM_TFIDF_Trigrams_MaxFeatures_1000 at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10/runs/2bfecd0c909e477c86c6371cef35cb3b
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10
Best accuracy for max_features=1000: 0.7837
Best parameters: {'C': 12.416952578693815, 'kernel': 'linear'}



=== Running SVM experiment with max_features=3000 ===


[I 2025-07-13 02:52:43,664] A new study created in memory with name: no-name-061a09e2-59b1-436a-bfe5-30bdf0b464e9
[I 2025-07-13 03:03:14,494] Trial 0 finished with value: 0.5475248874948861 and parameters: {'C': 0.6948135403936407, 'kernel': 'poly', 'gamma': 'scale', 'degree': 5}. Best is trial 0 with value: 0.5475248874948861.
[I 2025-07-13 03:08:30,222] Trial 1 finished with value: 0.3448793126960316 and parameters: {'C': 0.32301107001871415, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 0 with value: 0.5475248874948861.
[I 2025-07-13 03:13:36,062] Trial 2 finished with value: 0.7363971089594982 and parameters: {'C': 0.07668801944830177, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 2 with value: 0.7363971089594982.
[I 2025-07-13 03:25:43,332] Trial 3 finished with value: 0.6863493795172508 and parameters: {'C': 1.5301466622533666, 'kernel': 'poly', 'gamma': 'scale', 'degree': 3}. Best is trial 2 with value: 0.7363971089594982.
[I 2025-07-13 03:34:58,760] Trial 4 finished with

🏃 View run Trial_Best-hp-MaxFeatures-3000_SVM_TFIDF_Trigrams_MaxFeatures_3000 at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10/runs/78bd3eaf9bc34a9c95ce8fdb7cb18333
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10
Best accuracy for max_features=3000: 0.8489
Best parameters: {'C': 1.3699396816390046, 'kernel': 'linear'}



=== Running SVM experiment with max_features=10000 ===


[I 2025-07-13 04:49:49,895] A new study created in memory with name: no-name-76174792-3847-4fad-8446-0ba226a58c05
[I 2025-07-13 04:55:16,967] Trial 0 finished with value: 0.4301104595663439 and parameters: {'C': 0.16717254371260928, 'kernel': 'poly', 'gamma': 'auto', 'degree': 4}. Best is trial 0 with value: 0.4301104595663439.
[I 2025-07-13 05:01:52,275] Trial 1 finished with value: 0.4301104595663439 and parameters: {'C': 1.2458724652086628, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 0 with value: 0.4301104595663439.
[I 2025-07-13 05:08:45,242] Trial 2 finished with value: 0.6641210964134734 and parameters: {'C': 0.12073321404569722, 'kernel': 'poly', 'gamma': 'scale', 'degree': 2}. Best is trial 2 with value: 0.6641210964134734.
[I 2025-07-13 05:13:44,226] Trial 3 finished with value: 0.7108959498159008 and parameters: {'C': 0.06235814349898362, 'kernel': 'linear'}. Best is trial 3 with value: 0.7108959498159008.
[I 2025-07-13 05:20:22,674] Trial 4 finished with value: 0.43011

🏃 View run Trial_Best-hp-MaxFeatures-10000_SVM_TFIDF_Trigrams_MaxFeatures_10000 at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10/runs/7a1653807c174135b9576dba56395577
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/10
Best accuracy for max_features=10000: 0.8526
Best parameters: {'C': 2.7164558521495388, 'kernel': 'linear'}
