In [4]:
import mlflow
mlflow.set_tracking_uri("http://13.221.75.66:5000/")


In [None]:
# Set experiment before logging anything
mlflow.set_experiment("ML Algos with HPT")

with mlflow.start_run():
    mlflow.log_param("example_param", 42)


In [5]:
import optuna
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import mlflow.sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [6]:
df= pd.read_csv(r'C:\Users\Deepu\OneDrive\Desktop\Capstone1\reddit_preprocessing.csv').dropna()
df.shape

(36750, 7)

In [7]:
ngram_range = (1,3)
max_features = 10000
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']


In [8]:
# Step 3: SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

In [9]:
! pip install optuna



In [None]:
def objective_svm(trial):
    params = {
        "C": trial.suggest_float("C", 1e-4, 10.0, log=True),
        "kernel": trial.suggest_categorical("kernel", ["linear", "rbf", "poly"]),
        "gamma": trial.suggest_categorical("gamma", ["scale", "auto"])
    }
    model = SVC(**params, probability=True, random_state=42)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

study = optuna.create_study(direction="maximize")
study.optimize(objective_svm, n_trials=30)
best_params = study.best_params

with mlflow.start_run(run_name="SVM"):
    best_model = SVC(**best_params, probability=True, random_state=42)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    for label, metrics in report.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", value)
    for param, value in best_params.items():
        mlflow.log_param(param, value)
    
    model_info = mlflow.sklearn.log_model(best_model, artifact_path="best_svm_model")
    print("Best Parameters:", best_params)
    print("Model Saved at:", model_info.model_uri)


[I 2025-08-14 18:32:32,322] A new study created in memory with name: no-name-d69aa695-b31f-413e-afe2-744a04eb7a43
[I 2025-08-14 19:19:56,542] Trial 0 finished with value: 0.7947579792855634 and parameters: {'C': 0.24601807222338803, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 0 with value: 0.7947579792855634.
[I 2025-08-14 20:12:27,892] Trial 1 finished with value: 0.4418727541745931 and parameters: {'C': 0.005914870682555769, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 0 with value: 0.7947579792855634.
[I 2025-08-14 20:35:11,102] Trial 2 finished with value: 0.5169097442401184 and parameters: {'C': 0.008195824309424497, 'kernel': 'linear', 'gamma': 'auto'}. Best is trial 0 with value: 0.7947579792855634.
