In [25]:
import mlflow
mlflow.set_tracking_uri("http://13.221.75.66:5000/")


In [26]:
# Set experiment before logging anything
mlflow.set_experiment("ML Algos with HPT")

with mlflow.start_run():
    mlflow.log_param("example_param", 42)


In [27]:
import optuna
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import mlflow.sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [28]:
df= pd.read_csv(r'C:\Users\Deepu\OneDrive\Desktop\Capstone1\reddit_preprocessing.csv').dropna()
df.shape

(36750, 7)

In [29]:
ngram_range = (1,3)
max_features = 10000
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']


In [30]:
# Step 3: SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

In [31]:
def objective_lr(trial):
    params = {
        "C": trial.suggest_float("C", 1e-4, 10.0, log=True),
        "max_iter": trial.suggest_int("max_iter", 100, 1000)
    }
    model = LogisticRegression(**params, solver="lbfgs", multi_class="auto", random_state=42)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

study = optuna.create_study(direction="maximize")
study.optimize(objective_lr, n_trials=30)
best_params = study.best_params

with mlflow.start_run(run_name="LogisticRegression"):
    best_model = LogisticRegression(**best_params, solver="lbfgs", multi_class="auto", random_state=42)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    for label, metrics in report.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", value)
    for param, value in best_params.items():
        mlflow.log_param(param, value)
    
    model_info = mlflow.sklearn.log_model(best_model, artifact_path="best_logisticregression_model")
    print("Best Parameters:", best_params)
    print("Model Saved at:", model_info.model_uri)


[I 2025-08-14 15:29:36,798] A new study created in memory with name: no-name-ed9d998f-868a-41a3-b855-815cd7aa9903


[I 2025-08-14 15:29:38,136] Trial 0 finished with value: 0.7729866835764109 and parameters: {'C': 0.1869125715653317, 'max_iter': 812}. Best is trial 0 with value: 0.7729866835764109.
[I 2025-08-14 15:29:38,647] Trial 1 finished with value: 0.6540900443880786 and parameters: {'C': 0.02025233323395416, 'max_iter': 194}. Best is trial 0 with value: 0.7729866835764109.
[I 2025-08-14 15:30:00,589] Trial 2 finished with value: 0.8522511097019657 and parameters: {'C': 1.7073872284365448, 'max_iter': 706}. Best is trial 2 with value: 0.8522511097019657.
[I 2025-08-14 15:30:01,866] Trial 3 finished with value: 0.5664764320439654 and parameters: {'C': 0.0010188713412973957, 'max_iter': 507}. Best is trial 2 with value: 0.8522511097019657.
[I 2025-08-14 15:30:29,017] Trial 4 finished with value: 0.8673641936165716 and parameters: {'C': 4.401611289344311, 'max_iter': 872}. Best is trial 4 with value: 0.8673641936165716.
[I 2025-08-14 15:30:32,963] Trial 5 finished with value: 0.6028323821602198 a

Best Parameters: {'C': 9.43700287349196, 'max_iter': 287}
Model Saved at: runs:/06b4e8a4a4124b938d77f0d0abb663a2/best_logisticregression_model
