In [1]:
import mlflow
mlflow.set_tracking_uri("http://13.221.75.66:5000/")


  import pkg_resources  # noqa: TID251


In [None]:
import optuna
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import mlflow.sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [None]:
df= pd.read_csv(r'C:\Users\Deepu\OneDrive\Desktop\Capstone1\reddit_preprocessing.csv').dropna()
df.shape

(36750, 7)

In [None]:
# ===============================
# Step 2: Vectorization (TF-IDF)
# ===============================
ngram_range = (1, 3)   # Trigrams
max_features = 1000    # Limit vocab size



X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'],df['category'] , test_size=0.2, random_state=42, stratify=df['category']
)

vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
===============================
# Step 3: Apply SMOTE
# ===============================
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# ===============================
# Step 4: Train-test split
# ===============================


In [None]:
# =============================
# Step 5: Optuna Objective Function
# ===============================
def objective_lr(trial):
    params = {
        "C": trial.suggest_float("C", 1e-4, 10.0, log=True),
        "max_iter": trial.suggest_int("max_iter", 100, 1000)
    }
    model = LogisticRegression(**params, solver="lbfgs", multi_class="auto", random_state=42)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

# ===============================
# Step 6: Hyperparameter Tuning
# ===============================
mlflow.set_experiment("ML Algos with HPT")

study = optuna.create_study(direction="maximize")
study.optimize(objective_lr, n_trials=30)
best_params = study.best_params

# ===============================

In [9]:
def objective_knn(trial):
    params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 1, 30),
        "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
        "metric": trial.suggest_categorical("metric", ["euclidean", "manhattan", "minkowski"])
    }
    model = KNeighborsClassifier(**params)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

study = optuna.create_study(direction="maximize")
study.optimize(objective_knn, n_trials=30)
best_params = study.best_params

with mlflow.start_run(run_name="KNN"):
    best_model = KNeighborsClassifier(**best_params)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    for label, metrics in report.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", value)
    for param, value in best_params.items():
        mlflow.log_param(param, value)
    
    model_info = mlflow.sklearn.log_model(best_model, artifact_path="best_knn_model")
    print("Best Parameters:", best_params)
    print("Model Saved at:", model_info.model_uri)


[I 2025-08-14 15:05:04,850] A new study created in memory with name: no-name-75806790-8bd7-4b87-a201-0f4a0d8e67e5
[I 2025-08-14 15:05:15,319] Trial 0 finished with value: 0.5039103783555273 and parameters: {'n_neighbors': 26, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 0 with value: 0.5039103783555273.
[I 2025-08-14 15:05:24,389] Trial 1 finished with value: 0.5171211160431198 and parameters: {'n_neighbors': 30, 'weights': 'distance', 'metric': 'minkowski'}. Best is trial 1 with value: 0.5171211160431198.
[I 2025-08-14 15:05:34,202] Trial 2 finished with value: 0.5110970196575777 and parameters: {'n_neighbors': 23, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 1 with value: 0.5171211160431198.
[I 2025-08-14 15:05:43,252] Trial 3 finished with value: 0.5138448530965969 and parameters: {'n_neighbors': 22, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 1 with value: 0.5171211160431198.
[I 2025-08-14 15:05:58,262] Trial 4 finished with value: 0.55

Best Parameters: {'n_neighbors': 1, 'weights': 'distance', 'metric': 'euclidean'}
Model Saved at: runs:/33c853b2f20f4e5da3d1ae6c19ad1a14/best_knn_model
