In [1]:
!pip install mlflow boto3 awscli

Collecting mlflow
  Downloading mlflow-3.1.4-py3-none-any.whl.metadata (29 kB)
Collecting boto3
  Downloading boto3-1.39.12-py3-none-any.whl.metadata (6.7 kB)
Collecting awscli
  Downloading awscli-1.41.12-py3-none-any.whl.metadata (11 kB)
Collecting mlflow-skinny==3.1.4 (from mlflow)
  Downloading mlflow_skinny-3.1.4-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.4->mlflow)
  Downloading databricks_sdk-0.60.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.4->mlflow)
  Download

In [2]:
!aws configure

AWS Access Key ID [None]: AKIAZXEXSXQY6GAS2CEL
AWS Secret Access Key [None]: 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
Default region name [None]: ap-south-1
Default output format [None]: 


In [7]:
import mlflow

mlflow.set_tracking_uri("http://ec2-3-110-125-29.ap-south-1.compute.amazonaws.com:5000/")

In [8]:
mlflow.set_experiment("Exp 4 - Handling Imbalanced Data")

2025/07/24 11:19:40 INFO mlflow.tracking.fluent: Experiment with name 'Exp 4 - Handling Imbalanced Data' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow-server-2003/628397734390164218', creation_time=1753355980393, experiment_id='628397734390164218', last_update_time=1753355980393, lifecycle_stage='active', name='Exp 4 - Handling Imbalanced Data', tags={}>

In [10]:
from imblearn.over_sampling import SMOTE,ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [11]:
df=pd.read_csv('/content/reddit_preprocessing.csv').dropna(subset=['clean_comment'])
df.shape

(36662, 2)

In [12]:
def run_imbalanced_experiment(imbalance_method):
  ngram_range=(1,3)
  max_features=1000

  X_train, X_test, y_train, y_test = train_test_split(df['clean_comment'], df['category'], test_size=0.2, random_state=42)
  vectorizer = TfidfVectorizer(ngram_range=ngram_range,max_features=max_features)
  X_train_vec = vectorizer.fit_transform(X_train)
  X_test_vec = vectorizer.transform(X_test)

  if imbalance_method == "class_weights":
    class_weight = 'balanced'
  else:
    class_weight = None

    if imbalance_method == "oversampling":
      smote = SMOTE(random_state=42)
      X_train_vec, y_train = smote.fit_resample(X_train_vec, y_train)
    elif imbalance_method == "adasyn":
      adasyn = ADASYN(random_state=42)
      X_train_vec, y_train = adasyn.fit_resample(X_train_vec, y_train)
    elif imbalance_method == "undersampling":
      rus = RandomUnderSampler(random_state=42)
      X_train_vec, y_train = rus.fit_resample(X_train_vec, y_train)
    elif imbalance_method == "smote_enn":
      smote_enn = SMOTEENN(random_state=42)
      X_train_vec, y_train = smote_enn.fit_resample(X_train_vec, y_train)

  with mlflow.start_run() as run:
        mlflow.set_tag("mlflow.runName", f"Imbalance_{imbalance_method}_RandomForest_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "imbalance_handling")
        mlflow.set_tag("model_type", "RandomForestClassifier")

        # Add a description
        mlflow.set_tag("description", f"RandomForest with TF-IDF Trigrams, imbalance handling method={imbalance_method}")

        # Log vectorizer parameters
        mlflow.log_param("vectorizer_type", "TF-IDF")
        mlflow.log_param("ngram_range", ngram_range)
        mlflow.log_param("vectorizer_max_features", max_features)

        # Log Random Forest parameters
        n_estimators = 200
        max_depth = 15

        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("imbalance_method", imbalance_method)

        # Initialize and train the model
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42, class_weight=class_weight)
        model.fit(X_train_vec, y_train)

        # Step 6: Make predictions and log metrics
        y_pred = model.predict(X_test_vec)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log confusion matrix
        conf_matrix = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title(f"Confusion Matrix: TF-IDF Trigrams, Imbalance={imbalance_method}")
        confusion_matrix_filename = f"confusion_matrix_{imbalance_method}.png"
        plt.savefig(confusion_matrix_filename)
        mlflow.log_artifact(confusion_matrix_filename)
        plt.close()

        # Log the model
        mlflow.sklearn.log_model(model, f"random_forest_model_tfidf_trigrams_imbalance_{imbalance_method}")

imbalance_methods = ['class-weights','oversampling','adasyn','undersampling','smote_enn']
for method in imbalance_methods:
  run_imbalanced_experiment(method)



🏃 View run Imbalance_class-weights_RandomForest_TFIDF_Trigrams at: http://ec2-3-110-125-29.ap-south-1.compute.amazonaws.com:5000/#/experiments/628397734390164218/runs/48f1e7af0558497aa4d2e9242c9b58ee
🧪 View experiment at: http://ec2-3-110-125-29.ap-south-1.compute.amazonaws.com:5000/#/experiments/628397734390164218




🏃 View run Imbalance_oversampling_RandomForest_TFIDF_Trigrams at: http://ec2-3-110-125-29.ap-south-1.compute.amazonaws.com:5000/#/experiments/628397734390164218/runs/55d0f22201b2427cbeb631391d2366c9
🧪 View experiment at: http://ec2-3-110-125-29.ap-south-1.compute.amazonaws.com:5000/#/experiments/628397734390164218




🏃 View run Imbalance_adasyn_RandomForest_TFIDF_Trigrams at: http://ec2-3-110-125-29.ap-south-1.compute.amazonaws.com:5000/#/experiments/628397734390164218/runs/b261fab4618b4bb786fd5d38615c0057
🧪 View experiment at: http://ec2-3-110-125-29.ap-south-1.compute.amazonaws.com:5000/#/experiments/628397734390164218




🏃 View run Imbalance_undersampling_RandomForest_TFIDF_Trigrams at: http://ec2-3-110-125-29.ap-south-1.compute.amazonaws.com:5000/#/experiments/628397734390164218/runs/d945358ea8634bd1813c81599d98ba11
🧪 View experiment at: http://ec2-3-110-125-29.ap-south-1.compute.amazonaws.com:5000/#/experiments/628397734390164218




🏃 View run Imbalance_smote_enn_RandomForest_TFIDF_Trigrams at: http://ec2-3-110-125-29.ap-south-1.compute.amazonaws.com:5000/#/experiments/628397734390164218/runs/6be5aaa5e65443019c23ebcfe9ec83f6
🧪 View experiment at: http://ec2-3-110-125-29.ap-south-1.compute.amazonaws.com:5000/#/experiments/628397734390164218
