In [4]:
import os

# Move from the notebook's folder up to the project root so that the relative
# "data/..." path read later in the notebook resolves (assumes the notebook
# lives one level below the root — TODO confirm).
# The guard makes the cell idempotent: an unconditional chdir("..") climbs one
# more level every time the cell is re-run and silently breaks later paths.
if not os.path.isfile(os.path.join("data", "featured_data", "featured_reviews.csv")):
    os.chdir("..")

In [17]:
import os
# NOTE(review): judging by its name, this flag switches off TLS certificate
# verification for DagsHub traffic — confirm it is really needed, because
# skipping verification exposes the connection (and credentials) to
# interception. It is set here, before `dagshub` is imported below;
# presumably the library reads it at import/init time — verify.
os.environ["DAGSHUB_DISABLE_SSL_VERIFY"] = "true"

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import mlflow
import mlflow.sklearn
import dagshub

In [18]:
# Connect this kernel session to the project's DagsHub repository with MLflow
# integration switched on.
dagshub.init(repo_owner='ay747283', repo_name='HealthKart-recommendation-system', mlflow=True)

# Point MLflow explicitly at the repository's remote tracking server.
tracking_uri = 'https://dagshub.com/ay747283/HealthKart-recommendation-system.mlflow'
mlflow.set_tracking_uri(tracking_uri)

# Runs started after this call are recorded under the TF-IDF experiment.
# The call stays the last expression so the cell displays the Experiment.
mlflow.set_experiment('Text Classification - TFIDF')


<Experiment: artifact_location='mlflow-artifacts:/1ff1845101d3411b9a8fb2b54b297e71', creation_time=1766593438651, experiment_id='0', last_update_time=1766593438651, lifecycle_stage='active', name='Text Classification - TFIDF', tags={}>

In [19]:
# Read the featured reviews and discard every row that has a missing value in
# any column.
df = pd.read_csv("data/featured_data/featured_reviews.csv").dropna()

# Model input is the `transformed_text` column; the target is `Review_encoded`.
X, y = df['transformed_text'], df['Review_encoded']

# Hold out 20% of the rows for testing; the split is seeded and stratified on
# the target so both parts keep the same class proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [20]:
# TF-IDF features over unigrams and bigrams, limited to 5000 features.
tfidf_options = {"max_features": 5000, "ngram_range": (1, 2)}
tfidf = TfidfVectorizer(**tfidf_options)

# Fit on the training text only, then apply the fitted vectorizer to the test
# text so nothing from the test split influences the learned vocabulary.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)


In [21]:
# Candidate classifiers, keyed by the name that the training loops use both as
# the MLflow run name and as the key in `results`.
# The estimator classes are already imported in the notebook's import cell, so
# they are not imported a second time here.
# Every model is built with class_weight="balanced".
models = {
    "LogisticRegression": LogisticRegression(
        max_iter=1000,
        class_weight="balanced"
    ),

    # random_state pins the solver's internal data shuffling so that repeated
    # runs log the same metrics (it has no effect when the solver runs in its
    # non-random primal mode).
    "LinearSVM": LinearSVC(
        class_weight="balanced",
        random_state=42
    ),

    # RandomForestClassifier(n_estimators=200, random_state=42,
    # class_weight="balanced") was disabled in this comparison; add it back as
    # a "RandomForest" entry to include it.

    "DecisionTree": DecisionTreeClassifier(
        random_state=42,
        class_weight="balanced"
    )
}


In [22]:
# Test-set metrics for every model, keyed by model name.
results = {}

for name, model in models.items():
    # One MLflow run per classifier; the context manager ends the run even if
    # fitting or logging raises.
    with mlflow.start_run(run_name=name):

        # Log all hyper-parameters in a single batched call instead of one
        # request to the remote tracking server per parameter.
        mlflow.log_params(model.get_params())

        # Train on the vectorised training text and predict the held-out text.
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)

        # Precision, recall and F1 use the support-weighted average over
        # classes. The dict is built once and reused for logging and `results`.
        scores = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='weighted'),
            "Recall": recall_score(y_test, y_pred, average='weighted'),
            "F1_score": f1_score(y_test, y_pred, average='weighted'),
        }
        mlflow.log_metrics(scores)

        # Store the fitted estimator as a run artifact under the model's name.
        mlflow.sklearn.log_model(model, artifact_path=name)

        results[name] = scores




🏃 View run LogisticRegression at: https://dagshub.com/ay747283/HealthKart-recommendation-system.mlflow/#/experiments/0/runs/ed56d496fb0640be8d5e8393791179c9
🧪 View experiment at: https://dagshub.com/ay747283/HealthKart-recommendation-system.mlflow/#/experiments/0




🏃 View run LinearSVM at: https://dagshub.com/ay747283/HealthKart-recommendation-system.mlflow/#/experiments/0/runs/287ae04222d14ef1ba57689d4dfede26
🧪 View experiment at: https://dagshub.com/ay747283/HealthKart-recommendation-system.mlflow/#/experiments/0




🏃 View run DecisionTree at: https://dagshub.com/ay747283/HealthKart-recommendation-system.mlflow/#/experiments/0/runs/07180ea32e264957b0f40c69fce6f444
🧪 View experiment at: https://dagshub.com/ay747283/HealthKart-recommendation-system.mlflow/#/experiments/0


In [23]:
# Display the per-model test metrics collected by the training loop above
# (at this point in the notebook they come from the TF-IDF features).
results

{'LogisticRegression': {'Accuracy': 0.7936104695919939,
  'Precision': 0.8879192103981397,
  'Recall': 0.7936104695919939,
  'F1_score': 0.8286393699838063},
 'LinearSVM': {'Accuracy': 0.8678214010777521,
  'Precision': 0.8796193376387028,
  'Recall': 0.8678214010777521,
  'F1_score': 0.8732386347961533},
 'DecisionTree': {'Accuracy': 0.8029253271747498,
  'Precision': 0.8315671854148471,
  'Recall': 0.8029253271747498,
  'F1_score': 0.8161306010918156}}

## Using the Bag-of-Words method

In [24]:
# The DagsHub connection and the MLflow tracking URI were already configured
# earlier in this notebook and stay in effect for the whole kernel session, so
# repeating those calls (and their hard-coded repository strings) is
# unnecessary. The bag-of-words runs only need their own experiment.
# The call stays the last expression so the cell displays the Experiment.
mlflow.set_experiment('Text Classification - BOW')


2025/12/24 22:36:32 INFO mlflow.tracking.fluent: Experiment with name 'Text Classification - BOW' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/5c0d2a7e657c4ac691cf175a138d2a67', creation_time=1766595991632, experiment_id='1', last_update_time=1766595991632, lifecycle_stage='active', name='Text Classification - BOW', tags={}>

In [25]:
# Bag-of-words counts over unigrams and bigrams, limited to 5000 features.
bow_options = {"max_features": 5000, "ngram_range": (1, 2)}
bow = CountVectorizer(**bow_options)

# Fit on the training text only, then apply the fitted vectorizer to the test
# text.
# NOTE: these assignments replace the matrices of the same name built earlier
# in the notebook, so every cell run after this one sees the bag-of-words
# features.
X_train_vec = bow.fit_transform(X_train)
X_test_vec = bow.transform(X_test)


In [26]:
# Test-set metrics for every model, keyed by model name. This rebinds
# `results`, so the metrics collected earlier are no longer reachable through
# that name.
results = {}

for name, model in models.items():
    # One MLflow run per classifier; the context manager ends the run even if
    # fitting or logging raises.
    with mlflow.start_run(run_name=name):

        # Log all hyper-parameters in a single batched call instead of one
        # request to the remote tracking server per parameter.
        mlflow.log_params(model.get_params())

        # Re-fit the estimator on the current feature matrices and predict the
        # held-out rows.
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)

        # Precision, recall and F1 use the support-weighted average over
        # classes. The dict is built once and reused for logging and `results`.
        scores = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='weighted'),
            "Recall": recall_score(y_test, y_pred, average='weighted'),
            "F1_score": f1_score(y_test, y_pred, average='weighted'),
        }
        mlflow.log_metrics(scores)

        # Store the fitted estimator as a run artifact under the model's name.
        mlflow.sklearn.log_model(model, artifact_path=name)

        results[name] = scores




🏃 View run LogisticRegression at: https://dagshub.com/ay747283/HealthKart-recommendation-system.mlflow/#/experiments/1/runs/59bbf987e820400e9c2543a6b9f162b8
🧪 View experiment at: https://dagshub.com/ay747283/HealthKart-recommendation-system.mlflow/#/experiments/1




🏃 View run LinearSVM at: https://dagshub.com/ay747283/HealthKart-recommendation-system.mlflow/#/experiments/1/runs/44a07efbc3e14fc58a186e5c87abc02e
🧪 View experiment at: https://dagshub.com/ay747283/HealthKart-recommendation-system.mlflow/#/experiments/1




🏃 View run DecisionTree at: https://dagshub.com/ay747283/HealthKart-recommendation-system.mlflow/#/experiments/1/runs/1d5b86f202d0435b96f579fc9cd545af
🧪 View experiment at: https://dagshub.com/ay747283/HealthKart-recommendation-system.mlflow/#/experiments/1


In [27]:
# Display the per-model test metrics collected by the training loop above
# (at this point in the notebook they come from the bag-of-words features).
results

{'LogisticRegression': {'Accuracy': 0.795535026943803,
  'Precision': 0.8772801489781608,
  'Recall': 0.795535026943803,
  'F1_score': 0.8275748747232259},
 'LinearSVM': {'Accuracy': 0.859122401847575,
  'Precision': 0.8712410015085064,
  'Recall': 0.859122401847575,
  'F1_score': 0.8648007766252975},
 'DecisionTree': {'Accuracy': 0.8056197074672825,
  'Precision': 0.8395301610380443,
  'Recall': 0.8056197074672825,
  'F1_score': 0.8209466106000932}}