In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

In [11]:
# Load the raw dataset; the path is relative to the notebook's working directory.
dataset_path = 'dataset.csv'
df = pd.read_csv(dataset_path)

In [13]:
# Binary label: 1 when Success_Percentage is strictly above 50, otherwise 0.
df['Target'] = df['Success_Percentage'].gt(50).astype(int)

In [15]:
# Integer-encode the categorical columns. Each column gets its own LabelEncoder
# so that its class mapping (`classes_`) survives and can be inverted later;
# re-fitting a single shared encoder would discard the mapping of every column
# but the last one.
encoders = {}
for col in ('Category', 'Sub_category'):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
# `le` is left bound to the Sub_category encoder, exactly as it was when one
# encoder object was reused for both columns.

# Model inputs; Success_Percentage is deliberately absent because Target is derived from it.
features = ['Price', 'Rating', 'No_rating', 'Discount', 'M_Spend', 'Supply_Chain_E', 
            'Sales_y', 'Sales_m', 'Market_T', 'Seasonality_T', 'Category', 'Sub_category']

X = df[features]
y = df['Target']

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the same
# transform to both partitions so no test-set information leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [17]:
def evaluate(name, model, X=None, y=None):
    """Print classification metrics for a fitted binary classifier.

    Args:
        name: Label shown in the results header.
        model: Fitted estimator with a ``predict`` method. ``predict_proba``
            or ``decision_function`` is used for the AUC score when present.
        X: Features to score. Defaults to the notebook-level ``X_test``.
        y: True labels aligned with ``X``. Defaults to the notebook-level
            ``y_test``.

    Returns:
        None. Accuracy, AUC, precision, recall, F1 and MCC are printed.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    y_pred = model.predict(X)

    # AUC needs a continuous score; hard labels are only a last resort because
    # they collapse the ROC curve to a single operating point.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X)[:, 1]
    elif hasattr(model, "decision_function"):
        y_prob = model.decision_function(X)
    else:
        y_prob = y_pred

    print(f"--- {name} Results ---")
    print(f"Accuracy: {accuracy_score(y, y_pred):.4f}")
    print(f"AUC: {roc_auc_score(y, y_prob):.4f}")
    print(f"Precision: {precision_score(y, y_pred):.4f}")
    print(f"Recall: {recall_score(y, y_pred):.4f}")
    print(f"F1: {f1_score(y, y_pred):.4f}")
    print(f"MCC: {matthews_corrcoef(y, y_pred):.4f}\n")

In [19]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyper-parameters, fitted on the scaled training data.
lr_model = LogisticRegression().fit(X_train, y_train)
evaluate("Logistic Regression", lr_model)

--- Logistic Regression Results ---
Accuracy: 0.9900
AUC: 1.0000
Precision: 1.0000
Recall: 0.9762
F1: 0.9880
MCC: 0.9796



In [20]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited tree. random_state is pinned (same seed as the train/test split)
# so repeated runs grow the same tree and report the same metrics.
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)
evaluate("Decision Tree", dt_model)

--- Decision Tree Results ---
Accuracy: 0.8400
AUC: 0.8243
Precision: 0.8095
Recall: 0.8095
F1: 0.8095
MCC: 0.6716



In [22]:
from sklearn.neighbors import KNeighborsClassifier

# k-NN with k=5 on the standardized features.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
evaluate("K-Nearest Neighbor", knn_model)

--- K-Nearest Neighbor Results ---
Accuracy: 0.8200
AUC: 0.9013
Precision: 0.8158
Recall: 0.7381
F1: 0.7750
MCC: 0.6278



In [31]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings.
# NOTE(review): this cell's recorded output comes from a later execution count
# than its neighbours, and its precision/recall appear to imply a different
# number of positives in the test set than the other cells' results do.
# Re-run the notebook top to bottom before comparing models.
nb_model = GaussianNB().fit(X_train, y_train)
evaluate("Naive Bayes", nb_model)

--- Naive Bayes Results ---
Accuracy: 0.9500
AUC: 0.9914
Precision: 0.9750
Recall: 0.9070
F1: 0.9398
MCC: 0.8988



In [24]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree forest. random_state is pinned (same seed as the train/test split)
# so the bootstrap samples and feature draws, and hence the metrics, are repeatable.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
evaluate("Random Forest", rf_model)

--- Random Forest Results ---
Accuracy: 0.8900
AUC: 0.9520
Precision: 0.9189
Recall: 0.8095
F1: 0.8608
MCC: 0.7747



In [25]:
from xgboost import XGBClassifier

# Gradient-boosted trees with the library's default hyper-parameters.
xgb_model = XGBClassifier().fit(X_train, y_train)
evaluate("XGBoost", xgb_model)

--- XGBoost Results ---
Accuracy: 0.9200
AUC: 0.9750
Precision: 0.9048
Recall: 0.9048
F1: 0.9048
MCC: 0.8358



In [32]:
import joblib
import os

# exist_ok=True creates the directory when missing and is a no-op otherwise,
# without a separate existence check.
os.makedirs('model', exist_ok=True)

# Destination path -> fitted object. The scaler is persisted alongside the
# classifiers because every model was fitted on StandardScaler-transformed
# features; new data must go through the same scaler before prediction.
artifacts = {
    'model/logistic.pkl': lr_model,
    'model/decision_tree.pkl': dt_model,
    'model/knn.pkl': knn_model,
    'model/random_forest.pkl': rf_model,
    'model/naive_bayes.pkl': nb_model,
    'model/xgboost.pkl': xgb_model,
    'model/scaler.pkl': scaler,
}
for path, fitted in artifacts.items():
    joblib.dump(fitted, path)
print("All models saved in /model folder!")

All models saved in /model folder!
