In [1]:
#Cell 1: Import Required Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, matthews_corrcoef, roc_auc_score,
    confusion_matrix
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier


In [2]:
# Cell 2: Load the dataset
# The CSV path is relative to the notebook's working directory.
df = pd.read_csv("../data/wine-quality-white-and-red.csv")

# Report the frame's dimensions, then preview its first rows
print(f"Dataset Shape: {df.shape}")
df.head()


Dataset Shape: (6497, 13)


Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
# Cell 3: Verify dataset properties (very important)
# Column labels
print(f"Columns:\n {df.columns}")
# Distinct values of the target column, in ascending order
print(f"Quality classes: {sorted(df['quality'].unique())}")
# Number of missing entries in each column
print(f"\nMissing values per column:\n {df.isna().sum()}")


Columns:
 Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')
Quality classes: [np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9)]

Missing values per column:
 type                    0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [4]:
# Cell 4: Encode categorical feature (type)
# Replace the wine type column (red/white) with the integer codes produced
# by LabelEncoder. The column is overwritten in df.
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

# Verify encoding: number of rows carrying each encoded value
print(df['type'].value_counts())

type
1    4898
0    1599
Name: count, dtype: int64


In [5]:
# Cell 5: Split features and target
# 'quality' is the label; every remaining column is a feature.
y = df['quality']
X = df.drop(columns='quality')

print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")


Feature matrix shape: (6497, 12)
Target shape: (6497,)


In [6]:
# Cell 6: Train-test split
# 20% of the rows are held out; the split is stratified on y and seeded
# so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train size: {X_train.shape}")
print(f"Test size: {X_test.shape}")


Train size: (5197, 12)
Test size: (1300, 12)


In [7]:
# Cell 7: Feature scaling
# The scaler is fitted on the training split only and then applied to
# both splits, so no test-set statistics enter the fit.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
# PHASE 1 – PART B: Model Training & Evaluation (Multiclass)
#Cell 8: Helper Function for Metrics (Multiclass-Safe)

def evaluate_model(model, X_test, y_test, y_pred, y_proba=None):
    """Compute macro-averaged classification metrics for one set of predictions.

    Args:
        model: Not used by this function.
        X_test: Not used by this function.
        y_test: True labels.
        y_pred: Predicted labels, compared against ``y_test``.
        y_proba: Optional probability scores handed to ``roc_auc_score``.
            When ``None``, the AUC entry is NaN.

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Recall', 'F1', 'MCC'
        and 'AUC' (inserted in that order), each mapped to a Python float.
    """
    # Settings shared by precision, recall and F1: macro averaging, and a
    # score of 0 wherever the underlying division is undefined.
    macro = dict(average='macro', zero_division=0)

    return {
        'Accuracy': float(accuracy_score(y_test, y_pred)),
        'Precision': float(precision_score(y_test, y_pred, **macro)),
        'Recall': float(recall_score(y_test, y_pred, **macro)),
        'F1': float(f1_score(y_test, y_pred, **macro)),
        'MCC': float(matthews_corrcoef(y_test, y_pred)),
        # One-vs-rest AUC is only computed when probability scores exist.
        'AUC': (
            float(
                roc_auc_score(
                    y_test, y_proba, multi_class='ovr', average='macro'
                )
            )
            if y_proba is not None
            else float('nan')
        ),
    }


def round_metrics(metrics, decimals=4):
    """Return a new dict with every metric value rounded to ``decimals`` places.

    The input mapping is left untouched and key order is preserved.
    """
    rounded = {}
    for name, value in metrics.items():
        rounded[name] = round(value, decimals)
    return rounded


In [9]:
# Cell 9: Train Logistic Regression

lr = LogisticRegression(
    max_iter=1000,
    random_state=42
)

# Fit and predict on the standardized features
lr.fit(X_train_scaled, y_train)

y_pred_lr = lr.predict(X_test_scaled)
y_proba_lr = lr.predict_proba(X_test_scaled)

lr_metrics = evaluate_model(lr, X_test_scaled, y_test, y_pred_lr, y_proba_lr)
# Display a rounded copy; lr_metrics itself keeps the unrounded values
round_metrics(lr_metrics)


{'Accuracy': 0.5385,
 'Precision': 0.2977,
 'Recall': 0.2243,
 'F1': 0.2261,
 'MCC': 0.2676,
 'AUC': 0.7823}

In [10]:
# Cell 10: Train Decision Tree
dt = DecisionTreeClassifier(random_state=42)

# Fit and predict on the unscaled features
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
y_proba_dt = dt.predict_proba(X_test)

dt_metrics = evaluate_model(dt, X_test, y_test, y_pred_dt, y_proba_dt)
# Display a rounded copy; dt_metrics itself keeps the unrounded values
round_metrics(dt_metrics)


{'Accuracy': 0.6023,
 'Precision': 0.3283,
 'Recall': 0.3324,
 'F1': 0.3301,
 'MCC': 0.4075,
 'AUC': 0.6241}

In [11]:
# Cell 11: Train K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)

# Fit and predict on the standardized features
knn.fit(X_train_scaled, y_train)

y_pred_knn = knn.predict(X_test_scaled)
y_proba_knn = knn.predict_proba(X_test_scaled)

knn_metrics = evaluate_model(knn, X_test_scaled, y_test, y_pred_knn, y_proba_knn)
# Display a rounded copy; knn_metrics itself keeps the unrounded values
round_metrics(knn_metrics)

{'Accuracy': 0.5369,
 'Precision': 0.3199,
 'Recall': 0.2577,
 'F1': 0.2667,
 'MCC': 0.2902,
 'AUC': 0.684}

In [12]:
# Cell 12: Train Naive Bayes (Gaussian)
nb = GaussianNB()

# Fit and predict on the standardized features
nb.fit(X_train_scaled, y_train)

y_pred_nb = nb.predict(X_test_scaled)
y_proba_nb = nb.predict_proba(X_test_scaled)

nb_metrics = evaluate_model(nb, X_test_scaled, y_test, y_pred_nb, y_proba_nb)
# Display a rounded copy; nb_metrics itself keeps the unrounded values
round_metrics(nb_metrics)

{'Accuracy': 0.3515,
 'Precision': 0.2344,
 'Recall': 0.3425,
 'F1': 0.203,
 'MCC': 0.125,
 'AUC': 0.6901}

In [13]:
# Cell 13: Train Random Forest
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

# Fit and predict on the unscaled features
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)

rf_metrics = evaluate_model(rf, X_test, y_test, y_pred_rf, y_proba_rf)
# Display a rounded copy; rf_metrics itself keeps the unrounded values
round_metrics(rf_metrics)

{'Accuracy': 0.6854,
 'Precision': 0.509,
 'Recall': 0.3529,
 'F1': 0.3836,
 'MCC': 0.5134,
 'AUC': 0.859}

In [14]:
# Cell 14: Train XGBoost (Multiclass)

# The quality labels are re-encoded to consecutive integers starting at 0
# before being given to XGBoost. LabelEncoder is already imported in Cell 1.
y_encoder = LabelEncoder()
y_train_xgb = y_encoder.fit_transform(y_train)
y_test_xgb = y_encoder.transform(y_test)

# Encoding of the quality labels found in this dataset:
#   original: 3  4  5  6  7  8  9
#   encoded:  0  1  2  3  4  5  6
# Predictions come back encoded (e.g. [0, 2, 3, 4, ...]) and are decoded
# with y_encoder.inverse_transform (-> [3, 5, 6, 7, ...]).

xgb = XGBClassifier(
    objective='multi:softprob',
    # Taken from the fitted encoder rather than hard-coded, so it always
    # matches the number of distinct labels in y_train.
    num_class=len(y_encoder.classes_),
    eval_metric='mlogloss',
    random_state=42,
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6
)

# Fit on the unscaled features and the encoded labels
xgb.fit(X_train, y_train_xgb)

y_pred_xgb_enc = xgb.predict(X_test)
y_proba_xgb = xgb.predict_proba(X_test)

# Decode predictions back to the original quality labels
y_pred_xgb = y_encoder.inverse_transform(y_pred_xgb_enc)

# Metrics are computed against the original (undecoded) y_test labels
xgb_metrics = evaluate_model(
    xgb,
    X_test,
    y_test,
    y_pred_xgb,
    y_proba_xgb
)

round_metrics(xgb_metrics)


{'Accuracy': 0.6454,
 'Precision': 0.4379,
 'Recall': 0.3367,
 'F1': 0.3607,
 'MCC': 0.4534,
 'AUC': 0.8342}

In [15]:
# Cell 15: Create final comparison table
# One row per model, one column per metric, holding the unrounded values.
results_df = pd.DataFrame(
    [lr_metrics, dt_metrics, knn_metrics, nb_metrics, rf_metrics, xgb_metrics],
    index=[
        'Logistic Regression',
        'Decision Tree',
        'KNN',
        'Naive Bayes',
        'Random Forest',
        'XGBoost',
    ],
)

results_df


Unnamed: 0,Accuracy,Precision,Recall,F1,MCC,AUC
Logistic Regression,0.538462,0.297696,0.224253,0.226113,0.267623,0.782338
Decision Tree,0.602308,0.328325,0.332372,0.330148,0.407482,0.624064
KNN,0.536923,0.319904,0.257727,0.266702,0.290179,0.684003
Naive Bayes,0.351538,0.234388,0.342489,0.203014,0.124953,0.690147
Random Forest,0.685385,0.508956,0.352895,0.383618,0.513362,0.858961
XGBoost,0.645385,0.437864,0.336683,0.360742,0.453416,0.834169


In [16]:
#PHASE 1 – PART C
#Saving Models, Scaler, Encoder & Test Data
#Saving everything for Streamlit
#All 6 trained models
#Scaler
#Label encoder (for XGBoost)
#Test data (X_test, y_test)
#Final metrics table (optional)
#All model files will be saved in the folder ml-classification-comparison/model/

#Cell 16: Import joblib (for saving objects)
import joblib

In [17]:
# Cell 17: Save the scaler
# Persist the fitted StandardScaler alongside the models.
joblib.dump(value=scaler, filename="../model/scaler.pkl")
print("Scaler saved")

Scaler saved


In [18]:
# Cell 18: Save label encoder (for XGBoost)
# This is the quality-label encoder fitted in Cell 14.
joblib.dump(value=y_encoder, filename="../model/label_encoder.pkl")
print("Label encoder saved")


Label encoder saved


In [19]:
# Cell 19: Save all trained models
# One pickle file per fitted estimator, written to ../model/<name>.pkl.
trained_models = {
    "logistic_regression": lr,
    "decision_tree": dt,
    "knn": knn,
    "naive_bayes": nb,
    "random_forest": rf,
    "xgboost": xgb,
}

for model_name, fitted_model in trained_models.items():
    joblib.dump(fitted_model, f"../model/{model_name}.pkl")

print("All models saved successfully")


All models saved successfully


In [20]:
# Cell 20: Save test dataset (for default evaluation in Streamlit)
# If no CSV is uploaded, the app shows results on this internal test set.
# X_test is stored unscaled, exactly as produced by the train-test split.
joblib.dump(value=X_test, filename="../model/X_test.pkl")
joblib.dump(value=y_test, filename="../model/y_test.pkl")

print("Test data saved")


Test data saved


In [21]:
# Cell 21: Save metrics table
# Stores the unrounded comparison table built in Cell 15.
joblib.dump(value=results_df, filename="../model/results_df.pkl")
print("Metrics table saved")


Metrics table saved


In [22]:
# Verify that all model files were created (one-time check, so left commented out)
#import os

#os.listdir("../model")



In [23]:
#PHASE 2 — Run Streamlit LOCALLY