In [25]:
# STEP 1: Import required library
import pandas as pd

# STEP 2: Define dataset URLs (UCI direct links)

red_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
white_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

# STEP 3: Load datasets directly from URL

red = pd.read_csv(red_url, sep=';')
white = pd.read_csv(white_url, sep=';')


# STEP 4: Save locally in Colab (so you can reuse without re-downloading)
red.to_csv("winequality-red.csv", index=False)
white.to_csv("winequality-white.csv", index=False)

print("Datasets downloaded and saved successfully!")

# STEP 5: Check shape
print("Red Wine Shape:", red.shape)
print("White Wine Shape:", white.shape)

# STEP 6: Preview data
red.head()


Datasets downloaded and saved successfully!
Red Wine Shape: (1599, 12)
White Wine Shape: (4898, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [26]:
# we need to merge the dataset and create classification target

import pandas as pd


# Combine datasets
data = pd.concat([red, white], axis=0, ignore_index=True)

# Check columns BEFORE conversion
print("Columns before conversion:", data.columns)

# Convert quality into Binary Classification
data['quality'] = (data['quality'] >= 7).astype(int)
# Confirm column exists
print("Columns after conversion:", data.columns)
print(data['quality'].value_counts())

# Save final dataset
data.to_csv("winequality_combined_binary.csv", index=False)

print("Final dataset saved!")


print("Dataset Shape:", data.shape)
data.head()


import os
os.getcwd()





Columns before conversion: Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')
Columns after conversion: Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')
quality
0    5220
1    1277
Name: count, dtype: int64
Final dataset saved!
Dataset Shape: (6497, 12)


'/content'

In [27]:
# Let's split and Scale


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = data.drop('quality', axis=1)
y = data['quality']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [28]:
# Let's train the dataset for all 6 models to calculate metrics as asked

!pip install xgboost

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib



models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "kNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=200),
    "XGBoost": XGBClassifier(eval_metric='logloss')
}



results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:,1]

    results.append([
        name,
        accuracy_score(y_test, preds),
        roc_auc_score(y_test, probs),
        precision_score(y_test, preds),
        recall_score(y_test, preds),
        f1_score(y_test, preds),
        matthews_corrcoef(y_test, preds)
    ])

    joblib.dump(model, f"{name}.pkl")


columns = ["ML Model Name","Accuracy","AUC","Precision","Recall","F1","MCC"]
results_df = pd.DataFrame(results, columns=columns)

results_df.to_csv("model_comparison.csv", index=False)
results_df








Unnamed: 0,ML Model Name,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.824615,0.812144,0.607143,0.269841,0.373626,0.321015
1,Decision Tree,0.84,0.762117,0.57971,0.634921,0.606061,0.506759
2,kNN,0.832308,0.822612,0.587629,0.452381,0.511211,0.417198
3,Naive Bayes,0.760769,0.775612,0.422977,0.642857,0.510236,0.374593
4,Random Forest,0.893846,0.92559,0.820225,0.579365,0.67907,0.631121
5,XGBoost,0.889231,0.911218,0.757143,0.630952,0.688312,0.625455


In [29]:
import os
os.getcwd()
os.listdir()

['.config',
 'Random Forest.pkl',
 'winequality-white.csv',
 'Logistic Regression.pkl',
 'Naive Bayes.pkl',
 'kNN.pkl',
 'winequality-red.csv',
 'XGBoost.pkl',
 'scaler.pkl',
 'winequality_combined_binary.csv',
 'model_comparison.csv',
 'Decision Tree.pkl',
 'sample_data']

In [30]:


results_df.to_csv("model_comparison.csv", index=False)
print("File saved successfully!")
import os
os.getcwd()





File saved successfully!


'/content'

In [31]:
# For stremlit we need scalar
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [32]:
# Let's create app.py for Streamlit
!pip install streamlit
import streamlit as st
import pandas as pd
import joblib
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

st.title("Wine Quality Classification")

uploaded_file = st.file_uploader("Upload Test Dataset", type=["csv"])

model_name = st.selectbox("Select Model",
("Logistic Regression","Decision Tree","kNN","Naive Bayes","Random Forest","XGBoost"))


if uploaded_file is not None:
    #data = pd.read_csv(uploaded_file, sep=';')

  # Auto-detect separator
    data = pd.read_csv(uploaded_file, sep=None, engine='python')


    # It's good practice to show columns detected for debugging/user feedback
   # st.write("Columns detected:", data.columns)

    if 'quality' not in data.columns:
        st.error("Uploaded file must contain 'quality' column.")
        st.stop()

    X = data.drop('quality', axis=1)
    y = (data['quality'] >= 7).astype(int)

    # Map dropdown names → actual file names (ensure these match the saved files)
    model_files = {
        "Logistic Regression": "logistic_regression.pkl",
        "Decision Tree": "decision_tree.pkl",
        "kNN": "knn.pkl",
        "Naive Bayes": "naive_bayes.pkl",
        "Random Forest": "random_forest.pkl",
        "XGBoost": "xgboost.pkl"
    }

    # Load scaler and selected model AFTER a file has been uploaded
    scaler = joblib.load("scaler.pkl")
    model = joblib.load(model_files[model_name])
     #   model = joblib.load(f"{model_name}.pkl")

    # Scale the input features
    X = scaler.transform(X)

    # Make predictions
    preds = model.predict(X)

    st.subheader("Classification Report")
    st.text(classification_report(y, preds))

    st.subheader("Confusion Matrix")
    cm = confusion_matrix(y, preds)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    st.pyplot(fig)
    st.subheader("Model Comparison Results")

st.subheader("Below table for Model Comparison/Evaluation Results. If you want to get the Confusion matrix or classification report then pls Browse the dataset like 1.winequality-white.csv or winequality-red.csv or winequality_combined_binary.csv file ")

results = pd.read_csv("model_comparison.csv")
st.dataframe(results)







DeltaGenerator()

In [34]:
#wine-quality-ML-Assignment_Two/
#│-- app.py
#│-- requirements.txt
#│-- README.md
#|-- model/train_models.py
#|-- model/2025ab05161_ML_Assignment.ipynb
#│-- 2025ab05161_ML_Assignment2_Screenshot.png
#│-- model_comparison.csv
#│-- scaler.pkl
#│-- logistic_regression.pkl
#│-- decision_tree.pkl
#│-- knn.pkl
#│-- naive_bayes.pkl
#│-- random_forest.pkl
#│-- xgboost.pkl
#│-- winequality-white.csv
#│-- winequality-red.csv
#│-- winequality_combined_binary.csv
