<a href="https://colab.research.google.com/github/Addy48/23FE10CSE00457_MLWine/blob/main/ML_LAB_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [16]:
# Load dataset
df = pd.read_csv('/content/winequality-white.csv', sep=';')
df.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [17]:
# Basic statistics
df.describe()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [18]:
# Skewness and Kurtosis
pd.DataFrame({
    'Skewness': df.skew(),
    'Kurtosis': df.kurtosis()
})


Unnamed: 0,Skewness,Kurtosis
fixed acidity,0.647751,2.172178
volatile acidity,1.57698,5.091626
citric acid,1.28192,6.174901
residual sugar,1.077094,3.46982
chlorides,5.023331,37.5646
free sulfur dioxide,1.406745,11.466342
total sulfur dioxide,0.39071,0.571853
density,0.977773,9.793807
pH,0.457783,0.530775
sulphates,0.977194,1.59093


In [19]:
# Correlation with target
df.corr()['quality'].sort_values(ascending=False)


Unnamed: 0,quality
quality,1.0
alcohol,0.435575
pH,0.099427
sulphates,0.053678
free sulfur dioxide,0.008158
citric acid,-0.009209
residual sugar,-0.097577
fixed acidity,-0.113663
total sulfur dioxide,-0.174737
volatile acidity,-0.194723


In [20]:
print("DataFrame type:", type(df))
print("\nColumn data types:\n", df.dtypes)
# Drop string/object columns if any
non_numeric_cols = df.select_dtypes(include=['object']).columns
df = df.drop(columns=non_numeric_cols)


DataFrame type: <class 'pandas.core.frame.DataFrame'>

Column data types:
 fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object


In [21]:
# Convert quality into classes
# Low: <=4, Medium: 5â€“6, High: >=7
df['quality_label'] = df['quality'].apply(
    lambda x: 'Low' if x <= 4 else 'Medium' if x <= 6 else 'High'
)

df[['quality', 'quality_label']].head()


Unnamed: 0,quality,quality_label
0,6,Medium
1,6,Medium
2,6,Medium
3,6,Medium
4,6,Medium


In [22]:
X = df.drop(['quality', 'quality_label'], axis=1)
y = df['quality_label']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled = pd.DataFrame(X_scaled, columns=X.columns)


In [23]:
selector = SelectKBest(score_func=f_classif, k=8)
X_selected = selector.fit_transform(X_scaled, y)

selected_features = X.columns[selector.get_support()]
X_selected = pd.DataFrame(X_selected, columns=selected_features)

selected_features


Index(['fixed acidity', 'volatile acidity', 'residual sugar', 'chlorides',
       'free sulfur dioxide', 'total sulfur dioxide', 'density', 'alcohol'],
      dtype='object')

In [24]:
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)


In [29]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

Accuracy: 0.7489795918367347

Classification Report:
               precision    recall  f1-score   support

        High       0.56      0.25      0.35       227
         Low       0.00      0.00      0.00        30
      Medium       0.77      0.94      0.85       723

    accuracy                           0.75       980
   macro avg       0.44      0.40      0.40       980
weighted avg       0.70      0.75      0.70       980



array([[ 57,   0, 170],
       [  1,   0,  29],
       [ 44,   2, 677]])

In [32]:
# Get class names from the logistic regression model
class_names = log_reg.classes_

# Create a DataFrame for coefficients, transposing to have features as index and classes as columns
coef_matrix = pd.DataFrame(log_reg.coef_.T, index=selected_features, columns=[f'Coefficient_{c}' for c in class_names])

# Create a DataFrame for intercepts, with 'Intercept' as index and classes as columns
intercept_matrix = pd.DataFrame([log_reg.intercept_], index=['Intercept'], columns=[f'Coefficient_{c}' for c in class_names])

# Concatenate the intercept and coefficient DataFrames
final_coef_df = pd.concat([intercept_matrix, coef_matrix])
final_coef_df

Unnamed: 0,Coefficient_High,Coefficient_Low,Coefficient_Medium
Intercept,0.134829,-1.881824,1.746995
fixed acidity,-0.103742,0.167593,-0.063851
volatile acidity,-0.431967,0.516247,-0.08428
residual sugar,0.472348,-0.70131,0.228962
chlorides,-0.230355,0.118455,0.111901
free sulfur dioxide,0.290134,-0.349685,0.059551
total sulfur dioxide,0.02278,-0.151037,0.128257
density,-0.295838,0.477637,-0.181799
alcohol,0.704977,-0.448798,-0.256179
