<a href="https://colab.research.google.com/github/Almonfrey/MAI-Course/blob/main/class_8_log_reg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, accuracy_score, log_loss
from sklearn.model_selection import train_test_split

Load Data

In [None]:
# Fetch the white-wine quality CSV directly from the course repository
url = "https://raw.githubusercontent.com/Almonfrey/MAI-Course/main/data/winequality-white.csv"
df = pd.read_csv(filepath_or_buffer=url)

print("Downloaded Data")
# Preview the first three rows as the cell's displayed output
df.head(3)

Downloaded Data


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6


Target Transformation

In [None]:
# Target transformation: binarize quality into 0 (score < 6) and 1 (score >= 6).
# The column is overwritten in place, so guard against re-running this cell:
# once the labels are already 0/1, a second pass of the threshold rule would
# map every row to 0 and silently destroy the target.
# Assumes the raw scores include values above 1 -- TODO confirm for other datasets.
if df['quality'].max() > 1:
    # Vectorized comparison replaces the row-by-row apply(lambda ...)
    df['quality'] = (df['quality'] >= 6).astype('int64')

Split Dataset

In [None]:
# Hold out 30% of the rows, then cut that hold-out in half, giving
# 70% train / 15% validation / 15% test; the fixed seed keeps the split repeatable
train_set, temp_set = train_test_split(df, random_state=42, test_size=0.3)
val_set, test_set = train_test_split(temp_set, random_state=42, test_size=0.5)

# Report the size of each partition
print(
    f"Train set size: {len(train_set)}\n"
    f"Validation set size: {len(val_set)}\n"
    f"Test set size: {len(test_set)}"
)

Train set size: 3428
Validation set size: 735
Test set size: 735


Build Preprocess Pipeline

In [None]:
# Preprocessing stage: a one-step pipeline that applies StandardScaler to the features
preprocess = Pipeline(steps=[("standardize", StandardScaler())])

Include Predictor and Train Model

In [None]:
# Separate the training features from the binarized quality label
X_train = train_set.drop(columns='quality')
y_train = train_set['quality']

# Chain the preprocessing stage and the logistic-regression classifier into one estimator
pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", LogisticRegression()),
])

# Fit on the training split (kept as the last expression so the fitted pipeline is displayed)
pipeline.fit(X_train, y_train)

Metrics

Train set Evaluation

In [None]:
# Training-set predictions: positive-class probabilities and hard labels
y_train_proba = pipeline.predict_proba(X_train)[:, 1]
y_train_pred = pipeline.predict(X_train)

# Metrics scored against the hard label predictions
accuracy_train, precision_train, recall_train, f1_train = (
    score(y_train, y_train_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
# Metrics scored against the predicted probabilities
loss_train, auc_train = (
    score(y_train, y_train_proba) for score in (log_loss, roc_auc_score)
)

# Emit the report in a single write; the text matches one print per metric
print(
    f"Accuracy Train: {accuracy_train:.3f}\n"
    f"Log loss Train: {loss_train:.3f}\n"
    f"Precision Train: {precision_train:.3f}\n"
    f"Recall Train: {recall_train:.3f}\n"
    f"AUC Train: {auc_train:.3f}\n"
    f"F1 Score Train: {f1_train:.3f}"
)

Accuracy Train: 0.754
Log loss Train: 0.505
Precision Train: 0.780
Recall Train: 0.873
AUC Train: 0.802
F1 Score Train: 0.824


Validation Set Evaluation

In [None]:
# Separate validation features from the label, then predict with the fitted pipeline
X_val = val_set.drop(columns='quality')
y_val = val_set['quality']
y_val_proba = pipeline.predict_proba(X_val)[:, 1]
y_val_pred = pipeline.predict(X_val)

# Metrics scored against the hard label predictions
accuracy_val, precision_val, recall_val, f1_val = (
    score(y_val, y_val_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
# Metrics scored against the predicted probabilities
loss_val, auc_val = (
    score(y_val, y_val_proba) for score in (log_loss, roc_auc_score)
)

# The leading newline keeps a blank line ahead of the report
print(
    f"\nAccuracy Val: {accuracy_val:.3f}\n"
    f"Log loss Val: {loss_val:.3f}\n"
    f"Precision Val: {precision_val:.3f}\n"
    f"Recall Val: {recall_val:.3f}\n"
    f"AUC Val: {auc_val:.3f}\n"
    f"F1 Score Val: {f1_val:.3f}"
)


Accuracy Val: 0.754
Log loss Val: 0.499
Precision Val: 0.784
Recall Val: 0.872
AUC Val: 0.805
F1 Score Val: 0.825


Test Set Evaluation

In [None]:
# Separate test features from the label, then predict with the fitted pipeline
X_test = test_set.drop(columns='quality')
y_test = test_set['quality']
y_test_proba = pipeline.predict_proba(X_test)[:, 1]
y_test_pred = pipeline.predict(X_test)

# Metrics scored against the hard label predictions
accuracy_test, precision_test, recall_test, f1_test = (
    score(y_test, y_test_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
# Metrics scored against the predicted probabilities
loss_test, auc_test = (
    score(y_test, y_test_proba) for score in (log_loss, roc_auc_score)
)

# The leading newline keeps a blank line ahead of the report
print(
    f"\nAccuracy Test: {accuracy_test:.3f}\n"
    f"Log loss Test: {loss_test:.3f}\n"
    f"Precision Test: {precision_test:.3f}\n"
    f"Recall Test: {recall_test:.3f}\n"
    f"AUC Test: {auc_test:.3f}\n"
    f"F1 Score Test: {f1_test:.3f}"
)


Accuracy Test: 0.755
Log loss Test: 0.507
Precision Test: 0.794
Recall Test: 0.870
AUC Test: 0.787
F1 Score Test: 0.830
