Classification Pipeline


---

In [1]:
import numpy as np, pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import classification_report
pd.set_option('display.max_columns', 100)

In [2]:
# import datasets
directory = r'C:\Users\Daniel\Documents\Data\WineQuality'
df_red = pd.read_csv(f'{directory}\winequality-red.csv', delimiter = ';')
df_white = pd.read_csv(f'{directory}\winequality-white.csv', delimiter=';')

# label data
df_red['red'] = 1
df_white['red'] = 0

# combine data
df = pd.concat([df_white, df_red], axis=0)

# inspect
df.describe().round(2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,7.22,0.34,0.32,5.44,0.06,30.53,115.74,0.99,3.22,0.53,10.49,5.82,0.25
std,1.3,0.16,0.15,4.76,0.04,17.75,56.52,0.0,0.16,0.15,1.19,0.87,0.43
min,3.8,0.08,0.0,0.6,0.01,1.0,6.0,0.99,2.72,0.22,8.0,3.0,0.0
25%,6.4,0.23,0.25,1.8,0.04,17.0,77.0,0.99,3.11,0.43,9.5,5.0,0.0
50%,7.0,0.29,0.31,3.0,0.05,29.0,118.0,0.99,3.21,0.51,10.3,6.0,0.0
75%,7.7,0.4,0.39,8.1,0.06,41.0,156.0,1.0,3.32,0.6,11.3,6.0,0.0
max,15.9,1.58,1.66,65.8,0.61,289.0,440.0,1.04,4.01,2.0,14.9,9.0,1.0


In [3]:
df.head(2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,0


In [4]:
# Class balance
pct_red = int(df['red'].mean() * 100)
print(f'Within the data {pct_red}% is red, there is an approx. {int(100 / pct_red -1)} to 1 imbalance')

Within the data 24% is red, there is an approx. 3 to 1 imbalance


In [5]:
round(100 * df.corr()[['quality', 'red']].abs().sort_values(by = 'quality', ascending=False))

Unnamed: 0,quality,red
quality,100.0,12.0
alcohol,44.0,3.0
density,31.0,39.0
volatile acidity,27.0,65.0
chlorides,20.0,51.0
red,12.0,100.0
citric acid,9.0,19.0
fixed acidity,8.0,49.0
free sulfur dioxide,6.0,47.0
total sulfur dioxide,4.0,70.0


In [6]:
# Pipeline 1 - classification
sampler = RandomOverSampler().fit_resample(df.iloc[:, :-1].copy(), df.iloc[:, -1].copy())

target = sampler[1]
features = sampler[0]

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.1)

pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('classifier', AdaBoostClassifier())
])

pipeline.fit(x_train, y_train)
pred = pipeline.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       480
           1       0.99      0.99      0.99       500

    accuracy                           0.99       980
   macro avg       0.99      0.99      0.99       980
weighted avg       0.99      0.99      0.99       980

