Classification Pipeline


---

In [1]:
# Libraries
import numpy as np, pandas as pd
pd.set_option('display.max_columns', 100)

# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler

# Classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

# Regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import BayesianRidge


In [2]:
# Data preparation
directory = r'C:\Users\Daniel\Documents\Data\WineQuality'
df_red = pd.read_csv(f'{directory}\winequality-red.csv', delimiter = ';')
df_white = pd.read_csv(f'{directory}\winequality-white.csv', delimiter=';')

# label data
df_red['red'] = 1
df_white['red'] = 0

# combine data
df = pd.concat([df_white, df_red], axis=0).reset_index().drop(columns='index')

# inspect
df.describe().round(2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,7.22,0.34,0.32,5.44,0.06,30.53,115.74,0.99,3.22,0.53,10.49,5.82,0.25
std,1.3,0.16,0.15,4.76,0.04,17.75,56.52,0.0,0.16,0.15,1.19,0.87,0.43
min,3.8,0.08,0.0,0.6,0.01,1.0,6.0,0.99,2.72,0.22,8.0,3.0,0.0
25%,6.4,0.23,0.25,1.8,0.04,17.0,77.0,0.99,3.11,0.43,9.5,5.0,0.0
50%,7.0,0.29,0.31,3.0,0.05,29.0,118.0,0.99,3.21,0.51,10.3,6.0,0.0
75%,7.7,0.4,0.39,8.1,0.06,41.0,156.0,1.0,3.32,0.6,11.3,6.0,0.0
max,15.9,1.58,1.66,65.8,0.61,289.0,440.0,1.04,4.01,2.0,14.9,9.0,1.0


In [3]:
df.sample(4, random_state=0)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red
5316,11.9,0.38,0.51,2.0,0.121,7.0,20.0,0.9996,3.24,0.76,10.4,6,1
5210,9.0,0.46,0.31,2.8,0.093,19.0,98.0,0.99815,3.32,0.63,9.5,6,1
3518,7.5,0.2,0.41,1.2,0.05,26.0,131.0,0.99133,3.19,0.52,11.1,5,0
1622,6.5,0.44,0.49,7.7,0.045,16.0,169.0,0.9957,3.11,0.37,8.7,6,0


In [4]:
# Analyse class balance
pct_red = int(df['red'].mean() * 100)
print(f'Within the data {pct_red}% is red, there is an approx. {int(100 / pct_red -1)} to 1 imbalance')

Within the data 24% is red, there is an approx. 3 to 1 imbalance


In [5]:
# Data correlations
round(100 * df.corr()[['quality', 'red']].abs().sort_values(by = 'quality', ascending=False))

Unnamed: 0,quality,red
quality,100.0,12.0
alcohol,44.0,3.0
density,31.0,39.0
volatile acidity,27.0,65.0
chlorides,20.0,51.0
red,12.0,100.0
citric acid,9.0,19.0
fixed acidity,8.0,49.0
free sulfur dioxide,6.0,47.0
total sulfur dioxide,4.0,70.0


In [6]:
# Pipeline 1 - classification (red / white)
np.random.seed(42)
sampler = RandomOverSampler().fit_resample(df.iloc[:, :-1].copy(), df.iloc[:, -1].copy())

target = sampler[1]
features = sampler[0]

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.5)

pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('classifier', AdaBoostClassifier())
])

pipeline.fit(x_train, y_train)
pred = pipeline.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2453
           1       0.99      0.99      0.99      2445

    accuracy                           0.99      4898
   macro avg       0.99      0.99      0.99      4898
weighted avg       0.99      0.99      0.99      4898



In [7]:
df['red pred'] = pipeline.predict(df.iloc[:, :-1].copy())

In [8]:
# Pipeline 2 - regression (quality labling)
np.random.seed(42)
from sklearn.cluster import MiniBatchKMeans

target = df['quality'].to_numpy()
features = df[['alcohol', 'density', 'volatile acidity', 'chlorides', 'red pred']].to_numpy()

sampler = RandomOverSampler().fit_resample(features, target)

target = sampler[1]
features = sampler[0]

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.5)

pipeline_2 = Pipeline([
    ('clustering', MiniBatchKMeans(n_clusters=8, batch_size=256)),
    ('scaling', MinMaxScaler()),
    ('polynomial', PolynomialFeatures(degree = 3)),
    ('regression', BayesianRidge())
])

pipeline_2.fit(x_train, y_train)
print(f'Regression accuracy: {round(100 * pipeline_2.score(x_test, y_test), 1)} %')

Regression accuracy: 60.0 %


In [9]:
df['qual pred'] = pipeline_2.predict(df[['alcohol', 'density', 'volatile acidity', 'chlorides', 'red pred']].to_numpy())
df['qual pred'] = [int(i) for i in df['qual pred']]

df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red,red pred,qual pred
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,0,0,5
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,0,0,4
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,0,0,4
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,0,0,4
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1,1,5
6493,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1,1,5
6494,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1,1,5
6495,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,1,1,4
