In [2]:

import pandas as pd
import numpy as np
from numpy import mean
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
import xgboost as xgb
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
# Load the red-wine quality dataset (semicolon-delimited CSV) from the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url, sep=";")
# Binarize the target in place: quality >= 7 becomes 1, everything else 0.
# Later code reads data['quality'] and relies on it already being 0/1.
data['quality'] = (data['quality'] >= 7).astype(int)

# Separate the feature matrix from the target column.
# (No train/test split happens here; evaluation is done by cross-validation below.)
x_data = data.drop('quality', axis=1)
y_data = data['quality']

# Pipeline: random undersampling followed by an XGBoost classifier.
# The sampler is seeded so that re-running the cell draws the same subsamples;
# without random_state the reported score changes from run to run.
steps = [('under', RandomUnderSampler(random_state=1)), ('model', xgb.XGBClassifier())]
pipeline = Pipeline(steps=steps)
# 10-fold stratified CV repeated 3 times -> 30 scores.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# NOTE(review): with a binary target, 'f1_micro' is numerically the same as
# accuracy. If the intent is the F1 of the positive (quality >= 7) class,
# scoring='f1' is the metric to use -- confirm before changing, since it will
# alter every reported number.
scores = cross_val_score(pipeline, x_data, y_data, scoring='f1_micro', cv=cv, n_jobs=-1)
score = mean(scores)
print('F1 Score: %.3f' % score)

F1 Score: 0.798


In [3]:
# Compare different sampling methods
# Separate the feature matrix from the target column (no train/test split;
# evaluation is done by cross-validation). `data` must already hold the
# binarized 'quality' column.
x_data = data.drop('quality', axis=1)
y_data = data['quality']
# Define the samplers. Both are seeded so the comparison is reproducible
# across runs; unseeded samplers give different scores on every execution.
# NOTE(review): a float sampling_strategy is the target minority/majority
# ratio after resampling per the imbalanced-learn docs -- confirm that 0.2
# (over) and 0.5 (under) are the intended ratios.
over = RandomOverSampler(sampling_strategy=0.2, random_state=1)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=1)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Label -> sampler steps to put in front of the model. Insertion order is the
# evaluation order, so after the loop `steps`, `pipeline`, `scores` and `score`
# refer to the Over/Under run.
sampler_configs = {
    'Over': [('over', over)],
    'Under': [('under', under)],
    'Over/Under': [('over', over), ('under', under)],
}
for label, sampler_steps in sampler_configs.items():
    # A fresh classifier per configuration, appended after the sampler step(s).
    steps = sampler_steps + [('model', xgb.XGBClassifier())]
    pipeline = Pipeline(steps=steps)
    # NOTE(review): with a binary target, 'f1_micro' is numerically the same
    # as accuracy. If the positive-class F1 is intended, use scoring='f1' --
    # confirm before changing, since it will alter every reported number.
    scores = cross_val_score(pipeline, x_data, y_data, scoring='f1_micro', cv=cv, n_jobs=-1)
    score = mean(scores)
    print('F1 Score %s: %.3f' % (label, score))

F1 Score Over: 0.909
F1 Score Under: 0.869
F1 Score Over/Under: 0.872
