In [1]:
import pandas as pd
import numpy as np
from numpy import mean
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
import xgboost as xgb
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Load the red-wine quality dataset (semicolon-separated CSV)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url, sep=";")
# Binarize the target variable: 1 when quality >= 7, otherwise 0
data['quality'] = (data['quality'] >= 7).astype(int)

# Separate the features from the target. No hold-out split is made here;
# train/test partitioning is done by the cross-validation splitter below.
x_data = data.drop('quality', axis=1)
y_data = data['quality']

# Fixed seed for the resamplers so repeated runs give the same scores
# (same value the CV splitter already uses).
SEED = 1
# The splitter does not depend on k, so it is built once outside the loop.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=SEED)

# Sweep the number of nearest neighbours SMOTE uses to synthesize samples
k_values = [1, 2, 3, 4, 5, 6, 7]
for k in k_values:
    # Oversample the minority class with SMOTE, then randomly undersample
    # the majority class; the floats are the target minority/majority ratios.
    over = SMOTE(sampling_strategy=0.2, k_neighbors=k, random_state=SEED)
    under = RandomUnderSampler(sampling_strategy=0.5, random_state=SEED)
    # imblearn pipeline: SMOTE -> undersampling -> XGBoost classifier
    steps = [('over', over), ('under', under), ('model', xgb.XGBClassifier())]
    pipeline = Pipeline(steps=steps)
    # NOTE(review): 'f1_micro' on a single-label problem is numerically the
    # same as accuracy; consider scoring='f1' if the positive-class F1 is
    # what should be compared on this imbalanced target.
    scores = cross_val_score(pipeline, x_data, y_data, scoring='f1_micro', cv=cv, n_jobs=-1)
    score = mean(scores)
    print(k,'%.3f' % score)

1 0.872
2 0.874
3 0.872
4 0.867
5 0.871
6 0.876
7 0.871


In [2]:
# SVM SMOTE
from imblearn.over_sampling import SVMSMOTE
# Seed the sampler so the synthetic samples, and therefore the reported
# score, are repeatable (same seed value as the CV splitter below).
over = SVMSMOTE(random_state=1)
# pipeline: SVMSMOTE oversampling -> XGBoost classifier
steps = [('over', over), ('model', xgb.XGBClassifier())]
pipeline = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# NOTE(review): 'f1_micro' on a single-label problem is numerically the same
# as accuracy, so the "F1 Score" label below is really reporting accuracy;
# consider scoring='f1' for the positive-class F1.
scores = cross_val_score(pipeline, x_data, y_data, scoring='f1_micro', cv=cv, n_jobs=-1)
score = mean(scores)
print('F1 Score:%.3f' % score)

F1 Score:0.899


In [3]:
# ADASYN
from imblearn.over_sampling import ADASYN
# Seed the sampler so the synthetic samples, and therefore the reported
# score, are repeatable (same seed value as the CV splitter below).
over = ADASYN(random_state=1)
# pipeline: ADASYN oversampling -> XGBoost classifier
steps = [('over', over), ('model', xgb.XGBClassifier())]
pipeline = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# NOTE(review): 'f1_micro' on a single-label problem is numerically the same
# as accuracy, so the "F1 Score" label below is really reporting accuracy;
# consider scoring='f1' for the positive-class F1.
scores = cross_val_score(pipeline, x_data, y_data, scoring='f1_micro', cv=cv, n_jobs=-1)
score = mean(scores)
print('F1 Score:%.3f' % score)

F1 Score:0.893
