In [1]:

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
import numpy as np
import statistics
from sklearn.metrics import f1_score
import xgboost as xgb
# Load the red-wine quality dataset (semicolon-separated CSV).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url, sep=";")

# Binarize the target variable: 1 when quality >= 7, otherwise 0.
# A vectorized comparison replaces the per-row Python loop; the explicit
# int64 cast keeps the dtype the list-of-ints assignment used to produce.
data['quality'] = (data['quality'] >= 7).astype('int64')

# Separate the feature columns from the target column.
# (No train/test split happens here; the folds are created by the CV loop.)
x_data = data.drop('quality', axis=1)
y_data = data['quality']

# ATTENTION: data leakage -- the class ratio below is computed from the whole
# dataset, i.e. it also sees the rows that later serve as validation folds.
# Base weight = (number of class-0 rows) / (number of class-1 rows).
class_counts = y_data.value_counts()
pos_weight = float(class_counts.loc[0]) / class_counts.loc[1]
# Candidate weights to sweep: fixed percentages of the base class ratio.
pos_weight_list = [pos_weight * p / 100 for p in [25, 50, 75, 100, 125, 150, 175, 200, 300]]

# Convert to NumPy arrays so the CV loop can select rows by integer position.
# (The former `x_data = ndarray = ...` chained assignment also bound a stray
# global named `ndarray`; that accidental alias is dropped.)
x_data = x_data.to_numpy()
y_data = y_data.to_numpy()


In [2]:
# Define CV: 10-fold split over the row indices of x_data.
# Alternative that was tried and left disabled:
#kf =  RepeatedKFold(n_splits=10, n_repeats=6)
kf = KFold(n_splits=10)
# Scoring Function
# NOTE(review): `scoring` is not consumed anywhere in this cell; the loop
# below calls f1_score(..., average='weighted') directly. Confirm whether the
# intended metric is the binary (positive-class) F1 or the weighted F1.
scoring = 'f1'

f1_scores_all = []  # mean F1 across folds, one entry per candidate weight
f1_scores_std = []  # sample standard deviation of F1 across folds
# The loop variable is deliberately not called `pos_weight`: that name holds
# the base class ratio computed in the data-preparation cell, and reusing it
# here would silently overwrite it with the last candidate in the list.
for candidate_weight in pos_weight_list:
    f1_scores = []
    for train_index, test_index in kf.split(x_data):
        X_train, X_test = x_data[train_index], x_data[test_index]
        y_train, y_test = y_data[train_index], y_data[test_index]
        # Fresh estimator per fold so no fitted state can carry over
        # from one fold to the next.
        model = xgb.XGBClassifier(scale_pos_weight=candidate_weight)
        model.fit(X_train, y_train)
        # Make Prediction
        y_pred = model.predict(X_test)
        results = f1_score(y_test, y_pred, average='weighted')
        f1_scores.append(results)
    f1_scores_all.append(statistics.mean(f1_scores))
    f1_scores_std.append(statistics.stdev(f1_scores))

# One row per candidate weight, in the same order as pos_weight_list.
results_df = pd.DataFrame({
    'Pos_weight': pos_weight_list,
    'F1 Score': f1_scores_all,
    'F1 Std': f1_scores_std,
})


print(results_df)



   Pos_weight  F1 Score    F1 Std
0    1.592166  0.868207  0.077945
1    3.184332  0.860613  0.073390
2    4.776498  0.865022  0.067138
3    6.368664  0.861176  0.076750
4    7.960829  0.859048  0.071296
5    9.552995  0.862301  0.069359
6   11.145161  0.862459  0.073061
7   12.737327  0.857879  0.073041
8   19.105991  0.862613  0.067225
