In [None]:
import pandas as pd
import xgboost
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from scipy.stats import entropy, randint, uniform
from joblib import dump

In [None]:
# Feature groups consumed by the preprocessing step of the pipeline.
numerical_features = [
    'insole_length',
    'ball_girth',
    'ball_width',
    'heel_width',
    'toe_height',
    'heel_spring',
]
categorical_features = ['sports_categories', 'gender_last']

# Load the raw table and derive both helper columns in a single expression,
# so re-running this cell always rebuilds the frame from the CSV.
adidas_data = pd.read_csv('AdidasMatchday20240419.csv').assign(
    # Regression target: row-wise mean of the too_large and too_small
    # return rates.
    target=lambda frame: frame[['too_large', 'too_small']].mean(axis=1),
    # Stratification key "<sports_categories>_<gender_last>", used so that
    # cross-validation folds preserve the share of each combination.
    strata=lambda frame: frame['sports_categories'] + "_" + frame['gender_last'],
)

In [None]:
# 3-fold stratified splitter; shuffling with a fixed seed keeps the fold
# assignment reproducible between runs.
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

# Column-wise preprocessing: standardise the numeric measurements and one-hot
# encode the categorical columns. Columns that are not listed here are not
# forwarded to the model (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category that was absent from the fitting
        # data (a rare stratum missing from a training fold, or a new value at
        # inference time with the saved pipeline) is encoded as all zeros
        # instead of raising inside transform().
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [None]:
# Search space for the randomized hyper-parameter search.
#
# Every key is prefixed with the pipeline step name ("xgb_classifier__") so the
# sampled values are routed to the XGBoost estimator inside the pipeline.
#
# scipy conventions used below:
#   randint(low, high)  -> integers in [low, high)  (high is exclusive)
#   uniform(loc, scale) -> floats in [loc, loc + scale]
# so e.g. uniform(0.3, 0.7) covers 0.3 .. 1.0 and uniform(0.01, 0.2) covers
# 0.01 .. 0.21.
param_dist = {
    f'xgb_classifier__{name}': space
    for name, space in {
        # Ensemble size and tree shape.
        'n_estimators': randint(50, 500),
        'max_depth': randint(3, 10),
        'max_leaves': randint(0, 256),
        'tree_method': ['hist', 'approx', 'exact'],
        # Step size and regularisation-style constraints.
        'learning_rate': uniform(0.01, 0.2),
        'gamma': uniform(0, 0.5),
        'min_child_weight': randint(1, 10),
        'max_delta_step': randint(0, 10),
        # Column subsampling ratios (per tree / per level / per node).
        'colsample_bytree': uniform(0.3, 0.7),
        'colsample_bylevel': uniform(0.3, 0.7),
        'colsample_bynode': uniform(0.3, 0.7),
    }.items()
}


In [None]:
# Final estimator. Note that, despite the "classifier" name (kept because the
# search-space keys use it as the step prefix), this is an XGBRegressor.
# Objective 'reg:logistic': logistic regression, output probability.
xgb_classifier = xgboost.XGBRegressor(objective='reg:logistic')

# Full modelling pipeline: preprocessing first, then the XGBoost estimator.
# The step names are the prefixes used when setting or searching parameters.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb_classifier', xgb_classifier),
])

In [None]:
# Randomized hyper-parameter search over the pipeline, scored with negative
# MAPE (the score is negated so that "greater is better" holds for the search).
# Cross-validation folds come from the stratified splitter, stratified on the
# combined 'strata' column.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50000,
    scoring='neg_mean_absolute_percentage_error',
    cv=skf.split(adidas_data, adidas_data['strata']),
    n_jobs=10,
    random_state=42,
    verbose=1,
)

# The whole frame minus 'target' is passed as X; which of its columns reach the
# model is decided by the pipeline's preprocessing step.
random_search.fit(adidas_data.drop(columns='target'), adidas_data['target'])

# Report the winning configuration and its mean cross-validated score.
print("Best parameters:", random_search.best_params_)
print("Best score (neg MAPE):", random_search.best_score_)

In [None]:
# Show which scorer the search used. A bare `random_search.scorer_` expression
# is only echoed by the notebook when it is the last statement of a cell, so it
# is printed explicitly here.
print("Scorer:", random_search.scorer_)
print("Best parameters:", random_search.best_params_)
print("Best score (neg MAPE):", random_search.best_score_)

# Re-fit the pipeline on the full data set with the best hyper-parameters found.
# NOTE(review): RandomizedSearchCV refits on the full data by default
# (refit=True), so random_search.best_estimator_ should already be an
# equivalent fitted pipeline — confirm whether this extra fit is still needed.
pipeline.set_params(**random_search.best_params_)
pipeline.fit(adidas_data.drop(columns='target'), adidas_data['target'])

# Save the model
dump(pipeline, 'my_pipeline.joblib')

make_scorer(mean_absolute_percentage_error, greater_is_better=False)