In [1]:
import sklearn
import xgboost
import pandas as pd
import scipy
import numpy as np
import _pickle as cPickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    auc,
    accuracy_score,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import (
    cross_val_score,
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    train_test_split,
)
import xgboost as xgb
from scipy.stats import uniform, randint


In [2]:
# Load the pickled object stored at ../Data/test_optimize.pickle into `df`.
# Later cells call `.pop`, `.T` and column indexing on it, so it is presumably
# a pandas DataFrame — TODO confirm against whatever wrote the file.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
with open(r"../Data/test_optimize.pickle", "rb") as inputfile:
    df = cPickle.load(inputfile)


In [3]:
# Split off the regression target.
# NOTE: `pop` removes the column from `df` in place, so re-running this cell
# without first reloading `df` raises KeyError.
y = df.pop("FPL_points")

# Relabel the remaining columns 0..n-1 so features can be addressed by position.
# Assigning the labels directly (instead of transposing, resetting the index and
# transposing back) keeps every column's original dtype: a round trip through
# `.T` coerces a mixed-type frame to `object` and copies the data several times.
df = df.set_axis(pd.RangeIndex(df.shape[1]), axis=1)

# The first two columns are one-hot encoded; every remaining column is
# standardised.
categorical_features = df.columns[:2].to_list()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

numeric_features = df.columns[2:].to_list()
numeric_transformer = StandardScaler()

X = df[categorical_features + numeric_features]

print(X.shape)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Preprocessing and the regressor are wrapped in one pipeline so the
# transformers are fitted only on the data passed to `fit`.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor()),
    ]
)

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)


(532, 131)


In [4]:
def report_best_scores(results, n_top=3):
    """Print a short report for the best-ranked candidates of a search.

    Parameters
    ----------
    results : mapping
        Must provide the entries 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params', each indexable by candidate position
        (this notebook passes the ``cv_results_`` of a fitted search).
    n_top : int, default 3
        Highest rank to report. Ranks 1 through ``n_top`` are printed; when
        several candidates share a rank, each of them is printed.

    Returns
    -------
    None
        The report is written to standard output.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate that holds this rank (ties included).
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for pos in tied_positions:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][pos]
            score_std = results['std_test_score'][pos]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, score_std))
            candidate_params = results['params'][pos]
            print("Parameters: {0}".format(candidate_params))
            print("")



def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : array-like of numbers
        Values to summarise; printed as-is on the first line.

    Returns
    -------
    None
        The summary is written to standard output, with mean and standard
        deviation (``np.mean`` / ``np.std``) shown to three decimals.
    """
    mean_score = np.mean(scores)
    score_std = np.std(scores)
    summary = "Scores: {0}\nMean: {1:.3f}\nStd: {2:.3f}".format(scores, mean_score, score_std)
    print(summary)



In [5]:
# Search space for the pipeline's 'model' step (the `model__` prefix addresses
# that step's parameters).
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low up to, but excluding, high.
params = {
    'model__colsample_bytree': uniform(0.7, 0.3),  # [0.7, 1.0]
    'model__gamma': uniform(0, 0.5),               # [0.0, 0.5]
    'model__learning_rate': uniform(0.03, 0.3),    # [0.03, 0.33]
    'model__max_depth': randint(2, 6),             # 2, 3, 4 or 5
    'model__n_estimators': randint(100, 150),      # 100 .. 149
    'model__subsample': uniform(0.6, 0.4),         # [0.6, 1.0]
}

# 1000 sampled candidates, each scored with 10-fold cross-validation
# (10000 fits in total); the seed makes the sampled candidates repeatable.
search = RandomizedSearchCV(
    model,
    param_distributions=params,
    n_iter=1000,
    cv=10,
    random_state=3791,
    verbose=1,
    return_train_score=True,
)

search.fit(X_train, y_train)

report_best_scores(search.cv_results_)


Fitting 10 folds for each of 1000 candidates, totalling 10000 fits
Model with rank: 1
Mean validation score: 0.500 (std: 0.203)
Parameters: {'model__colsample_bytree': 0.867380551004832, 'model__gamma': 0.03592961391047372, 'model__learning_rate': 0.040134915053390535, 'model__max_depth': 2, 'model__n_estimators': 108, 'model__subsample': 0.9109644195547812}

Model with rank: 2
Mean validation score: 0.495 (std: 0.201)
Parameters: {'model__colsample_bytree': 0.7426914163587637, 'model__gamma': 0.19740457924348037, 'model__learning_rate': 0.08368080320769633, 'model__max_depth': 2, 'model__n_estimators': 102, 'model__subsample': 0.6298810235374472}

Model with rank: 3
Mean validation score: 0.495 (std: 0.196)
Parameters: {'model__colsample_bytree': 0.7943815704995927, 'model__gamma': 0.4353718179442924, 'model__learning_rate': 0.032437790762516544, 'model__max_depth': 2, 'model__n_estimators': 118, 'model__subsample': 0.8120542441705603}



In [6]:
# Predict the held-out rows with the best pipeline found by the search.
y_pred = search.best_estimator_.predict(X_test)

# Mean squared error between the held-out targets and the predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse


1024.623803328373

In [7]:
# Root-mean-squared error on the held-out rows.
# The square root is taken explicitly instead of passing `squared=False`:
# that keyword is deprecated and later removed in newer scikit-learn releases,
# whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

32.00974544304239

In [8]:
# Coefficient of determination (R^2) of the predictions on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

0.5561560230398466

In [9]:
# Side-by-side table of predictions and true targets: built with one row per
# series and then transposed, so each held-out row label (from `y_test.index`)
# ends up with a 'predict' and a 'test' column.
check = pd.DataFrame(
    [y_pred, y_test],
    index=['predict', 'test'],
    columns=y_test.index,
).T

In [10]:
# Show the comparison table ordered from the highest true target downwards.
check.sort_values('test', ascending=False)

Unnamed: 0,predict,test
318,99.613640,154.0
501,76.482697,144.0
66,106.940025,141.0
155,79.218300,135.0
518,118.022896,134.0
...,...,...
144,42.344116,0.0
107,42.964386,0.0
441,65.967751,0.0
102,43.934017,0.0
