# MODELLING

In [1]:
# general sys modules / libraries
import sys
import warnings  
warnings.filterwarnings('ignore') 

# data analysis and visualisation modules / libraries
import numpy as np
from numpy import loadtxt

import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib.ticker import PercentFormatter


import seaborn as sns

import scipy.stats as scs
from scipy import stats

# machine learning modules / libraries
import xgboost as xgb
from xgboost import plot_importance
from xgboost import XGBClassifier
from xgboost import XGBRegressor

from imblearn.over_sampling import RandomOverSampler

import sklearn
from sklearn.dummy import DummyClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Read dataset
data = 'datasets/train_model.csv'
train_model= pd.read_csv(data)

# Finetuning of the Hyper Parameters
***

In [3]:
!pip -q install shap
import shap

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m


In [4]:
import xgboost
import shap

dataset = train_model
# training an XGBoost model
X = train_model.drop(['target'], axis=1)
y = train_model['target']

In [5]:
os = RandomOverSampler(sampling_strategy=1)

In [6]:
X_train, y_train = os.fit_resample(X,y)

In [7]:
# splitting data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [8]:
# finetuning the hyper parameters of xgboost 
def hyperParameterTuning(X_train, y_train):
    param_tuning = {
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 5, 7, 10],
        'min_child_weight': [1, 3, 5]
    }

    xgb_model = XGBRegressor()

    parameter_search = GridSearchCV(estimator = xgb_model,
                           param_grid = param_tuning,                        
                           cv = 5,
                           n_jobs = -1,
                           verbose = 1)

    parameter_search.fit(X_train,y_train)

    return parameter_search.best_params_

In [9]:
# To start the tuning remove the '#' in the next line // ATTENTION: the calculation can take a few minutes
hyperParameterTuning(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


{'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3}

OUTCOME OF Hyper Parameter Tuning: 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3

In [10]:
# fit model to training data
def show_model():
    model = XGBClassifier(learning_rate=0.1, max_depth= 3, min_child_weight= 3).fit(X_train, y_train)
    print(model)

In [11]:
show_model()

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=3, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


# Evaluation of the Model
***

In [12]:
model = XGBClassifier(learning_rate=0.1, max_depth= 3, min_child_weight= 3).fit(X_train, y_train)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]



In [13]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 78.77%


In [14]:
XGB = XGBClassifier(
                learning_rate=0.1, 
                max_depth=3,
                min_child_weight=3
                ).fit(X_train, y_train)

def append_modell_performance(performance, model_name, classifier, X_train, y_train, cv=5):

    # Calculate cross-validation mean performance scores
    cv_scores = cross_validate(classifier, X_train, y_train, cv=cv, return_train_score=True)

    # Append performance dictionary with 
    performance = performance.append({
                            'Model': model_name,
                            'Performance (mean) in Train': cv_scores['train_score'].mean(),
                            'Performance (std) in  Train': cv_scores['train_score'].std(),
                            'Performance (mean) in Test': cv_scores['test_score'].mean(),
                            'Performance (std) in  Test': cv_scores['test_score'].std()
                            }, ignore_index=True)
    return performance

# Create performance overview
performance_table = pd.DataFrame(columns=['Model', 'Performance (mean) in Train','Performance (std) in  Train', 'Performance (mean) in Test','Performance (std) in  Test'])
    
# Define XGBoost and benchmark models
models = [
        {
        'name': 'XGB_tuned',
        'model': XGB,
        },
        {
        'name': 'XGB_standard',
        'model': XGBClassifier(),
        },
        {
        'name': 'Random_Model',
        'model': DummyClassifier(strategy='stratified'),
        }]
 
for model in models:
    performance_table = append_modell_performance(performance_table, 
                                                model['name'], 
                                                model['model'], 
                                                X, y)

# Get an ordered table summarizing the model performances   
performance_table = performance_table.set_index('Model') 
performance_table.sort_values(by='Performance (mean) in Test', ascending=False)



Unnamed: 0_level_0,Performance (mean) in Train,Performance (std) in Train,Performance (mean) in Test,Performance (std) in Test
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
XGB_tuned,0.807025,0.002166,0.797435,0.005753
XGB_standard,0.876624,0.001601,0.788997,0.005075
Random_Model,0.62722,0.001957,0.636172,0.009061
