# XG Boost Model

Try out the XG Boost Models with an ML Flow set-up.

In [45]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np

# Mlflow
import mlflow
import mlflow.sklearn
import mlflow.xgboost

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
import xgboost as xgb
from xgboost import plot_importance, plot_tree
plt.style.use('fivethirtyeight')

# Model Evaluiation
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## Read and prepare the data

In [9]:
os.listdir('data')

['20191029_Master_GRW-Fördergebiet 2014-2020 (Kreise Status).xlsx',
 'autobahn',
 'description_of_vars.pdf',
 'df_final_with_dates.csv',
 'Kopie von 190320_Abgrenzung AMR nach 330. GRW-UAS_endgültig_mit Namen.xlsx',
 'monthly_toll_stats']

In [10]:
data = pd.read_csv('data\df_final_with_dates.csv')
data.shape

(18509, 210)

In [11]:
data.head()

Unnamed: 0,cluster,kreis,ags5,ags2,supermarkets_population,supermarkets_average_distance,public_transport_availability,average_distance_bus_stop,average_distance_train_station,average_distance_public_transport,...,grw_funding_framework,settlement_structure_type_of_labor_market_region,room_type_location,district_settlement_structure,type_of_settlement_structure,urban_/_rural,metropolitan_region,metropolitan_area,east_west,border_proximity
0,2,Regionalverband Saarbrücken,10041,10,76,734,52,260,1804,255,...,6,1,1,2,1,1,99,99,1,1
1,2,Regionalverband Saarbrücken,10041,10,76,734,52,260,1804,255,...,6,1,1,2,1,1,99,99,1,1
2,2,Regionalverband Saarbrücken,10041,10,76,734,52,260,1804,255,...,6,1,1,2,1,1,99,99,1,1
3,2,Regionalverband Saarbrücken,10041,10,76,734,52,260,1804,255,...,6,1,1,2,1,1,99,99,1,1
4,2,Regionalverband Saarbrücken,10041,10,76,734,52,260,1804,255,...,6,1,1,2,1,1,99,99,1,1


In [20]:
data['variable'].unique()

array(['unemployed', 'unemployment_rate',
       'underemployment_without_short_time _work',
       'unemployment_benefit_recipients', 'registerd_jobs',
       'unemployment_benefit_entitled',
       'employees_social_security_at_residence',
       'employees_social_security_at_residenceemployees_social_security_at_work',
       'displayed_short_time_work_companies',
       'displayed_short_time_work_people',
       'realized_short_time_work_companies',
       'realized_short_time_work_people'], dtype=object)

In [17]:
# Check out data columns
cols = list(data.columns)
cols.sort()
cols

['2018_men_population',
 '2018_population',
 '2018_population_10_to_15',
 '2018_population_15_to_18',
 '2018_population_18_to_20',
 '2018_population_20_to_25',
 '2018_population_25_to_30',
 '2018_population_30_to_35',
 '2018_population_35_to_40',
 '2018_population_3_to_6',
 '2018_population_40_to_45',
 '2018_population_45_to_50',
 '2018_population_50_to_55',
 '2018_population_55_to_60',
 '2018_population_60_to_65',
 '2018_population_65_to_75',
 '2018_population_6_to_10',
 '2018_population_above_75',
 '2018_population_under_3_yrs',
 '2018_women_population',
 '2019_population',
 '2019_population_15_to_35',
 '2019_population_35_to_60',
 '2019_population_5_to_15',
 '2019_population_60_to_80',
 '2019_population_above_80',
 '2019_population_under_5',
 'Commute_within_150km',
 'Commute_within_300km',
 'Commute_within_50km',
 'Commuter_Balance',
 'Number_of_Commuters_on_place_of_residence)',
 'Number_of_Commuters_place_of_work',
 'Proportion_of_in_commuters',
 'Relative_Commuter_Balance',
 'Re

In [14]:
data[['number_of_companies_agriculture_x', 'number_of_companies_agriculture_y']]

Unnamed: 0,number_of_companies_agriculture_x,number_of_companies_agriculture_y
0,82,97.0
1,82,97.0
2,82,97.0
3,82,97.0
4,82,97.0
...,...,...
18504,96,100.0
18505,96,100.0
18506,96,100.0
18507,96,100.0


### Dummy Example

In [23]:
# Import data 
data = pd.read_csv('data/df_stationary.csv')
data.shape

(401, 169)

In [25]:
data.head()

Unnamed: 0,kreis,ags5,ags2,supermarkets_population,supermarkets_average_distance,public_transport_availability,average_distance_bus_stop,average_distance_train_station,average_distance_public_transport,Unnamed:_0,...,median_income,purchasing_power_per_household,purchasing_power_per_person,debtor_quota,household_in_income_calss_1,household_in_income_calss_2,household_in_income_calss_3,household_in_income_calss_4,household_in_income_calss_5,household_in_income_calss_6
0,"Flensburg, Stadt",1001,1,92,500,35,240,2901,240,0,...,2986,34496,19556,16.0,13209,12385,13092,5441,3096,3497
1,"Kiel, Landeshauptstadt",1002,1,92,460,37,268,2037,265,1,...,3304,35246,19612,12.1,22126,38523,39483,18567,11162,7841
2,"Lübeck, Hansestadt",1003,1,90,532,37,297,1927,294,2,...,3036,37219,20820,15.1,22881,30159,32571,15805,10999,9039
3,"Neumünster, Stadt",1004,1,85,588,37,316,1648,313,3,...,2842,38141,19561,17.9,7208,13291,10435,3812,3201,2805
4,Dithmarschen,1051,1,51,1864,35,448,3517,443,4,...,2914,46945,23486,12.8,11806,11575,17816,9873,7834,7676


In [28]:
# Create X and y 
X = data.drop(['kreis', 'ags5', 'ags2', 'supermarkets_population'], axis=1)
y = data['supermarkets_population']

## Set up ML Flow Training Function

In [57]:
def train_xgb(X, y, params, run_name='xgb_model_run'):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("X Train Size:", X_train.shape)
    print("X Test Size:", X_test.shape)
    print("y Train Size:", y_train.shape)
    print("y Train Size:", y_test.shape)
    
    print("\n")

    with mlflow.start_run(run_name=run_name):

        reg = xgb.XGBRegressor(**params)
        reg.fit(X_train, y_train)
        
        # Calculate prediction eval metrics
        mse = mean_squared_error(y_test, reg.predict(X_test))
        r2 = r2_score(y_test, reg.predict(X_test))
        
        print("Model Run Statistics")
        print(f"MSE: {mse}")
        print(f"R2 Score: {r2}")
        
        # Log params
        mlflow.log_params(params)
        mlflow.log_param('X_vars', str(list(X.columns)))
        
        # Log metrics
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("r2", r2)
              

        mlflow.xgboost.log_model(reg, "model")
    
    return reg

In [58]:
params_1 = {'n_estimators': 500,
            'max_depth':4, 
            'min_samples_split':5,
            'learning_rate': 0.01, 
            'loss':'ls', 
            'verbosity':1}
train_xgb(X, y, params_1, run_name='test_run2')

X Train Size: (320, 165)
X Test Size: (81, 165)
y Train Size: (320,)
y Train Size: (81,)


Parameters: { "loss", "min_samples_split" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Model Run Statistics
MSE: 13.355372910573935
R2 Score: 0.9378049599634064


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, loss='ls', max_delta_step=0, max_depth=4,
             min_child_weight=1, min_samples_split=5, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=8,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=1)