In [2]:
#import necessary modules
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
from sklearn.model_selection import train_test_split
from pprint import pprint
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

# files

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
kaggle_input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in kaggle_input_files:
    print(file_path)


In [3]:
# Read the modelling dataset from its CSV file and show the frame as the cell output.
# NOTE(review): this is an absolute path on one specific machine; the notebook only
# runs elsewhere after it is pointed at a local copy of the file.
dataset_csv = r'C:/Users/foeke/OneDrive/Documenten/submitting paper/All scripts - paper/data/GlobalModelData/PredictingDataset.csv'
df = pd.read_csv(dataset_csv, sep=',')
df

Unnamed: 0.1,Unnamed: 0,Longitude,Latitude,mean_value_NO2,nightlight_450,nightlight_3150,population_1000,population_3000,road_class_2_25,road_class_3_3000,road_class_3_300,trop_mean_filt_2019,BldDen100,NDVI,trafBuf25,trafBuf50
0,0,10.008293,48.397079,29.250500,22.585377,13.402575,8123.319336,54316.64453,0.000000,203502.42190,4167.064941,0.000058,1.000663,2310.0,0.000000,0.000000
1,1,10.021227,53.523262,32.098936,23.546194,22.862799,9558.207031,76786.28125,0.000000,99596.04688,1850.278076,0.000081,0.978245,2518.0,0.000000,0.000000
2,2,10.053859,53.592301,58.703744,18.290140,16.493486,10471.541020,88439.60938,46.359188,265563.71880,2016.838379,0.000076,0.914638,3216.0,674.791667,674.791667
3,3,10.081685,53.529020,24.249276,23.645823,18.248226,11165.945310,76961.00000,0.000000,97921.56250,478.363281,0.000074,0.712983,1052.0,0.000000,0.000000
4,4,10.096292,48.847917,19.912116,14.026642,5.514785,6017.511230,30814.58984,0.000000,176265.68750,3338.221191,0.000047,1.000641,3240.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,477,9.979031,48.396836,27.221808,29.130899,12.865678,10829.854490,54732.38281,0.000000,198280.70310,3127.346436,0.000058,0.964609,2607.0,0.000000,0.000000
478,478,9.981599,53.529158,32.763475,33.719387,36.813637,9855.304688,83610.52344,0.000000,117941.50000,1480.759766,0.000088,1.000696,,0.000000,0.000000
479,479,9.990569,53.507930,24.966811,17.919800,25.516184,9541.226562,76877.67188,0.000000,94117.64063,2670.007568,0.000082,0.485096,4338.0,0.000000,0.000000
480,480,9.997256,53.681993,39.256824,13.382681,7.228415,7302.048828,44593.98438,54.294807,171093.43750,2462.917969,0.000063,1.000694,4006.0,674.791667,674.791667


In [4]:
# Remove the unique identifier and the geodata columns so they are not used as predictors.
# errors='ignore' keeps this cell re-runnable: `df` is overwritten here, so on a second
# run the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Unnamed: 0', 'Longitude', 'Latitude'], errors='ignore')
# Replace NaNs with 0s.
# NOTE(review): this zero-fills every column, so a missing predictor (the displayed
# frame shows an empty NDVI cell) is treated as a real 0, and any missing target value
# would become a 0 label as well. Confirm that this is intended.
df = df.fillna(0)
# Initialize the response and predictor variables.
y = df['mean_value_NO2']  # target
x = df.drop(columns=['mean_value_NO2'])  # predictors: every remaining column
feature_list = list(x.columns)  # predictor names, in column order

## == CONSTRUCTING XGBoost == ##

# Single hold-out split: 75% of the rows train the model, 25% are kept for testing.
# The fixed random_state makes the split identical on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=20,
)

# Sanity check: features and labels of each split must have the same number of rows.
for shape_label, split_part in (
    ('Training Features Shape:', X_train),
    ('Training Labels Shape:', Y_train),
    ('Testing Features Shape:', X_test),
    ('Testing Labels Shape:', Y_test),
):
    print(shape_label, split_part.shape)

# Fit an XGBoost regressor with all-default settings on the training split.
xg_reg = xgb.XGBRegressor()
xg_reg.fit(X_train, Y_train)

Training Features Shape: (361, 12)
Training Labels Shape: (361,)
Testing Features Shape: (121, 12)
Testing Labels Shape: (121,)


In [5]:
# Predictions of the default model on both splits, so training and test error can be compared.
predicted_train = xg_reg.predict(X_train)
predicted_test = xg_reg.predict(X_test)

In [6]:
##RMSE
def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both inputs are converted to float NumPy arrays before they are compared,
    so values are paired by position. Without this, subtracting two pandas
    Series pairs them by index label, which silently yields NaN or wrongly
    paired values when the indices differ (for example after a shuffled
    train/test split), and plain Python lists cannot be subtracted at all.

    Parameters
    ----------
    predictions : array-like
        Predicted values (list, NumPy array, pandas Series, ...).
    targets : array-like
        Observed values, in the same order as ``predictions``.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference. A NaN in either input
        propagates to the result instead of being skipped.
    """
    predicted = np.asarray(predictions, dtype=float)
    observed = np.asarray(targets, dtype=float)
    return np.sqrt(np.mean((predicted - observed) ** 2))

# Error metrics of the default model, each reported for the training and the test split.
# A large gap between the two indicates that the model overfits the training data.
print('RMSE training: ',rmse(predicted_train, Y_train))
print('RMSE testing: ', rmse(predicted_test, Y_test))
##R2
print('R2 score training: ', r2_score(Y_train, predicted_train))
print('R2 score testing: ', r2_score(Y_test, predicted_test))
##MEAN ABSOLUTE ERROR
# mean_absolute_error is already imported with the other modules at the top of the
# notebook, so it is not imported again here.
print('MAE training: ', mean_absolute_error(Y_train, predicted_train))
print('MAE testing: ', mean_absolute_error(Y_test, predicted_test))

RMSE training:  0.02433329723664231
RMSE testing:  7.832713864732703
R2 score training:  0.9999962648858242
R2 score testing:  0.6647178618098231
MAE training:  0.017617645377155607
MAE testing:  5.663953396899286


In [7]:
## == HYPERPARAMETER TUNING == ##
# Before changing anything, list the hyperparameters of the default XGBoost model.
current_params = xg_reg.get_params()

print('Parameters currently in use:\n')
pprint(current_params)

Parameters currently in use:

{'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'objective': 'reg:squarederror',
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}


In [8]:
# Second model with hand-picked hyperparameters instead of the defaults:
# many trees (2000) with a very small learning rate (0.002), tree depth capped at 5,
# and explicit gamma / reg_alpha / reg_lambda settings. random_state fixes the seed.
advanced_params = {
    'gamma': 5,
    'reg_alpha': 2,
    'reg_lambda': 0,
    'max_depth': 5,
    'learning_rate': 0.002,
    'n_estimators': 2000,
    'random_state': 42,
}
xgboost_advanced = xgb.XGBRegressor(**advanced_params)
xgboost_advanced.fit(X_train, Y_train)

In [9]:
# Observed targets and the tuned model's predictions for each split.
# Test split:
predicted_y_xgboost_advanced = xgboost_advanced.predict(X_test)
expected_y_xgboost = Y_test

# Training split:
predicted_y_train_xgboost_advanced = xgboost_advanced.predict(X_train)
expected_y_train_xgboost = Y_train

In [10]:
##RMSE
# NOTE(review): rmse is also defined in the earlier metrics cell for the default model;
# this later definition shadows it. Keep the two in sync, or keep only one.
def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both inputs are converted to float NumPy arrays before they are compared,
    so values are paired by position. Without this, subtracting two pandas
    Series pairs them by index label, which silently yields NaN or wrongly
    paired values when the indices differ (for example after a shuffled
    train/test split), and plain Python lists cannot be subtracted at all.

    Parameters
    ----------
    predictions : array-like
        Predicted values (list, NumPy array, pandas Series, ...).
    targets : array-like
        Observed values, in the same order as ``predictions``.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference. A NaN in either input
        propagates to the result instead of being skipped.
    """
    predicted = np.asarray(predictions, dtype=float)
    observed = np.asarray(targets, dtype=float)
    return np.sqrt(np.mean((predicted - observed) ** 2))

# Score the tuned model on both splits with the same three metrics used for the
# default model (RMSE, R2, mean absolute error), then print them in that order.
tuned_scores = [
    ('RMSE training: ', rmse(predicted_y_train_xgboost_advanced, Y_train)),
    ('RMSE testing: ', rmse(predicted_y_xgboost_advanced, Y_test)),
    ('R2 score training: ', r2_score(Y_train, predicted_y_train_xgboost_advanced)),
    ('R2 score testing: ', r2_score(Y_test, predicted_y_xgboost_advanced)),
    ('MAE training: ', mean_absolute_error(Y_train, predicted_y_train_xgboost_advanced)),
    ('MAE testing: ', mean_absolute_error(Y_test, predicted_y_xgboost_advanced)),
]
for score_label, score_value in tuned_scores:
    print(score_label, score_value)

RMSE training:  2.0627913624874656
RMSE testing:  8.034081343706422
R2 score training:  0.9731581423959357
R2 score testing:  0.6472570495279347
MAE training:  1.502620683546438
MAE testing:  5.501551039562429
