This code attempts to tune the XGBoosting results with different parameters.  
This code uses the native API of XGboosting, rather than the scikit-learn API of XGboost. The native API has the advantage of automatically finding the best number of boosting rounds, has a built in cross validation feature and allows for custom objective functions. The native API uses Dmatrices which contain the features and the target variable (in this case the carbon intensity). 

In [5]:
#importing modules for the implementation
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt 
from matplotlib.backends.backend_pdf import PdfPages
import xgboost as xgb
from xgboost import XGBRegressor


In [9]:
nutrition_test = pd.read_csv("Test_20.csv")
nutrition_train = pd.read_csv("Train_80.csv")
nutrition_test = nutrition_test.dropna()
nutrition_train = nutrition_train.dropna()
nutrition_train

Unnamed: 0,name,Food Group,Calories,Fat (g),Protein (g),Carbohydrate (g),Sugars (g),Fiber (g),Saturated Fats (g),Calcium (mg),...,Lutein + Zeaxanthin (mcg),"Fatty acids, total monounsaturated (mg)","Fatty acids, total polyunsaturated (mg)",20:5 n-3 (EPA) (mg),22:5 n-3 (DPA) (mg),22:6 n-3 (DHA) (mg),Caffeine (mg),Theobromine (mg),Price per Weight (£/100Gram),GHG(kgco2eq/100g)
0,Cranberry Juice Cocktail Frozen Concentrate Pr...,Beverages,47,0.00,0.01,11.81,9.76,0.0,0.000,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085000,0.1214
1,Broccoli,Vegetables,34,0.37,2.82,6.64,1.70,2.6,0.114,47,...,1403.0,31.0,112.0,0.0,0.0,0.0,0.0,0.0,0.130667,0.0437
2,Rum,Beverages,231,0.00,0.00,0.00,0.00,0.0,0.000,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.142857,0.1585
3,Orange Juice Light No Pulp,Beverages,21,0.00,0.21,5.42,4.17,0.0,0.000,0,...,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069000,0.0317
4,Veggie Burgers,Beans and Lentils,177,6.30,15.70,14.27,1.07,4.9,1.440,136,...,0.0,1778.0,2023.0,0.0,0.0,0.0,0.0,0.0,0.600000,0.2871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
827,Chicken Ground,Meats,201,10.31,27.14,0.00,0.00,0.0,2.647,28,...,0.0,3429.0,2893.0,10.0,12.0,9.0,0.0,0.0,0.500000,0.6092
828,Chicken Broilers Or Fryers Wing Meat Only Cook...,Meats,197,9.53,27.69,0.00,0.00,0.0,2.223,32,...,0.0,3766.0,1323.0,0.0,0.0,0.0,0.0,0.0,0.555556,0.6092
829,Bread White Wheat,Baked Foods,238,2.15,10.66,43.91,5.00,9.2,0.630,684,...,25.0,393.0,973.0,3.0,0.0,0.0,0.0,0.0,0.118750,0.1441
830,Alcoholic Beverage Daiquiri Prepared-From-Recipe,Beverages,186,0.06,0.06,6.94,5.58,0.1,0.006,3,...,0.0,6.0,16.0,0.0,0.0,0.0,0.0,0.0,0.640000,0.1585


In [13]:
#extracting the nutrition data, labelling X as this is the input to the SKlearn algorithm
X_train = nutrition_train.iloc[:,2:42]
nutrition_titles = X_train.columns
X_test = nutrition_test.iloc[:,2:42]
#extracting the greenhouse gas emissions 
y_test = nutrition_test.iloc[:,43:44]
y_train = nutrition_train.iloc[:,43:44]
y_test = np.ravel(y_test)
y_train = np.ravel(y_train)

## Building d matrices of the data 

In [15]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

## Build a baseline model and assess the mean of the carbon intensity, predicting that each food product gets the mean carbon intensity of the train set 


In [20]:
# "Learn" the mean from the training data
from sklearn.metrics import mean_absolute_error
mean_train = np.mean(y_train)
# Get predictions on the test set
baseline_predictions = np.ones(y_test.shape) * mean_train
# Compute MAE
mae_baseline = mean_absolute_error(y_test, baseline_predictions)
print("Baseline MAE is {:.2f}".format(mae_baseline))
print("Baseline Mean from train is {:.2f}".format(mean_train))


Baseline MAE is 0.90
Baseline Mean from train is 0.87


## Setting up params 

In [22]:
params = {
    # Parameters that i am going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
}

## Numboost and early stopping rounds for test set

In [23]:
# num boost round corresponds to the number of boosting rounds or trees to build 
#depends on other params so needed to be retuned 
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

#adding evaluation metric to the params dictionary
#aiming to improve MAE 
params['eval_metric'] = "mae"
#have to pass a max number of boosting rounds, set it really large but will probably find optimal number of trees before reaching 
#
num_boost_round = 999

#evals we are testing on our test data set 
# early stopping rounds is the number of rounds without improvement at which we shoudl stop 
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=20
)

print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

[0]	Test-mae:0.54824
[1]	Test-mae:0.44361
[2]	Test-mae:0.36455
[3]	Test-mae:0.30109
[4]	Test-mae:0.27073
[5]	Test-mae:0.25210
[6]	Test-mae:0.23507
[7]	Test-mae:0.22786
[8]	Test-mae:0.21781
[9]	Test-mae:0.21155
[10]	Test-mae:0.20697
[11]	Test-mae:0.20415
[12]	Test-mae:0.19965
[13]	Test-mae:0.19912
[14]	Test-mae:0.19719
[15]	Test-mae:0.19652
[16]	Test-mae:0.19711
[17]	Test-mae:0.19753
[18]	Test-mae:0.19747
[19]	Test-mae:0.19795
[20]	Test-mae:0.19876
[21]	Test-mae:0.19974
[22]	Test-mae:0.19950
[23]	Test-mae:0.19928
[24]	Test-mae:0.19972
[25]	Test-mae:0.19856
[26]	Test-mae:0.19875
[27]	Test-mae:0.19851
[28]	Test-mae:0.19798
[29]	Test-mae:0.19761
[30]	Test-mae:0.19752
[31]	Test-mae:0.19751
[32]	Test-mae:0.19747
[33]	Test-mae:0.19746
[34]	Test-mae:0.19739
[35]	Test-mae:0.19770
Best MAE: 0.20 with 16 rounds


## Tuning other hyperparameters on the train set (CV)

In [31]:
#dont have to pass it the test dataset as it is splitting the train into nfolds 
#keeping one of the folds for teest purposes 
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=20
)
cv_results

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,0.644792,0.013722,0.691110,0.050351
1,0.482951,0.011244,0.563507,0.048779
2,0.364859,0.009458,0.474381,0.047803
3,0.279253,0.008103,0.413475,0.044625
4,0.218189,0.008952,0.369198,0.040033
...,...,...,...,...
109,0.000649,0.000067,0.263720,0.036337
110,0.000633,0.000050,0.263720,0.036336
111,0.000620,0.000033,0.263722,0.036335
112,0.000613,0.000020,0.263720,0.036337


In [33]:
cv_results['test-mae-mean'].min()


0.2637192

## max_depth and min_child_weight 
### Max depth is the max number of nodes allowed from the root to the farthest leaf of a tree. Deeper trees model more complex relationships (adding more nodes) but deeper and deeper trees make splits less relevent, causing over fitting. Min child weight is the minimum weight (or number of samples if all samples have a weight of 1) required in order to create a new node in the tree. A smaller min child weight allows the algorithm to create children that correspond to fewer samples, more complex trees but again more likely to overfit. 

In [34]:
# list containing the combinations to try of the parmeters 
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]
# running a cv of each of those pairs 
# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=9, min_child_weight=5
	MAE 0.2624126 for 22 rounds
CV with max_depth=9, min_child_weight=6
	MAE 0.26816579999999995 for 17 rounds
CV with max_depth=9, min_child_weight=7
	MAE 0.264638 for 16 rounds
CV with max_depth=10, min_child_weight=5
	MAE 0.260335 for 38 rounds
CV with max_depth=10, min_child_weight=6
	MAE 0.2668154 for 54 rounds
CV with max_depth=10, min_child_weight=7
	MAE 0.2742302 for 37 rounds
CV with max_depth=11, min_child_weight=5
	MAE 0.2561362 for 43 rounds
CV with max_depth=11, min_child_weight=6
	MAE 0.258045 for 79 rounds
CV with max_depth=11, min_child_weight=7
	MAE 0.2709524 for 17 rounds
Best params: 11, 5, MAE: 0.2561362


In [35]:
params['max_depth'] = 11
params['min_child_weight'] = 5

## Subsample and colsample_bytree tuning 
### these control the sampling of the dataset, which occurs at each boosting round. instead of using the whole train set I build a tree on slightly different data at each step, making the model less likely to overfit to a single sample or feature. subsample is the fraction of observations (rows) to subsample at each step. default is 1 meaning it uses all rows. Colsamplebytree is the fraction of features to use, again initially set to 1 meaning use all features

In [36]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [37]:
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0
	MAE 0.2561362 for 43 rounds
CV with subsample=1.0, colsample=0.9
	MAE 0.2623244 for 46 rounds
CV with subsample=1.0, colsample=0.8
	MAE 0.25901240000000003 for 41 rounds
CV with subsample=1.0, colsample=0.7
	MAE 0.28502700000000003 for 37 rounds
CV with subsample=0.9, colsample=1.0
	MAE 0.283728 for 107 rounds
CV with subsample=0.9, colsample=0.9
	MAE 0.290416 for 59 rounds
CV with subsample=0.9, colsample=0.8
	MAE 0.2868442 for 79 rounds
CV with subsample=0.9, colsample=0.7
	MAE 0.308263 for 34 rounds
CV with subsample=0.8, colsample=1.0
	MAE 0.3005972 for 12 rounds
CV with subsample=0.8, colsample=0.9
	MAE 0.2953984 for 16 rounds
CV with subsample=0.8, colsample=0.8
	MAE 0.305223 for 10 rounds
CV with subsample=0.8, colsample=0.7
	MAE 0.2844824 for 34 rounds
CV with subsample=0.7, colsample=1.0
	MAE 0.30370299999999995 for 17 rounds
CV with subsample=0.7, colsample=0.9
	MAE 0.3021992 for 9 rounds
CV with subsample=0.7, colsample=0.8
	MAE 0.304864

In [38]:
# so using all features and rows for this model improves results 
params['subsample'] = 1.
params['colsample_bytree'] = 1.

## ETA paramter tuning 
### This is the learning rate, corresponds to the shrinkage of weights associated to features after each round, defines amount of correction at each step. ( as boosting corrects the errors of the previous round at each round). Usually having a lower eta makes the model more robust to overfitting (so usually the lower the better for the eta) but with the lower eta you need more boosting rounds, taking more time to train eg. see example below

In [43]:
%time
# This can take some time…
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time cv_results = xgb.cv(params,dtrain,num_boost_round=num_boost_round,seed=42,nfold=5,metrics=['mae'],early_stopping_rounds=10)
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

Wall time: 0 ns
CV with eta=0.3
Wall time: 952 ms
	MAE 0.2561362 for 43 rounds

CV with eta=0.2
Wall time: 645 ms
	MAE 0.26285800000000004 for 27 rounds

CV with eta=0.1
Wall time: 1.06 s
	MAE 0.2572242 for 53 rounds

CV with eta=0.05
Wall time: 2.06 s
	MAE 0.2570226 for 107 rounds

CV with eta=0.01
Wall time: 10.7 s
	MAE 0.2555792 for 639 rounds

CV with eta=0.005
Wall time: 16.5 s
	MAE 0.2586348 for 997 rounds

Best params: 0.01, MAE: 0.2555792


### lower and lower eta improves the MAE until 0.01, but does increase the time and number of rounds significantly, it is likely the MAE gets higher again in the eta = 0.005 due to underfitting caused by the small dataset.

In [44]:
params['eta'] = .01


In [45]:
params

{'max_depth': 11,
 'min_child_weight': 5,
 'eta': 0.01,
 'subsample': 1.0,
 'colsample_bytree': 1.0,
 'objective': 'reg:squarederror',
 'eval_metric': 'mae'}

In [46]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:0.70656
[1]	Test-mae:0.69981
[2]	Test-mae:0.69332
[3]	Test-mae:0.68704
[4]	Test-mae:0.68084
[5]	Test-mae:0.67468
[6]	Test-mae:0.66900
[7]	Test-mae:0.66310
[8]	Test-mae:0.65754
[9]	Test-mae:0.65194
[10]	Test-mae:0.64647
[11]	Test-mae:0.64095
[12]	Test-mae:0.63559
[13]	Test-mae:0.63097
[14]	Test-mae:0.62650
[15]	Test-mae:0.62230
[16]	Test-mae:0.61822
[17]	Test-mae:0.61418
[18]	Test-mae:0.60996
[19]	Test-mae:0.60589
[20]	Test-mae:0.60181
[21]	Test-mae:0.59786
[22]	Test-mae:0.59389
[23]	Test-mae:0.58980
[24]	Test-mae:0.58570
[25]	Test-mae:0.58204
[26]	Test-mae:0.57792
[27]	Test-mae:0.57444
[28]	Test-mae:0.57046
[29]	Test-mae:0.56676
[30]	Test-mae:0.56284
[31]	Test-mae:0.55908
[32]	Test-mae:0.55515
[33]	Test-mae:0.55108
[34]	Test-mae:0.54753
[35]	Test-mae:0.54335
[36]	Test-mae:0.53960
[37]	Test-mae:0.53592
[38]	Test-mae:0.53177
[39]	Test-mae:0.52794
[40]	Test-mae:0.52411
[41]	Test-mae:0.52028
[42]	Test-mae:0.51674
[43]	Test-mae:0.51302
[44]	Test-mae:0.50919
[45]	Test-mae:0.5057

[361]	Test-mae:0.19835
[362]	Test-mae:0.19823
[363]	Test-mae:0.19814
[364]	Test-mae:0.19806
[365]	Test-mae:0.19799
[366]	Test-mae:0.19796
[367]	Test-mae:0.19792
[368]	Test-mae:0.19781
[369]	Test-mae:0.19774
[370]	Test-mae:0.19768
[371]	Test-mae:0.19770
[372]	Test-mae:0.19765
[373]	Test-mae:0.19760
[374]	Test-mae:0.19755
[375]	Test-mae:0.19750
[376]	Test-mae:0.19740
[377]	Test-mae:0.19737
[378]	Test-mae:0.19734
[379]	Test-mae:0.19733
[380]	Test-mae:0.19728
[381]	Test-mae:0.19726
[382]	Test-mae:0.19723
[383]	Test-mae:0.19718
[384]	Test-mae:0.19718
[385]	Test-mae:0.19715
[386]	Test-mae:0.19716
[387]	Test-mae:0.19709
[388]	Test-mae:0.19706
[389]	Test-mae:0.19702
[390]	Test-mae:0.19697
[391]	Test-mae:0.19693
[392]	Test-mae:0.19685
[393]	Test-mae:0.19681
[394]	Test-mae:0.19676
[395]	Test-mae:0.19674
[396]	Test-mae:0.19673
[397]	Test-mae:0.19666
[398]	Test-mae:0.19658
[399]	Test-mae:0.19654
[400]	Test-mae:0.19651
[401]	Test-mae:0.19646
[402]	Test-mae:0.19640
[403]	Test-mae:0.19638
[404]	Test-