In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_log_error as rmsle
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import itertools

In [2]:
# Load the competition training and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
# sample = pd.read_csv("sample_submission.csv")

# Preview the first rows of the training table.
train.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [3]:
# Numeric (non-categorical) predictor columns.
noncat = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

# Pairwise correlation matrix of the numeric predictors together with the target.
train.loc[:, [*noncat, 'Calories']].corr()

Unnamed: 0,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
Age,1.0,0.011975,0.07369,0.015656,0.017037,0.030275,0.145683
Height,0.011975,1.0,0.957967,-0.029936,-0.013234,-0.034641,-0.004026
Weight,0.07369,0.957967,1.0,-0.020845,-0.002384,-0.023717,0.015863
Duration,0.015656,-0.029936,-0.020845,1.0,0.875327,0.903066,0.959908
Heart_Rate,0.017037,-0.013234,-0.002384,0.875327,1.0,0.795972,0.908748
Body_Temp,0.030275,-0.034641,-0.023717,0.903066,0.795972,1.0,0.828671
Calories,0.145683,-0.004026,0.015863,0.959908,0.908748,0.828671,1.0


In [3]:
# Regression target.
y = train['Calories']

# Feature frame: every column except the row id and the target, with 'Sex'
# stored as a pandas categorical and two derived columns appended.
X = (
    train
    .drop(columns=['id', 'Calories'])
    .assign(
        Sex=lambda frame: frame['Sex'].astype('category'),
        # Weight divided by the square of (Height / 100)
        BMI=lambda frame: frame['Weight'] / ((frame['Height'] / 100) ** 2),
        # Product of exercise duration and heart rate
        Intensity=lambda frame: frame['Duration'] * frame['Heart_Rate'],
    )
)

X.head()

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,BMI,Intensity
0,male,36,189.0,82.0,26.0,101.0,41.0,22.955684,2626.0
1,female,64,163.0,60.0,8.0,85.0,39.7,22.582709,680.0
2,female,51,161.0,64.0,7.0,84.0,39.8,24.690405,588.0
3,male,20,192.0,90.0,25.0,105.0,40.7,24.414062,2625.0
4,female,38,166.0,61.0,25.0,102.0,40.6,22.13674,2550.0


In [9]:
# Hyperparameter grid explored in this search round.
max_depth = [10,12]
learning_rate = [0.1, 0.05, 0.02]
number_boosts = [100, 1000]
subsamp = [0.9, 1]

# Maps a label describing each hyperparameter combination to its mean
# cross-validated RMSLE (lower is better).
model_err = {}

# Build the folds once with a fixed seed so that every combination is scored
# on exactly the same splits and the search is reproducible between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=212)
folds = list(kf.split(X))

for md, lr, nbr, ss in itertools.product(max_depth, learning_rate, number_boosts, subsamp):
    # Booster parameters are the same for every fold of this combination.
    params = {
        'max_depth': md,
        'eta': lr,                           # learning rate
        'objective': 'reg:squaredlogerror',  #
        'subsample': ss,                     # subsample rate
        'device': 'cuda'
    }

    # Per-fold RMSLE for this combination.
    scores = []

    for fit_idx, val_idx in folds:
        # KFold yields positional indices, so rows are selected with .iloc.
        # Fold-local names are used so the loop does not overwrite the
        # notebook-level X_train / y_train / model used by later cells.
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        dfit = xgb.DMatrix(X_fit, label=y_fit, enable_categorical=True)

        # Train on the fitting part of the fold.
        fold_model = xgb.train(params, dfit, num_boost_round=nbr)

        # Predict the held-out part; non-positive predictions are replaced by
        # a fallback value of 10 before the log-based metric is computed.
        y_pred = fold_model.predict(xgb.DMatrix(X_val, enable_categorical=True))
        y_pred[y_pred <= 0] = 10
        scores.append(rmsle(y_val, y_pred))

    mean_score = np.mean(scores)

    # Report the mean RMSLE over all folds.
    print(f"Mean RMSLE for max_depth={md}, learn_rate={lr}, num_boost={nbr}, subsample={ss}: {mean_score}")

    model_err[f"max_depth_{md}_learn_rate_{lr}_num_boost_{nbr}_subsample_{ss}"] = mean_score

# Find the combination with the smallest mean RMSLE.
min_key = min(model_err, key=model_err.get)

# Retrieve the minimum value
min_value = model_err[min_key]

# Print the result
print(f"The key with the minimum value is '{min_key}' with a value of {min_value}.")

Mean Accuracy for max_depth=10, learn_rate=0.1, num_boost=100, subsample=0.9: 0.08712032640656196
Mean Accuracy for max_depth=10, learn_rate=0.1, num_boost=100, subsample=1: 0.08593389439009799
Mean Accuracy for max_depth=10, learn_rate=0.1, num_boost=1000, subsample=0.9: 0.06341848979606227
Mean Accuracy for max_depth=10, learn_rate=0.1, num_boost=1000, subsample=1: 0.06327652293058057
Mean Accuracy for max_depth=10, learn_rate=0.05, num_boost=100, subsample=0.9: 0.7329423184610636
Mean Accuracy for max_depth=10, learn_rate=0.05, num_boost=100, subsample=1: 0.7323792443788555
Mean Accuracy for max_depth=10, learn_rate=0.05, num_boost=1000, subsample=0.9: 0.06411701500723425
Mean Accuracy for max_depth=10, learn_rate=0.05, num_boost=1000, subsample=1: 0.06396488711495237
Mean Accuracy for max_depth=10, learn_rate=0.02, num_boost=100, subsample=0.9: 2.2721793471539735
Mean Accuracy for max_depth=10, learn_rate=0.02, num_boost=100, subsample=1: 2.2721596746571535
Mean Accuracy for max_de

In [6]:
# Hyperparameter grid explored in this search round.
max_depth = [25, 50]
learning_rate = [0.1, 0.05]
number_boosts = [100, 150]
subsamp = [0.75, 1]

# Maps a label describing each hyperparameter combination to its mean
# cross-validated RMSLE (lower is better).
model_err = {}

# Build the folds once with a fixed seed so that every combination is scored
# on exactly the same splits and the search is reproducible between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=212)
folds = list(kf.split(X))

for md, lr, nbr, ss in itertools.product(max_depth, learning_rate, number_boosts, subsamp):
    # Booster parameters are the same for every fold of this combination.
    params = {
        'max_depth': md,
        'eta': lr,                           # learning rate
        'objective': 'reg:squaredlogerror',  #
        'subsample': ss,                     # subsample rate
        #'device': 'cuda'
    }

    # Per-fold RMSLE for this combination.
    scores = []

    for fit_idx, val_idx in folds:
        # KFold yields positional indices, so rows are selected with .iloc.
        # Fold-local names are used so the loop does not overwrite the
        # notebook-level X_train / y_train / model used by later cells.
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        dfit = xgb.DMatrix(X_fit, label=y_fit, enable_categorical=True)

        # Train on the fitting part of the fold.
        fold_model = xgb.train(params, dfit, num_boost_round=nbr)

        # Predict the held-out part; non-positive predictions are replaced by
        # a fallback value of 10 before the log-based metric is computed.
        y_pred = fold_model.predict(xgb.DMatrix(X_val, enable_categorical=True))
        y_pred[y_pred <= 0] = 10
        scores.append(rmsle(y_val, y_pred))

    mean_score = np.mean(scores)

    # Report the mean RMSLE over all folds.
    print(f"Mean RMSLE for max_depth={md}, learn_rate={lr}, num_boost={nbr}, subsample={ss}: {mean_score}")

    model_err[f"max_depth_{md}_learn_rate_{lr}_num_boost_{nbr}_subsample_{ss}"] = mean_score

# Find the combination with the smallest mean RMSLE.
min_key = min(model_err, key=model_err.get)

# Retrieve the minimum value
min_value = model_err[min_key]

# Print the result
print(f"The key with the minimum value is '{min_key}' with a value of {min_value}.")

Mean Accuracy for max_depth=25, learn_rate=0.1, num_boost=100, subsample=0.75: 0.08906362807584735
Mean Accuracy for max_depth=25, learn_rate=0.1, num_boost=100, subsample=1: 0.0853231350492397
Mean Accuracy for max_depth=25, learn_rate=0.1, num_boost=150, subsample=0.75: 0.07076738667215557
Mean Accuracy for max_depth=25, learn_rate=0.1, num_boost=150, subsample=1: 0.06845480815950662
Mean Accuracy for max_depth=25, learn_rate=0.05, num_boost=100, subsample=0.75: 0.7339933313752841
Mean Accuracy for max_depth=25, learn_rate=0.05, num_boost=100, subsample=1: 0.7323663754278312
Mean Accuracy for max_depth=25, learn_rate=0.05, num_boost=150, subsample=0.75: 0.18861005111910173
Mean Accuracy for max_depth=25, learn_rate=0.05, num_boost=150, subsample=1: 0.1834401854216608
Mean Accuracy for max_depth=50, learn_rate=0.1, num_boost=100, subsample=0.75: 0.08914222158197238
Mean Accuracy for max_depth=50, learn_rate=0.1, num_boost=100, subsample=1: 0.0853739650273311
Mean Accuracy for max_dept

In [7]:
# Hyperparameter grid explored in this search round.
max_depth = [8, 12, 16, 18, 24, 28]
learning_rate = [0.15, 0.1, 0.05]
number_boosts = [100, 150, 200, 250]
subsamp = [0.75, 1]

# Maps a label describing each hyperparameter combination to its mean
# cross-validated RMSLE (lower is better).
model_err = {}

# Build the folds once with a fixed seed so that every combination is scored
# on exactly the same splits and the search is reproducible between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=212)
folds = list(kf.split(X))

for md, lr, nbr, ss in itertools.product(max_depth, learning_rate, number_boosts, subsamp):
    # Booster parameters are the same for every fold of this combination.
    params = {
        'max_depth': md,
        'eta': lr,                           # learning rate
        'objective': 'reg:squaredlogerror',  #
        'subsample': ss,                     # subsample rate
        'device': 'cuda'
    }

    # Per-fold RMSLE for this combination.
    scores = []

    for fit_idx, val_idx in folds:
        # KFold yields positional indices, so rows are selected with .iloc.
        # Fold-local names are used so the loop does not overwrite the
        # notebook-level X_train / y_train / model used by later cells.
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        dfit = xgb.DMatrix(X_fit, label=y_fit, enable_categorical=True)

        # Train on the fitting part of the fold.
        fold_model = xgb.train(params, dfit, num_boost_round=nbr)

        # Predict the held-out part; non-positive predictions are replaced by
        # a fallback value of 10 before the log-based metric is computed.
        y_pred = fold_model.predict(xgb.DMatrix(X_val, enable_categorical=True))
        y_pred[y_pred <= 0] = 10
        scores.append(rmsle(y_val, y_pred))

    mean_score = np.mean(scores)

    # Report the mean RMSLE over all folds.
    print(f"Mean RMSLE for max_depth={md}, learn_rate={lr}, num_boost={nbr}, subsample={ss}: {mean_score}")

    model_err[f"max_depth_{md}_learn_rate_{lr}_num_boost_{nbr}_subsample_{ss}"] = mean_score

# Find the combination with the smallest mean RMSLE.
min_key = min(model_err, key=model_err.get)

# Retrieve the minimum value
min_value = model_err[min_key]

# Print the result
print(f"The key with the minimum value is '{min_key}' with a value of {min_value}.")

Mean Accuracy for max_depth=8, learn_rate=0.15, num_boost=100, subsample=0.75: 0.07194357376315377
Mean Accuracy for max_depth=8, learn_rate=0.15, num_boost=100, subsample=1: 0.06934903626847051
Mean Accuracy for max_depth=8, learn_rate=0.15, num_boost=150, subsample=0.75: 0.06821757533943433
Mean Accuracy for max_depth=8, learn_rate=0.15, num_boost=150, subsample=1: 0.06633057883795905
Mean Accuracy for max_depth=8, learn_rate=0.15, num_boost=200, subsample=0.75: 0.06674479386545246
Mean Accuracy for max_depth=8, learn_rate=0.15, num_boost=200, subsample=1: 0.06538101143602114
Mean Accuracy for max_depth=8, learn_rate=0.15, num_boost=250, subsample=0.75: 0.06601546688831539
Mean Accuracy for max_depth=8, learn_rate=0.15, num_boost=250, subsample=1: 0.06478614795553084
Mean Accuracy for max_depth=8, learn_rate=0.1, num_boost=100, subsample=0.75: 0.0900015177462287
Mean Accuracy for max_depth=8, learn_rate=0.1, num_boost=100, subsample=1: 0.08637327296938552
Mean Accuracy for max_depth=

In [8]:
# Hyperparameter grid explored in this search round.
max_depth = [14, 16, 18]
learning_rate = [0.2, 0.15, 0.1]
number_boosts = [225, 250, 275, 300, 1000]
subsamp = [0.8, 1]

# Maps a label describing each hyperparameter combination to its mean
# cross-validated RMSLE (lower is better).
model_err = {}

# Build the folds once with a fixed seed so that every combination is scored
# on exactly the same splits and the search is reproducible between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=212)
folds = list(kf.split(X))

for md, lr, nbr, ss in itertools.product(max_depth, learning_rate, number_boosts, subsamp):
    # Booster parameters are the same for every fold of this combination.
    params = {
        'max_depth': md,
        'eta': lr,                           # learning rate
        'objective': 'reg:squaredlogerror',  #
        'subsample': ss,                     # subsample rate
        'device': 'cuda'
    }

    # Per-fold RMSLE for this combination.
    scores = []

    for fit_idx, val_idx in folds:
        # KFold yields positional indices, so rows are selected with .iloc.
        # Fold-local names are used so the loop does not overwrite the
        # notebook-level X_train / y_train / model used by later cells.
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        dfit = xgb.DMatrix(X_fit, label=y_fit, enable_categorical=True)

        # Train on the fitting part of the fold.
        fold_model = xgb.train(params, dfit, num_boost_round=nbr)

        # Predict the held-out part; non-positive predictions are replaced by
        # a fallback value of 10 before the log-based metric is computed.
        y_pred = fold_model.predict(xgb.DMatrix(X_val, enable_categorical=True))
        y_pred[y_pred <= 0] = 10
        scores.append(rmsle(y_val, y_pred))

    mean_score = np.mean(scores)

    # Report the mean RMSLE over all folds.
    print(f"Mean RMSLE for max_depth={md}, learn_rate={lr}, num_boost={nbr}, subsample={ss}: {mean_score}")

    model_err[f"max_depth_{md}_learn_rate_{lr}_num_boost_{nbr}_subsample_{ss}"] = mean_score

# Find the combination with the smallest mean RMSLE.
min_key = min(model_err, key=model_err.get)

# Retrieve the minimum value
min_value = model_err[min_key]

# Print the result
print(f"The key with the minimum value is '{min_key}' with a value of {min_value}.")

Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=225, subsample=0.8: 0.06506856478335676
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=225, subsample=1: 0.06442764218267696
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=250, subsample=0.8: 0.06496931802766634
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=250, subsample=1: 0.06422704104292991
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=275, subsample=0.8: 0.0647997435506165
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=275, subsample=1: 0.06424150974769095
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=300, subsample=0.8: 0.0647455724207366
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=300, subsample=1: 0.06418509797130223
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=1000, subsample=0.8: 0.0649515875283648
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=1000, subsample=1: 0.06339094478537181
Mean Accuracy for max_depth=14,

In [31]:
# Hyperparameter grid explored in this search round.
max_depth = [14, 15]
learning_rate = [0.2, 0.15, 0.1]
number_boosts = [500, 1000, 1500, 2000]
subsamp = [0.8, 1]

# Maps a label describing each hyperparameter combination to its mean
# cross-validated RMSLE (lower is better).
model_err = {}

# Build the folds once with a fixed seed so that every combination is scored
# on exactly the same splits and the search is reproducible between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=212)
folds = list(kf.split(X))

for md, lr, nbr, ss in itertools.product(max_depth, learning_rate, number_boosts, subsamp):
    # Booster parameters are the same for every fold of this combination.
    params = {
        'max_depth': md,
        'eta': lr,                           # learning rate
        'objective': 'reg:squaredlogerror',  #
        'subsample': ss,                     # subsample rate
        'device': 'cuda'
    }

    # Per-fold RMSLE for this combination.
    scores = []

    for fit_idx, val_idx in folds:
        # KFold yields positional indices, so rows are selected with .iloc.
        # Fold-local names are used so the loop does not overwrite the
        # notebook-level X_train / y_train / model used by later cells.
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        dfit = xgb.DMatrix(X_fit, label=y_fit, enable_categorical=True)

        # Train on the fitting part of the fold.
        fold_model = xgb.train(params, dfit, num_boost_round=nbr)

        # Predict the held-out part; non-positive predictions are replaced by
        # a fallback value of 10 before the log-based metric is computed.
        y_pred = fold_model.predict(xgb.DMatrix(X_val, enable_categorical=True))
        y_pred[y_pred <= 0] = 10
        scores.append(rmsle(y_val, y_pred))

    mean_score = np.mean(scores)

    # Report the mean RMSLE over all folds.
    print(f"Mean RMSLE for max_depth={md}, learn_rate={lr}, num_boost={nbr}, subsample={ss}: {mean_score}")

    model_err[f"max_depth_{md}_learn_rate_{lr}_num_boost_{nbr}_subsample_{ss}"] = mean_score

# Find the combination with the smallest mean RMSLE.
min_key = min(model_err, key=model_err.get)

# Retrieve the minimum value
min_value = model_err[min_key]

# Print the result
print(f"The key with the minimum value is '{min_key}' with a value of {min_value}.")

Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=500, subsample=0.8: 0.06441515483010365
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=500, subsample=1: 0.0635822096060801
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=1000, subsample=0.8: 0.06481563594247756
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=1000, subsample=1: 0.06336872793337091
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=1500, subsample=0.8: 0.0654725724321548
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=1500, subsample=1: 0.0635204994150172
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=2000, subsample=0.8: 0.06602025240436285
Mean Accuracy for max_depth=14, learn_rate=0.2, num_boost=2000, subsample=1: 0.06384522940595323
Mean Accuracy for max_depth=14, learn_rate=0.15, num_boost=500, subsample=0.8: 0.06441013454644193
Mean Accuracy for max_depth=14, learn_rate=0.15, num_boost=500, subsample=1: 0.0636903812917079
Mean Accuracy for max_dept

In [22]:
# Draw a fixed-seed sample of 600000 training rows for the final fits.
train_sample = train.sample(n=600000, random_state=212)

# Target values of the sampled rows.
y_train = train_sample['Calories']

# Feature frame for the sampled rows: drop the id and target columns, store
# 'Sex' as a categorical and append the same derived columns used for X.
X_train = (
    train_sample
    .drop(columns=['id', 'Calories'])
    .assign(
        Sex=lambda frame: frame['Sex'].astype('category'),
        BMI=lambda frame: frame['Weight'] / ((frame['Height'] / 100) ** 2),
        Intensity=lambda frame: frame['Duration'] * frame['Heart_Rate'],
    )
)

X_train.head()

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,BMI,Intensity
637285,male,40,182.0,85.0,16.0,91.0,40.2,25.661152,1456.0
137087,female,30,158.0,64.0,10.0,90.0,40.2,25.636917,900.0
11662,male,54,184.0,87.0,3.0,82.0,38.8,25.69707,246.0
83022,female,61,186.0,81.0,25.0,106.0,40.7,23.413111,2650.0
405249,female,40,169.0,69.0,11.0,99.0,39.9,24.158818,1089.0


In [None]:
# kaggle score: 0.06241

# Fit a single booster on the sampled training set with fixed hyperparameters.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

params = {'max_depth': 16,
          'eta': 0.2,
          'objective': 'reg:squaredlogerror',
          'subsample': 1,
          'device': 'cuda',
          #'tree_method': 'hist'
          }

model = xgb.train(params, dtrain, num_boost_round=300)

# Predict on the same rows the model was fitted on (reusing dtrain), so the
# value printed below is an in-sample training error, not a held-out score.
train_pred = model.predict(dtrain)
# Negative predictions are replaced by a fallback value of 10 before scoring.
train_pred[train_pred < 0] = 10

# Metric arguments follow the (y_true, y_pred) order used in the CV cells.
print(rmsle(y_train, train_pred))

0.05902165692861264


In [None]:
# kaggle score: 0.06144

# Fit a single booster on the sampled training set with fixed hyperparameters.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

params = {'max_depth': 14,
          'eta': 0.15,
          'objective': 'reg:squaredlogerror',
          'subsample': 1,
          'device': 'cuda',
          #'tree_method': 'hist'
          }

model = xgb.train(params, dtrain, num_boost_round=1000)

# Predict on the same rows the model was fitted on (reusing dtrain), so the
# value printed below is an in-sample training error, not a held-out score.
train_pred = model.predict(dtrain)
# Negative predictions are replaced by a fallback value of 10 before scoring.
train_pred[train_pred < 0] = 10

# Metric arguments follow the (y_true, y_pred) order used in the CV cells.
print(rmsle(y_train, train_pred))

0.05659396607442609


In [None]:
# kaggle score: 0.06101

# Fit a single booster on the sampled training set with fixed hyperparameters.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

params = {'max_depth': 14,
          'eta': 0.1,
          'objective': 'reg:squaredlogerror',
          'subsample': 1,
          'device': 'cuda',
          #'tree_method': 'hist'
          }

model = xgb.train(params, dtrain, num_boost_round=2000)

# Predict on the same rows the model was fitted on (reusing dtrain), so the
# value printed below is an in-sample training error, not a held-out score.
train_pred = model.predict(dtrain)
# Negative predictions are replaced by a fallback value of 10 before scoring.
train_pred[train_pred < 0] = 10

# Metric arguments follow the (y_true, y_pred) order used in the CV cells.
print(rmsle(y_train, train_pred))

0.05581408345720477


In [None]:
# kaggle score: 0.06287

# Fit a single booster on the sampled training set with fixed hyperparameters.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

params = {'max_depth': 32,
          'eta': 0.2,
          'objective': 'reg:squaredlogerror',
          #'device': 'cuda',
          #'tree_method': 'gpu_hist'
          }

model = xgb.train(params, dtrain, num_boost_round=600)

# Predict on the same rows the model was fitted on (reusing dtrain), so the
# value printed below is an in-sample training error, not a held-out score.
train_pred = model.predict(dtrain)
# Negative predictions are clipped to 0 before scoring.
train_pred[train_pred < 0] = 0

# Metric arguments follow the (y_true, y_pred) order used in the CV cells.
print(rmsle(y_train, train_pred))

0.05182231960216766


In [None]:
# kaggle score: 0.06409

# Fit a single booster on the sampled training set with fixed hyperparameters.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

params = {'max_depth': 64,
          'eta': 0.2,
          'objective': 'reg:squaredlogerror',
          #'device': 'cuda',
          #'tree_method': 'gpu_hist'
          }

model = xgb.train(params, dtrain, num_boost_round=600)

# Predict on the same rows the model was fitted on (reusing dtrain), so the
# value printed below is an in-sample training error, not a held-out score.
train_pred = model.predict(dtrain)
# Negative predictions are clipped to 0 before scoring.
train_pred[train_pred < 0] = 0

# Metric arguments follow the (y_true, y_pred) order used in the CV cells.
print(rmsle(y_train, train_pred))

0.04930453294486195


In [None]:
# kaggle score: 0.06297

# Fit a single booster on the sampled training set with fixed hyperparameters.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

params = {'max_depth': 32,
          'eta': 0.1,
          'objective': 'reg:squaredlogerror',
          #'device': 'cuda',
          #'tree_method': 'gpu_hist'
          }

model = xgb.train(params, dtrain, num_boost_round=600)

# Predict on the same rows the model was fitted on (reusing dtrain), so the
# value printed below is an in-sample training error, not a held-out score.
train_pred = model.predict(dtrain)
# Negative predictions are clipped to 0 before scoring.
train_pred[train_pred < 0] = 0

# Metric arguments follow the (y_true, y_pred) order used in the CV cells.
print(rmsle(y_train, train_pred))

0.05531693824542557


In [None]:
# kaggle score: 0.06390

# Fit a single booster on the sampled training set with fixed hyperparameters.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

params = {'max_depth': 64,
          'eta': 0.1,
          'objective': 'reg:squaredlogerror',
          #'device': 'cuda',
          #'tree_method': 'gpu_hist'
          }

model = xgb.train(params, dtrain, num_boost_round=600)

# Predict on the same rows the model was fitted on (reusing dtrain), so the
# value printed below is an in-sample training error, not a held-out score.
train_pred = model.predict(dtrain)
# Negative predictions are clipped to 0 before scoring.
train_pred[train_pred < 0] = 0

# Metric arguments follow the (y_true, y_pred) order used in the CV cells.
print(rmsle(y_train, train_pred))

0.052893620059793325


In [None]:
# kaggle score: 0.06414

# Fit a single booster on the sampled training set with fixed hyperparameters.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

params = {'max_depth': 64,
          'eta': 0.1,
          'objective': 'reg:squaredlogerror',
          #'device': 'cuda',
          #'tree_method': 'gpu_hist'
          }

model = xgb.train(params, dtrain, num_boost_round=400)

# Predict on the same rows the model was fitted on (reusing dtrain), so the
# value printed below is an in-sample training error, not a held-out score.
train_pred = model.predict(dtrain)
# Negative predictions are clipped to 0 before scoring.
train_pred[train_pred < 0] = 0

# Metric arguments follow the (y_true, y_pred) order used in the CV cells.
print(rmsle(y_train, train_pred))

0.05502459247082603


In [27]:
# Build the test feature matrix with the same columns as the training features.
# The drop is a reassignment with errors='ignore' (instead of an in-place drop)
# so the cell can be re-run: a second execution no longer raises KeyError on
# the already-removed 'id' column.
test = test.drop(columns=['id'], errors='ignore')

test['Sex'] = test['Sex'].astype('category')
test['BMI'] = test['Weight']/((test['Height']/100)**2)
test['Intensity'] = test['Duration']*test['Heart_Rate']

test.head()

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,BMI,Intensity
0,male,45,177.0,81.0,7.0,87.0,39.8,25.854639,609.0
1,male,26,200.0,97.0,20.0,101.0,40.5,24.25,2020.0
2,female,29,188.0,85.0,16.0,102.0,40.4,24.049344,1632.0
3,female,39,172.0,73.0,20.0,107.0,40.6,24.6755,2140.0
4,female,30,173.0,67.0,16.0,94.0,40.5,22.386314,1504.0


In [33]:
# make the submission file!

# Predict with whichever `model` was fitted most recently in the notebook.
pred = model.predict(xgb.DMatrix(test, enable_categorical=True))

# Take the ids from the competition file itself rather than assuming they form
# a contiguous range starting at 750000 (the 'id' column has been dropped from
# `test` by the time this cell runs).
test_ids = pd.read_csv("test.csv", usecols=['id'])['id'].to_numpy()
assert len(test_ids) == len(pred), "number of ids and predictions differ"

submission = pd.DataFrame({'id': test_ids, 'Calories': pred})
# Negative predictions are replaced by a fallback value of 10.
submission.loc[submission['Calories'] < 0, 'Calories'] = 10

submission.to_csv("xgb_prediction_10.csv", index=False)

# Quick sanity check of the file contents and the prediction distribution.
print(submission.head())
print()
print(submission['Calories'].describe())

       id    Calories
0  750000   27.663424
1  750001  110.360237
2  750002   86.528763
3  750003  125.446068
4  750004   76.078491

count    250000.000000
mean         88.096863
std          61.928688
min           0.796141
25%          34.031651
50%          76.446026
75%         135.274139
max         246.989044
Name: Calories, dtype: float64
