In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_log_error as rmsle
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import KFold
import lightgbm as lgb

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("sample_submission.csv")

train.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [4]:
noncat = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

train[noncat+['Calories']].corr()

Unnamed: 0,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
Age,1.0,0.011975,0.07369,0.015656,0.017037,0.030275,0.145683
Height,0.011975,1.0,0.957967,-0.029936,-0.013234,-0.034641,-0.004026
Weight,0.07369,0.957967,1.0,-0.020845,-0.002384,-0.023717,0.015863
Duration,0.015656,-0.029936,-0.020845,1.0,0.875327,0.903066,0.959908
Heart_Rate,0.017037,-0.013234,-0.002384,0.875327,1.0,0.795972,0.908748
Body_Temp,0.030275,-0.034641,-0.023717,0.903066,0.795972,1.0,0.828671
Calories,0.145683,-0.004026,0.015863,0.959908,0.908748,0.828671,1.0


In [4]:
y = train['Calories']

X = train.drop(columns=['id', 'Calories'])
X['Sex'] = X['Sex'].astype('category')

In [23]:
number_leaves = [256,512,1024]
learning_rate = [0.25,0.2,0.1,0.05]
number_boosts = [100,150,200]

model_err = {}

for nl in number_leaves:
    for lr in learning_rate:
        for nbr in number_boosts:

            # Initialize KFold
            kf = KFold(n_splits=5, shuffle=True)

            # Initialize a list to collect accuracy scores
            scores = []

            # Iterate over each fold
            for train_index, test_index in kf.split(X):

                X_train, X_test = X.loc[train_index], X.loc[test_index]
                y_train, y_test = y.loc[train_index], y.loc[test_index]

                dtrain = lgb.Dataset(X_train, label=y_train)

                
                # Define parameters
                params = {
                    'num_leaves': nl,
                    'learning_rate': lr,
                    'objective': 'regression',  # LightGBM doesn't have squaredlogerror directly
                    'metric': 'rmse',           # 
                    'force_col_wise': True,     # Optional: can help with categorical support
                    'verbose': -1
                }

                # Initialize and train the model
                model = lgb.train(params, dtrain, num_boost_round=nbr, feval=rmsle)


                # Predict and evaluate the model
                y_pred = model.predict(X_test)
                y_pred[y_pred <= 0] = 10
                score = rmsle(y_test, y_pred)
                scores.append(score)

            
            # Output the mean accuracy over all folds
            print(f"Mean Accuracy for num_leaves={nl}, learn_rate={lr}, num_boost={nbr}: {np.mean(scores)}")

            model_err[f"num_leaves_{nl}_learn_rate_{lr}_num_boost_{nbr}"] = np.mean(scores)

# Find the key with the minimum value
min_key = min(model_err, key=model_err.get)

# Retrieve the minimum value
min_value = model_err[min_key]

# Print the result
print(f"The key with the minimum value is '{min_key}' with a value of {min_value}.")

Mean Accuracy for num_leaves=256, learn_rate=0.25, num_boost=100: 0.06298387323167917
Mean Accuracy for num_leaves=256, learn_rate=0.25, num_boost=150: 0.06264885271634295
Mean Accuracy for num_leaves=256, learn_rate=0.25, num_boost=200: 0.06272394614664804
Mean Accuracy for num_leaves=256, learn_rate=0.2, num_boost=100: 0.06287490513258431
Mean Accuracy for num_leaves=256, learn_rate=0.2, num_boost=150: 0.06248318198933304
Mean Accuracy for num_leaves=256, learn_rate=0.2, num_boost=200: 0.06235730078706109
Mean Accuracy for num_leaves=256, learn_rate=0.1, num_boost=100: 0.06221927638581076
Mean Accuracy for num_leaves=256, learn_rate=0.1, num_boost=150: 0.06203367081814927
Mean Accuracy for num_leaves=256, learn_rate=0.1, num_boost=200: 0.06179718921563746
Mean Accuracy for num_leaves=256, learn_rate=0.05, num_boost=100: 0.06984597117399424
Mean Accuracy for num_leaves=256, learn_rate=0.05, num_boost=150: 0.06227538252840157
Mean Accuracy for num_leaves=256, learn_rate=0.05, num_boost

In [24]:
number_leaves = [400,450,500,550,600,2000]
learning_rate = [0.1,0.05,0.01]
number_boosts = [100,150,200,500]

model_err = {}

for nl in number_leaves:
    for lr in learning_rate:
        for nbr in number_boosts:

            # Initialize KFold
            kf = KFold(n_splits=5, shuffle=True)

            # Initialize a list to collect accuracy scores
            scores = []

            # Iterate over each fold
            for train_index, test_index in kf.split(X):

                X_train, X_test = X.loc[train_index], X.loc[test_index]
                y_train, y_test = y.loc[train_index], y.loc[test_index]

                dtrain = lgb.Dataset(X_train, label=y_train)

                
                # Define parameters
                params = {
                    'num_leaves': nl,
                    'learning_rate': lr,
                    'objective': 'regression',  # LightGBM doesn't have squaredlogerror directly
                    'metric': 'rmse',           # 
                    'force_col_wise': True,     # Optional: can help with categorical support
                    'verbose': -1
                }

                # Initialize and train the model
                model = lgb.train(params, dtrain, num_boost_round=nbr, feval=rmsle)


                # Predict and evaluate the model
                y_pred = model.predict(X_test)
                y_pred[y_pred <= 0] = 10
                score = rmsle(y_test, y_pred)
                scores.append(score)

            
            # Output the mean accuracy over all folds
            print(f"Mean Accuracy for num_leaves={nl}, learn_rate={lr}, num_boost={nbr}: {np.mean(scores)}")

            model_err[f"num_leaves_{nl}_learn_rate_{lr}_num_boost_{nbr}"] = np.mean(scores)

# Find the key with the minimum value
min_key = min(model_err, key=model_err.get)

# Retrieve the minimum value
min_value = model_err[min_key]

# Print the result
print(f"The key with the minimum value is '{min_key}' with a value of {min_value}.")

Mean Accuracy for num_leaves=400, learn_rate=0.1, num_boost=100: 0.06172022837194215
Mean Accuracy for num_leaves=400, learn_rate=0.1, num_boost=150: 0.06159082570807878
Mean Accuracy for num_leaves=400, learn_rate=0.1, num_boost=200: 0.061367608656331674
Mean Accuracy for num_leaves=400, learn_rate=0.1, num_boost=500: 0.061554679274835225
Mean Accuracy for num_leaves=400, learn_rate=0.05, num_boost=100: 0.0689139080400792
Mean Accuracy for num_leaves=400, learn_rate=0.05, num_boost=150: 0.061640187889388354
Mean Accuracy for num_leaves=400, learn_rate=0.05, num_boost=200: 0.06141746213337258
Mean Accuracy for num_leaves=400, learn_rate=0.05, num_boost=500: 0.06131750476205483
Mean Accuracy for num_leaves=400, learn_rate=0.01, num_boost=100: 0.5939138089927378
Mean Accuracy for num_leaves=400, learn_rate=0.01, num_boost=150: 0.44449032530303534
Mean Accuracy for num_leaves=400, learn_rate=0.01, num_boost=200: 0.326374340043051
Mean Accuracy for num_leaves=400, learn_rate=0.01, num_boos

In [25]:
X_train = train.sample(n=600000, random_state=212)
y_train = X_train['Calories']

X_train.drop(columns=['id', 'Calories'], inplace=True)
X_train['Sex'] = X_train['Sex'].astype('category')

X_train.head()

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
637285,male,40,182.0,85.0,16.0,91.0,40.2
137087,female,30,158.0,64.0,10.0,90.0,40.2
11662,male,54,184.0,87.0,3.0,82.0,38.8
83022,female,61,186.0,81.0,25.0,106.0,40.7
405249,female,40,169.0,69.0,11.0,99.0,39.9


In [27]:
# Kaggle score 0.0????

dtrain = lgb.Dataset(X_train, label=y_train)

# Define parameters
params = {
    'num_leaves': 550,
    'learning_rate': 0.05,
    'objective': 'regression',  # LightGBM doesn't have squaredlogerror directly
    'metric': 'rmse',           # We'll manually compute RMSLE
    'force_col_wise': True,     # Optional: can help with categorical support
}

# Train the model
model = lgb.train(params, dtrain, num_boost_round=500, feval=rmsle)

# Make predictions
test_pred = model.predict(X_train)

# Ensure non-negative predictions (important for RMSLE)
test_pred[test_pred < 0] = 10

print(rmsle(test_pred, y_train))

0.0558775389208406


In [None]:
# Kaggle score 0.06023

dtrain = lgb.Dataset(X_train, label=y_train)

# Define parameters
params = {
    'num_leaves': 512,
    'learning_rate': 0.05,
    'objective': 'regression',  # LightGBM doesn't have squaredlogerror directly
    'metric': 'rmse',           # We'll manually compute RMSLE
    'force_col_wise': True,     # Optional: can help with categorical support
}

# Train the model
model = lgb.train(params, dtrain, num_boost_round=150)

# Make predictions
test_pred = model.predict(X_train)

# Ensure non-negative predictions (important for RMSLE)
test_pred[test_pred < 0] = 10

print(rmsle(test_pred, y_train))

0.05900664667015287


In [None]:
# Kaggle score 0.06950

dtrain = lgb.Dataset(X_train, label=y_train)

# Define parameters
params = {
    'num_leaves': 4096,
    'learning_rate': 0.25,
    'objective': 'regression',  # LightGBM doesn't have squaredlogerror directly
    'metric': 'rmse',           # We'll manually compute RMSLE
    'force_col_wise': True,     # Optional: can help with categorical support
}

# Train the model
model = lgb.train(params, dtrain, num_boost_round=800)

# Make predictions
test_pred = model.predict(X_train)

# Ensure non-negative predictions (important for RMSLE)
test_pred[test_pred < 0] = 0

print(rmsle(test_pred, y_train))

[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 7
[LightGBM] [Info] Start training from score 88.259470
0.012886190162029899


In [None]:
# Kaggle score 0.06451

dtrain = lgb.Dataset(X_train, label=y_train)

# Define parameters
params = {
    'num_leaves': 4096,
    'learning_rate': 0.2,
    'objective': 'regression',  # LightGBM doesn't have squaredlogerror directly
    'metric': 'rmse',           # We'll manually compute RMSLE
    'force_col_wise': True,     # Optional: can help with categorical support
}

# Train the model
model = lgb.train(params, dtrain, num_boost_round=250)

# Make predictions
test_pred = model.predict(X_train)

# Ensure non-negative predictions (important for RMSLE)
test_pred[test_pred < 0] = 0

print(rmsle(test_pred, y_train))

[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 7
[LightGBM] [Info] Start training from score 88.259470
0.035241875337224664


In [None]:
# Kaggle score 0.06149

dtrain = lgb.Dataset(X_train, label=y_train)

# Define parameters
params = {
    'num_leaves': 4096,
    'learning_rate': 0.05,
    'objective': 'regression',  # LightGBM doesn't have squaredlogerror directly
    'metric': 'rmse',           # We'll manually compute RMSLE
    'force_col_wise': True,     # Optional: can help with categorical support
}

# Train the model
model = lgb.train(params, dtrain, num_boost_round=250)

# Make predictions
test_pred = model.predict(X_train)

# Ensure non-negative predictions (important for RMSLE)
test_pred[test_pred < 0] = 0

print(rmsle(test_pred, y_train))

[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 7
[LightGBM] [Info] Start training from score 88.259470
0.051208862654664705


In [None]:
# Kaggle score 0.06309

dtrain = lgb.Dataset(X_train, label=y_train)

# Define parameters
params = {
    'num_leaves': 2048,
    'learning_rate': 0.2,
    'objective': 'regression',  # LightGBM doesn't have squaredlogerror directly
    'metric': 'rmse',           # We'll manually compute RMSLE
    'force_col_wise': True,     # Optional: can help with categorical support
}

# Train the model
model = lgb.train(params, dtrain, num_boost_round=250)

# Make predictions
test_pred = model.predict(X_train)

# Ensure non-negative predictions (important for RMSLE)
test_pred[test_pred < 0] = 0

print(rmsle(test_pred, y_train))

[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 7
[LightGBM] [Info] Start training from score 88.259470
0.044682788102133095


In [59]:
# Kaggle score 0.?

dtrain = lgb.Dataset(X_train, label=np.log1p(y_train))

# Define parameters
params = {
    'num_leaves': 1024,
    'learning_rate': 0.2,
    'objective': 'regression',  # LightGBM doesn't have squaredlogerror directly
    'metric': 'rmse',           # We'll manually compute RMSLE
    'force_col_wise': True,     # Optional: can help with categorical support
}

# Train the model
model = lgb.train(params, dtrain, num_boost_round=250)

# Make predictions
test_pred = np.expm1(model.predict(X_train))

# Ensure non-negative predictions (important for RMSLE)
test_pred[test_pred < 0] = 0

print(rmsle(test_pred, y_train))

[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 7
[LightGBM] [Info] Start training from score 4.141099
0.03827713295897256


In [17]:
test.drop(columns=['id'], inplace=True)

test['Sex'] = test['Sex'].astype('category')

test.head()

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,male,45,177.0,81.0,7.0,87.0,39.8
1,male,26,200.0,97.0,20.0,101.0,40.5
2,female,29,188.0,85.0,16.0,102.0,40.4
3,female,39,172.0,73.0,20.0,107.0,40.6
4,female,30,173.0,67.0,16.0,94.0,40.5


In [28]:
# make the submission file!

pred = model.predict(test)

submission = pd.DataFrame()

submission['id'] = np.arange(len(test))+750000
submission['Calories'] = pred
submission.loc[submission['Calories'] < 0, 'Calories'] = 10

submission.to_csv("lgb_prediction_7.csv", index=False)

print(submission.head())
print()
print(submission['Calories'].describe())

       id    Calories
0  750000   27.400827
1  750001  106.850966
2  750002   86.877382
3  750003  125.862741
4  750004   76.543693

count    250000.000000
mean         88.249042
std          62.289379
min           1.006235
25%          34.128223
50%          76.533263
75%         135.451640
max         303.868214
Name: Calories, dtype: float64
