In [18]:
from warnings import filterwarnings

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor
filterwarnings('ignore')

In [19]:
# Read the train and test CSV files (paths are relative to the working directory)
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Report the (rows, columns) shape of each frame
for label, frame in (("train: ", train), ("test: ", test)):
    print(label, frame.shape)

train:  (750000, 9)
test:  (250000, 8)


In [20]:
# Preview the first five rows of the training frame
train.head(5)

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [21]:
# Preview the first five rows of the test frame
test.head(5)

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,750000,male,45,177.0,81.0,7.0,87.0,39.8
1,750001,male,26,200.0,97.0,20.0,101.0,40.5
2,750002,female,29,188.0,85.0,16.0,102.0,40.4
3,750003,female,39,172.0,73.0,20.0,107.0,40.6
4,750004,female,30,173.0,67.0,16.0,94.0,40.5


In [None]:
# Count the missing entries in every column of the training frame
train.isna().sum()

id            0
Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [None]:
# Summarise the training frame: index range, per-column non-null counts and dtypes,
# and approximate memory usage
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  object 
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 51.5+ MB


In [None]:
# Encode the object-typed 'Sex' column as integer codes.
# The encoder is fitted on the training column only and then applied to both
# frames, so train and test share the same label -> integer mapping.
le = LabelEncoder().fit(train['Sex'])
for frame in (train, test):
    frame['Sex'] = le.transform(frame['Sex'])

In [None]:
# Separate the target ('Calories') from the predictors; 'id' is dropped from
# the predictors as well
y = train['Calories']
X = train.drop(columns=['id', 'Calories'])

# Hold out 20% of the rows; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then reused, unchanged, for the hold-out split and the Kaggle test frame.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The Kaggle test frame carries an 'id' column that is not a model input
test1 = test.drop('id', axis=1)
test1_scaled = scaler.transform(test1)

In [None]:
# Randomised hyperparameter search for the XGBoost regressor.
#
# 20 candidate settings are sampled from the value lists below and each is
# scored with 5-fold cross-validation on the scaled training split, using the
# negated mean squared log error (higher is better, hence the sign flip when
# printing the best score).

# Candidate values per hyperparameter (sampled, not exhaustively enumerated)
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, 10],
    'n_estimators': [100, 200, 500],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}
regressor = XGBRegressor(objective='reg:squaredlogerror', random_state=0)
random_search = RandomizedSearchCV(
    regressor,
    param_grid,
    n_iter=20,
    scoring='neg_mean_squared_log_error',
    cv=5,
    n_jobs=-1,
    # Seed the candidate sampling so a fresh "Restart & Run All" evaluates the
    # same 20 settings and reports the same best parameters.
    random_state=0,
)
random_search.fit(X_train_scaled, y_train)

# Printed labels are Turkish: "Best parameters" / "Best MSLE"
print("En iyi parametreler:", random_search.best_params_)
print("En iyi MSLE:", -random_search.best_score_)

En iyi parametreler: {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
En iyi MSLE: 0.004340329597002768


In [None]:
# Predict on the Kaggle test frame with the tuned model and write the submission.
# No additional training happens in this cell: best_estimator_ is the estimator
# that the search already refit with the best parameters it found.
best_regressor = random_search.best_estimator_

# Floor the predictions at 0 so no negative calorie values are submitted
unseen_pred = best_regressor.predict(test1_scaled).clip(min=0)

# Two-column frame: the test ids plus the predicted 'Calories'
submission = test[['id']].assign(Calories=unseen_pred)
submission.to_csv('submission_optimized.csv', index=False)

In [None]:
# Predict on the hold-out split (X_test_scaled). These rows were not passed to
# the search or the scaler's fit, so the score below is an out-of-sample check.
y_pred = best_regressor.predict(X_test_scaled)

# mean_squared_log_error rejects negative inputs, so score a copy floored at 0
# (the same flooring applied to the submission); y_pred itself is left as-is.
holdout_msle = mean_squared_log_error(y_test, np.clip(y_pred, a_min=0, a_max=None))
print("Hold-out MSLE:", holdout_msle)

y_pred

array([213.94812 , 169.30814 ,  39.089542, ..., 102.32219 , 169.86748 ,
       183.7005  ], dtype=float32)

In [None]:
# Predictions made by the model on the Kaggle test data (test1_scaled).
# The values are floored at 0, exactly as in the cell that writes
# 'submission_optimized.csv', so rebinding `unseen_pred` here cannot leave the
# notebook holding raw predictions that differ from the submitted ones.
unseen_pred = np.clip(best_regressor.predict(test1_scaled), a_min=0, a_max=None)
unseen_pred

array([ 26.871107, 109.63165 ,  91.03275 , ...,  72.42474 , 170.93176 ,
        75.21723 ], dtype=float32)

In [None]:

# Creating a submission file
# NOTE(review): this export is disabled (every line is commented out). The active
# export in this notebook is the cell that writes 'submission_optimized.csv'.
# Presumably this block is superseded — confirm, then delete it rather than keep dead code.
# Unlike the active export, it would also add a 'Calories' column to `test` in place.
# test['Calories'] = unseen_pred
# submission = test[['id', 'Calories']]
# submission.to_csv('submission.csv', index=False)