In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
### Data Preprocessing ###

# Load Data
df = pd.read_csv("train.csv")

# Encode male/female to 1/0 (done on a copy so the raw frame `df` stays untouched)
df_encoded = df.copy()
df_encoded['Sex'] = LabelEncoder().fit_transform(df_encoded["Sex"])

# Define features & objective
# NOTE(review): every column other than "Calories" is used as a feature, which
# would include an identifier column if train.csv has one -- confirm intended.
X = df_encoded.drop("Calories", axis=1)
Y = df_encoded["Calories"]

# Split data for internal validation BEFORE scaling, so the validation rows
# never influence the scaler's mean/variance (avoids train/validation leakage).
# The fixed seed makes the split, and every metric computed on it, reproducible.
X_train_raw, X_val_raw, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Standardize input data: fit on the training rows only, then apply the same
# transformation to the validation rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Full feature matrix scaled with the training-set statistics; kept under its
# original name for any code that still refers to it.
X_scaled = scaler.transform(X)

In [19]:
### Regression Model ###

# Ordinary least-squares baseline: fit on the training split, then predict
# the held-out validation split.
lr = LinearRegression().fit(X_train, Y_train)
y_pred_lr = lr.predict(X_val)

def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of `y_pred` vs `y_true`.

    Predictions are floored at zero before taking logs; `y_true` is used as
    given. Both sides go through log1p, i.e. log(1 + value).
    """
    floored_pred = np.maximum(0, y_pred)
    log_gap = np.log1p(floored_pred) - np.log1p(y_true)
    mean_sq_gap = np.mean(log_gap ** 2)
    return np.sqrt(mean_sq_gap)

# Validation metrics for the linear baseline, each computed and then printed.
lr_r2 = r2_score(Y_val, y_pred_lr)
print("Linear Regression R^2 = {}".format(lr_r2))

lr_mse = mean_squared_error(Y_val, y_pred_lr)
print("MSE:", lr_mse)

lr_rmsle = rmsle(Y_val, y_pred_lr)
print("RMSLE = {}".format(lr_rmsle))

Linear Regression R^2 = 0.968370794584094
MSE: 122.72076319931607
RMSLE = 0.5602405307638778


In [20]:
## Random Forest Regressor ##

# random_state is fixed so the forest's randomness (bootstrap samples, feature
# draws) is the same on every run; without it the validation scores and the
# predictions written to the submission change each time the cell is executed.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, Y_train)
y_pred_rf = rf.predict(X_val)

# Validation metrics on the held-out split.
print("Random Forest R²:", r2_score(Y_val, y_pred_rf))
print("MSE:", mean_squared_error(Y_val, y_pred_rf))
print("RMSLE = {}".format(rmsle(Y_val, y_pred_rf)))

Random Forest R²: 0.9962434144781092
MSE: 14.575486048666667
RMSLE = 0.06320351146421233


In [21]:
## Gradient Boosting ##

# random_state is fixed so repeated runs of this cell build the same ensemble
# and report the same validation scores.
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, Y_train)
y_pred_gb = gb.predict(X_val)

# Validation metrics on the held-out split.
print("Gradient Boosting R²:", r2_score(Y_val, y_pred_gb))
print("MSE:", mean_squared_error(Y_val, y_pred_gb))
print("RMSLE = {}".format(rmsle(Y_val, y_pred_gb)))

Gradient Boosting R²: 0.9941345721757007
MSE: 22.757757257048493
RMSLE = 0.13862958655627997


In [None]:
df_test = pd.read_csv("test.csv")

# Encode male/female to 1/0 using the label -> integer mapping learned from the
# TRAINING data (`df`), so test rows get exactly the codes the models were
# trained on. Fitting a fresh encoder on the test column could assign different
# codes if the set of labels differs. A label absent from the training data
# raises ValueError here instead of being silently mis-encoded.
df_test_encoded = df_test.copy()
sex_encoder = LabelEncoder().fit(df["Sex"])
df_test_encoded['Sex'] = sex_encoder.transform(df_test_encoded["Sex"])

# Apply the scaler fitted earlier; the test data is never used for fitting.
test_scaled = scaler.transform(df_test_encoded)

TypeError: read_csv() missing 1 required positional argument: 'filepath_or_buffer'

In [26]:
# Load the sample submission to inspect the expected output layout.
sample_path = "sample_submission.csv"
dt_sample = pd.read_csv(sample_path)

# Preview the first rows.
dt_sample.head()

Unnamed: 0,id,Calories
0,750000,88.283
1,750001,88.283
2,750002,88.283
3,750003,88.283
4,750004,88.283


In [29]:
# Predict the target for the scaled test features with the fitted random forest.
rf_pred_test = rf.predict(test_scaled)

# Quick sanity check: number of predictions, then the values themselves.
print(rf_pred_test.shape)
print(rf_pred_test)

(250000,)
[ 27.27 108.34  87.73 ...  72.37 168.4   78.45]


In [30]:
# Pair every test-set id with its random-forest prediction, in test-file order.
submission = df_test[['id']].assign(Calories=rf_pred_test)

# Write the result to disk without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

print("✅ Submission file created using IDs from df_test!")

✅ Submission file created using IDs from df_test!


In [31]:
# Read the exported file back to confirm what was actually written to disk.
submission_path = "submission.csv"
submission_view = pd.read_csv(submission_path)

# Preview the first rows.
submission_view.head()

Unnamed: 0,id,Calories
0,750000,27.27
1,750001,108.34
2,750002,87.73
3,750003,128.08
4,750004,75.43
