# Simple linear regression model

In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
# Input locations, given relative to the notebook's working directory
DATA_PATH = '../data/raw/train.csv'            # training table in CSV form
PREPROCESSOR_PATH = '../src/preprocessor.pkl'  # serialized preprocessor, loaded by a later cell

# Read the CSV into a DataFrame, report its dimensions, then preview the first five rows
data = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(f"Dataset shape: {data.shape}")
data.head(n=5)

Dataset shape: (750000, 9)


Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [None]:
# Target is the 'Calories' column; features are every other column except 'id'
y = data['Calories']
X = data.drop(['id', 'Calories'], axis=1)

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Restore the saved preprocessor from disk.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
preprocessor = joblib.load(PREPROCESSOR_PATH)

# Run both splits through the loaded preprocessor (transform only; nothing is fitted here).
# NOTE(review): assumes the preprocessor was fitted before it was saved — confirm it was
# fitted on training data only.
X_train_processed, X_val_processed = (
    preprocessor.transform(split) for split in (X_train, X_val)
)

In [None]:
# Fit a linear regression on the preprocessed training features.
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train_processed, y_train)

# Predict on both splits so training and validation error can be compared afterwards
y_train_pred, y_val_pred = (
    model.predict(features) for features in (X_train_processed, X_val_processed)
)

In [None]:
# Score the fitted model on both splits, pairing each target with its predictions
train_mse, val_mse = (
    mean_squared_error(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_val, y_val_pred))
)

# RMSE is the square root of MSE, which puts the error back in the target's units
train_rmse, val_rmse = np.sqrt(train_mse), np.sqrt(val_mse)

# Coefficient of determination for each split
train_r2, val_r2 = (
    r2_score(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_val, y_val_pred))
)

# Report RMSE first, then R², training before validation
print(f"Training RMSE: {train_rmse:.2f}")
print(f"Validation RMSE: {val_rmse:.2f}")
print(f"Training R²: {train_r2:.4f}")
print(f"Validation R²: {val_r2:.4f}")

In [None]:
# 5-fold cross-validation over the training split.
# The 'neg_mean_squared_error' scorer returns negated MSE per fold, hence the sign flip below.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train_processed,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)

# RMSE taken from the fold-averaged MSE (not an average of per-fold RMSEs)
cv_rmse = np.sqrt(-np.mean(cv_scores))
print(f"5-fold CV RMSE: {cv_rmse:.2f}")

In [None]:
# Actual-vs-predicted scatter for the validation split
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_val, y_val_pred, alpha=0.5)

# Dashed red y = x reference line; its extent comes from the full target y, not only y_val
target_range = [y.min(), y.max()]
ax.plot(target_range, target_range, 'r--')

ax.set_xlabel('Actual Calories')
ax.set_ylabel('Predicted Calories')
ax.set_title('Linear Regression: Actual vs Predicted Calories')
fig.tight_layout()
plt.show()

In [None]:
# Feature importance (coefficients)
# Ask the fitted preprocessor for its output column names instead of rebuilding them by hand.
# A ColumnTransformer concatenates its transformers' outputs in transformer order, so a
# hand-built list (original columns with the one-hot names appended at the end) only lines up
# with model.coef_ when 'cat' happens to be the last transformer and the encoder emits exactly
# one column per category. Any other setup silently pairs names with the wrong coefficients.
# NOTE: the returned names carry the transformer name as a prefix (something like
# 'cat__Sex_male') unless the preprocessor was built with verbose_feature_names_out=False.
feature_names = list(preprocessor.get_feature_names_out())

# One coefficient per transformed column; fail loudly rather than plot mislabeled bars
assert len(feature_names) == len(model.coef_), (
    f"{len(feature_names)} feature names for {len(model.coef_)} coefficients"
)

# Plot feature importance, largest (most positive) coefficient first
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': model.coef_
})
coef_df = coef_df.sort_values('Coefficient', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x='Coefficient', y='Feature', data=coef_df)
plt.title('Linear Regression Coefficients')
plt.tight_layout()
plt.show()

In [None]:
# Save the model
MODEL_PATH = Path('../output/models/linear_regression_model.pkl')

# joblib.dump does not create missing directories; make sure the target folder exists so the
# cell also works on a fresh checkout where ../output/models/ has not been created yet
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(model, MODEL_PATH)
print("Model saved successfully!")