In [3]:
# Required Libraries for Model Training
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import pickle

# 1. Read the cleaned dataset from disk
df = pd.read_csv('cleaned_original.csv')

# 2. Features (X) are the four columns selected below; the target (y) is 'rating'.
#    Note: this *selects* the predictor columns rather than dropping the others.
X = df[[
    'calories',
    'protein',
    'fat',
    'sodium',
]]
y = df['rating']

# Hold out 20% of the rows for the final check; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. Candidate regressors, keyed by the display name used in the printed report.
#    Insertion order is the evaluation order.
models = {}
models['Linear Regression'] = LinearRegression()
models['Random Forest'] = RandomForestRegressor(random_state=42)
models['Gradient Boosting'] = GradientBoostingRegressor(random_state=42)
models['SVR'] = SVR()

# Maps each display name to its mean cross-validated MSE (lower is better)
model_performance = {}

# Score every candidate on the training split only; 3 folds keeps the run short
for name, estimator in models.items():
    print(f"Evaluating {name}...")
    fold_scores = cross_val_score(
        estimator,
        X_train,
        y_train,
        scoring='neg_mean_squared_error',
        cv=3,
    )
    # The scorer yields negated MSE, so flip the sign to get a plain MSE
    average_mse = -fold_scores.mean()
    model_performance[name] = average_mse
    print(f"{name} Average MSE: {average_mse}")

# The winner is the entry with the smallest mean MSE (first one wins on a tie)
best_model_name = min(model_performance.items(), key=lambda entry: entry[1])[0]
best_model = models[best_model_name]

# Fit the winner on the full training split
best_model.fit(X_train, y_train)

# Measure the error on the held-out test rows
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Selected Model: {best_model_name}, MSE on Test Set: {mse}")

# Persist the fitted winner as a pickle file
with open('best_model.pkl', 'wb') as pkl_file:
    pickle.dump(best_model, pkl_file)

print(f"Best model {best_model_name} saved to 'best_model.pkl'")


Evaluating Linear Regression...
Linear Regression Average MSE: 31.144130412572764
Evaluating Random Forest...
Random Forest Average MSE: 1.7952397515438427
Evaluating Gradient Boosting...
Gradient Boosting Average MSE: 1.6956426838801277
Evaluating SVR...
SVR Average MSE: 2.0957315079880066
Selected Model: Gradient Boosting, MSE on Test Set: 1.6420804290730284
Best model Gradient Boosting saved to 'best_model.pkl'
