In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Load the dataset: read 'Mc.Donalds_menu.csv' (a relative path, so it is
# resolved against the current working directory) into the DataFrame `df`.
df = pd.read_csv('Mc.Donalds_menu.csv')

# Parse the free-text 'Serving Size' value into a whole number of grams
def extract_grams_extended(serving):
    """Parse a serving-size description into a whole number of grams.

    Units are tried in a fixed priority order: grams, then fluid ounces,
    then millilitres. Fluid ounces are converted with 29.57 ml per fl oz,
    and a millilitre count is returned unchanged (i.e. volumes are treated
    as if 1 ml weighed 1 g).

    Decimal quantities are read in full ("16.9 fl oz" is 16.9, not 9), and
    the unit must end at a word boundary so that words merely starting with
    "g" (e.g. "6 grilled") are not mistaken for grams.

    Args:
        serving: Serving-size value. It is converted with ``str()`` first,
            so non-string inputs such as NaN are accepted.

    Returns:
        The quantity truncated to an ``int``, or ``None`` when no supported
        unit is found in the text.
    """
    text = str(serving)
    number = r'(\d+(?:\.\d+)?)'
    # (pattern, multiplier) pairs, listed in the order they are tried.
    unit_rules = (
        (number + r'\s*g(?:rams?)?\b', 1.0),
        (number + r'\s*fl\.?\s*oz\b', 29.57),
        (number + r'\s*ml\b', 1.0),
    )
    for pattern, factor in unit_rules:
        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match:
            # int() truncates toward zero, as the integer-only version did.
            return int(float(match.group(1)) * factor)
    return None

# Derive a numeric serving-size column from the free-text 'Serving Size' field.
# NOTE: 'Serving Size (g)' is not listed in feature_cols below, so the model
# never sees it.
df['Serving Size (g)'] = df['Serving Size'].apply(extract_grams_extended)

# Column to predict and the nutrition columns used as predictors
target_col = 'Calories'
feature_cols = [
    'Calories from Fat',
    'Total Fat',
    'Total Fat (% Daily Value)',
    'Saturated Fat',
    'Saturated Fat (% Daily Value)',
    'Protein',
    'Carbohydrates',
    'Carbohydrates (% Daily Value)',
    'Sodium',
    'Sodium (% Daily Value)',
]

# Keep only the modelling columns and discard rows with any missing value
df_clean = df.loc[:, feature_cols + [target_col]].dropna()

# Plain arrays for scikit-learn
X, y = df_clean[feature_cols].values, df_clean[target_col].values

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Report the size of each partition
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

# Standardise the features with statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the linear model and predict the held-out rows
model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Score the predictions against the held-out targets
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("\nModel Evaluation:")
print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")


X_train shape: (182, 10)
X_test shape: (78, 10)

Model Evaluation:
R² Score: 0.9995
Mean Absolute Error: 4.11
Mean Squared Error: 32.63


In [7]:
# Side-by-side table of the held-out calorie values and the model's
# predictions, with predictions rounded to two decimals for readability
comparison_df = pd.DataFrame({
    'Actual Calories': y_test,
    'Predicted Calories': pd.Series(y_pred).round(2),
})

# Display the first ten rows
sample_rows = comparison_df.head(10)
print("\n🔍 Actual vs Predicted Calories (sample):")
print(sample_rows)



🔍 Actual vs Predicted Calories (sample):
   Actual Calories  Predicted Calories
0              690              691.83
1              270              268.94
2              450              448.89
3              390              390.78
4              290              293.44
5              810              829.11
6              240              242.28
7              180              181.38
8              470              469.72
9              220              218.41


In [3]:
# Print the learned coefficient array and the intercept of whichever fitted
# estimator is currently bound to `model` (this cell relies on an earlier cell
# having defined and fitted it).
print(model.coef_, model.intercept_)

[ 17.79976532  42.551689    57.51252425  -1.50706526   2.63905175
  42.94613093 126.48586269 -12.73003016  12.89899795 -13.35353482] 370.1923076923077


In [6]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Load dataset: read 'Mc.Donalds_menu.csv' (a relative path, so it is resolved
# against the current working directory) into `df`, rebinding any `df` left
# over from an earlier cell.
df = pd.read_csv('Mc.Donalds_menu.csv')

# Parse the free-text 'Serving Size' value into a whole number of grams
def extract_grams_extended(serving):
    """Parse a serving-size description into a whole number of grams.

    Units are tried in a fixed priority order: grams, then fluid ounces,
    then millilitres. Fluid ounces are converted with 29.57 ml per fl oz,
    and a millilitre count is returned unchanged (i.e. volumes are treated
    as if 1 ml weighed 1 g).

    Decimal quantities are read in full ("16.9 fl oz" is 16.9, not 9), and
    the unit must end at a word boundary so that words merely starting with
    "g" (e.g. "6 grilled") are not mistaken for grams.

    Args:
        serving: Serving-size value. It is converted with ``str()`` first,
            so non-string inputs such as NaN are accepted.

    Returns:
        The quantity truncated to an ``int``, or ``None`` when no supported
        unit is found in the text.
    """
    text = str(serving)
    number = r'(\d+(?:\.\d+)?)'
    # (pattern, multiplier) pairs, listed in the order they are tried.
    unit_rules = (
        (number + r'\s*g(?:rams?)?\b', 1.0),
        (number + r'\s*fl\.?\s*oz\b', 29.57),
        (number + r'\s*ml\b', 1.0),
    )
    for pattern, factor in unit_rules:
        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match:
            # int() truncates toward zero, as the integer-only version did.
            return int(float(match.group(1)) * factor)
    return None

# Derive a numeric serving-size column from the free-text 'Serving Size' field.
# NOTE: 'Serving Size (g)' is not listed in feature_cols below, so neither
# model uses it.
df['Serving Size (g)'] = df['Serving Size'].apply(extract_grams_extended)

# Column to predict and the nutrition columns used as predictors
target_col = 'Calories'
feature_cols = [
    'Calories from Fat',
    'Total Fat',
    'Total Fat (% Daily Value)',
    'Saturated Fat',
    'Saturated Fat (% Daily Value)',
    'Protein',
    'Carbohydrates',
    'Carbohydrates (% Daily Value)',
    'Sodium',
    'Sodium (% Daily Value)',
]

# Restrict to the modelling columns, drop incomplete rows, convert to arrays
df_clean = df.loc[:, feature_cols + [target_col]].dropna()
X, y = df_clean[feature_cols].values, df_clean[target_col].values

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise the features with statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline: plain linear regression on the scaled features
model_linear = LinearRegression().fit(X_train_scaled, y_train)
y_pred_linear = model_linear.predict(X_test_scaled)

# Degree-2 polynomial expansion of the scaled features feeding a linear fit
poly_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_model.fit(X_train_scaled, y_train)
y_pred_poly = poly_model.predict(X_test_scaled)

# Evaluation helper
def print_scores(name, y_test, y_pred):
    """Print R², MAE and MSE for one model's predictions.

    Args:
        name: Label shown in the report header.
        y_test: True target values.
        y_pred: Predicted target values, passed to the metrics with y_test.
    """
    print(f"\n {name} Model Evaluation:")
    # (label, metric function, format spec) for each line of the report
    report_rows = (
        ("R² Score", r2_score, ".4f"),
        ("MAE", mean_absolute_error, ".2f"),
        ("MSE", mean_squared_error, ".2f"),
    )
    for label, metric, spec in report_rows:
        print(f"{label}: {metric(y_test, y_pred):{spec}}")

# Report both models against the same held-out targets
for model_label, predictions in (
    ("Linear Regression", y_pred_linear),
    ("Polynomial Regression (deg=2)", y_pred_poly),
):
    print_scores(model_label, y_test, predictions)



 Linear Regression Model Evaluation:
R² Score: 0.9995
MAE: 4.11
MSE: 32.63

 Polynomial Regression (deg=2) Model Evaluation:
R² Score: 0.9994
MAE: 4.51
MSE: 42.24


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [12]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Load dataset: read 'Mc.Donalds_menu.csv' (a relative path, so it is resolved
# against the current working directory) into `df`, rebinding any `df` left
# over from an earlier cell.
df = pd.read_csv('Mc.Donalds_menu.csv')

# Parse the free-text 'Serving Size' value into a whole number of grams
def extract_grams_extended(serving):
    """Parse a serving-size description into a whole number of grams.

    Units are tried in a fixed priority order: grams, then fluid ounces,
    then millilitres. Fluid ounces are converted with 29.57 ml per fl oz,
    and a millilitre count is returned unchanged (i.e. volumes are treated
    as if 1 ml weighed 1 g).

    Decimal quantities are read in full ("16.9 fl oz" is 16.9, not 9), and
    the unit must end at a word boundary so that words merely starting with
    "g" (e.g. "6 grilled") are not mistaken for grams.

    Args:
        serving: Serving-size value. It is converted with ``str()`` first,
            so non-string inputs such as NaN are accepted.

    Returns:
        The quantity truncated to an ``int``, or ``None`` when no supported
        unit is found in the text.
    """
    text = str(serving)
    number = r'(\d+(?:\.\d+)?)'
    # (pattern, multiplier) pairs, listed in the order they are tried.
    unit_rules = (
        (number + r'\s*g(?:rams?)?\b', 1.0),
        (number + r'\s*fl\.?\s*oz\b', 29.57),
        (number + r'\s*ml\b', 1.0),
    )
    for pattern, factor in unit_rules:
        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match:
            # int() truncates toward zero, as the integer-only version did.
            return int(float(match.group(1)) * factor)
    return None

# Derive a numeric serving-size column from the free-text 'Serving Size' field.
# NOTE: 'Serving Size (g)' is not listed in numeric_features below, so neither
# model uses it.
df['Serving Size (g)'] = df['Serving Size'].apply(extract_grams_extended)

# Column to predict and the numeric nutrition columns used as predictors
target_col = 'Calories'
numeric_features = [
    'Calories from Fat',
    'Total Fat',
    'Total Fat (% Daily Value)',
    'Saturated Fat',
    'Saturated Fat (% Daily Value)',
    'Protein',
    'Carbohydrates',
    'Carbohydrates (% Daily Value)',
    'Sodium',
    'Sodium (% Daily Value)',
]

# Keep the modelling columns plus 'Category' and drop incomplete rows
df_clean = df.loc[:, numeric_features + ['Category', target_col]].dropna()
y = df_clean[target_col].values

# ========== VERSION A: Without Category ==========
X_a = df_clean[numeric_features].values

# ========== VERSION B: With Category ==========
# One-hot encode 'Category', dropping the first level, and append the
# indicator columns to the numeric features.
category_dummies = pd.get_dummies(df_clean['Category'], drop_first=True).astype(int)
X_b = pd.concat([df_clean[numeric_features], category_dummies], axis=1).values

# ========== Split Data ==========
# Splitting both feature matrices in a single call applies the same seeded
# row selection to X_a, X_b and y.
X_a_train, X_a_test, X_b_train, X_b_test, y_train, y_test = train_test_split(
    X_a, X_b, y, test_size=0.3, random_state=42
)

# ========== Scale ==========
# Each variant gets its own scaler, fitted on its training rows only.
scaler_a = StandardScaler().fit(X_a_train)
X_a_train_scaled = scaler_a.transform(X_a_train).astype(np.float64)
X_a_test_scaled = scaler_a.transform(X_a_test).astype(np.float64)

scaler_b = StandardScaler().fit(X_b_train)
X_b_train_scaled = scaler_b.transform(X_b_train).astype(np.float64)
X_b_test_scaled = scaler_b.transform(X_b_test).astype(np.float64)

# ========== Train Both Models ==========
model_a = LinearRegression().fit(X_a_train_scaled, y_train)
y_pred_a = model_a.predict(X_a_test_scaled)

model_b = LinearRegression().fit(X_b_train_scaled, y_train)
y_pred_b = model_b.predict(X_b_test_scaled)

# ========== Evaluation ==========
def evaluate(name, y_true, y_pred):
    """Print R², MAE and MSE for one model's predictions.

    Args:
        name: Label shown as the report header.
        y_true: True target values.
        y_pred: Predicted target values, passed to the metrics with y_true.
    """
    print(f"\n📊 {name}")
    # (label, metric function, format spec) for each line of the report
    report_rows = (
        ("R² Score", r2_score, ".4f"),
        ("MAE", mean_absolute_error, ".2f"),
        ("MSE", mean_squared_error, ".2f"),
    )
    for label, metric, spec in report_rows:
        print(f"{label}: {metric(y_true, y_pred):{spec}}")

# Report both variants against the same held-out targets
for model_label, predictions in (
    ("Model A (Without Category)", y_pred_a),
    ("Model B (With Category)", y_pred_b),
):
    evaluate(model_label, y_test, predictions)

# ========== Comparison Table ==========
# True values next to each variant's predictions, rounded to two decimals
comparison_df = pd.DataFrame({
    'Actual Calories': y_test,
    'Predicted (No Category)': y_pred_a.round(2),
    'Predicted (With Category)': y_pred_b.round(2),
})

print("\n🔍 Sample Comparison:\n", comparison_df.head(10))

# Optional: Save to CSV
# comparison_df.to_csv('compare_predictions_category.csv', index=False)



📊 Model A (Without Category)
R² Score: 0.9995
MAE: 4.11
MSE: 32.63

📊 Model B (With Category)
R² Score: 0.9996
MAE: 3.81
MSE: 26.35

🔍 Sample Comparison:
    Actual Calories  Predicted (No Category)  Predicted (With Category)
0              690                   691.83                     692.02
1              270                   268.94                     269.59
2              450                   448.89                     447.94
3              390                   390.78                     390.42
4              290                   293.44                     293.40
5              810                   829.11                     829.95
6              240                   242.28                     242.99
7              180                   181.38                     182.54
8              470                   469.72                     469.47
9              220                   218.41                     219.23


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
