 # Food Coded Cleaned Dataset Analysis

 This notebook preprocesses the dataset by handling missing values and one-hot encoding categorical features, then fits several regression models (Linear Regression, Decision Tree, Random Forest, Gradient Boosting, SVR, and KNN) to predict the `GPA` column and compares them using Mean Squared Error on a held-out 30% test split.

In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Load the data
# The CSV location can be overridden through the FOOD_DATA_PATH environment
# variable so the notebook is not tied to a single machine's directory layout.
# When the variable is unset, the original default location is used.
file_path = os.environ.get(
    'FOOD_DATA_PATH',
    'C:/Users/Me/Downloads/food_coded_cleaned.csv',
)
data = pd.read_csv(file_path)

# Print column names and the first few rows
print("Column names:")
print(data.columns)
print("\nFirst few rows of the dataset:")
print(data.head())

# Preprocessing
target_column = 'GPA'

# 1. Coerce the target to numeric; entries that cannot be parsed become NaN.
# 2. Drop the rows left without a usable target value. No second conversion
#    is needed afterwards: the column is already numeric after step 1.
# 3. Forward-fill missing values in the remaining columns. DataFrame.ffill()
#    replaces fillna(method='ffill'), which is deprecated in pandas 2.x and
#    emits a FutureWarning.
# Rebinding `data` instead of mutating with inplace=True keeps each step
# explicit and chainable.
# NOTE(review): forward fill copies the value from the preceding row, which
# presumably belongs to an unrelated respondent -- confirm this imputation
# is intended for this dataset.
data = (
    data
    .assign(**{target_column: pd.to_numeric(data[target_column], errors='coerce')})
    .dropna(subset=[target_column])
    .ffill()
)

# Separate features and target
X = data.drop(columns=target_column)
y = data[target_column]

# One-hot encode categorical features (first level of each feature dropped)
X = pd.get_dummies(X, drop_first=True)

# Safety net: coerce any column that is still non-numeric; values that cannot
# be parsed become NaN and are handled by the fill below.
X = X.apply(pd.to_numeric, errors='coerce')

# Any value still missing at this point (e.g. leading rows that forward fill
# had nothing to copy from) is replaced with 0.
X = X.fillna(0)

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Standardize the features: fit the scaler on the training rows only, then
# apply that same fitted transformation to both partitions
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Define and evaluate models
# The tree-based estimators use randomness while fitting, so they are given a
# fixed random_state; without it the reported errors change from run to run
# and the notebook's results cannot be reproduced.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'SVR': SVR(),
    'KNN': KNeighborsRegressor()
}

# Fit each model on the training partition and report its Mean Squared Error
# on the held-out test partition (lower is better).
for name, model in models.items():
    print(f"\n{name}:")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse)

Column names:
Index(['GPA', 'Gender', 'breakfast', 'calories_chicken', 'calories_day',
       'calories_scone', 'coffee', 'comfort_food', 'comfort_food_reasons',
       'comfort_food_reasons_coded', 'cook', 'comfort_food_reasons_coded.1',
       'cuisine', 'diet_current', 'diet_current_coded', 'drink',
       'eating_changes', 'eating_changes_coded', 'eating_changes_coded1',
       'eating_out', 'employment', 'ethnic_food', 'exercise',
       'father_education', 'father_profession', 'fav_cuisine',
       'fav_cuisine_coded', 'fav_food', 'food_childhood', 'fries', 'fruit_day',
       'grade_level', 'greek_food', 'healthy_feeling', 'healthy_meal',
       'ideal_diet', 'ideal_diet_coded', 'income', 'indian_food',
       'italian_food', 'life_rewarding', 'marital_status',
       'meals_dinner_friend', 'mother_education', 'mother_profession',
       'nutritional_check', 'on_off_campus', 'parents_cook', 'pay_meal_out',
       'persian_food', 'self_perception_weight', 'soup', 'sports', 'thai_

  data.fillna(method='ffill', inplace=True)  # Forward fill



Linear Regression:
Mean Squared Error: 0.16938956290672547

Decision Tree:
Mean Squared Error: 0.2542033214285714

Random Forest:
Mean Squared Error: 0.16159898381428578

Gradient Boosting:
Mean Squared Error: 0.17598223594376589

SVR:
Mean Squared Error: 0.1610231803087349

KNN:
Mean Squared Error: 0.1897019842857142
