<a href="https://colab.research.google.com/github/Ananda948/FastFood-Prediction/blob/main/fastfood_nutrition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Importing Dataset

# Mount Google Drive into the Colab runtime so files stored there can be read
from google.colab import drive

drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

# Importing necessary libraries
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import seaborn as sns
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Read the fast-food nutrition CSV from the mounted Drive folder
csv_path = "/content/drive/MyDrive/foodDataset/FastFoodNutritionMenuV3.csv"
dataset = pd.read_csv(csv_path)

# Report the type of the loaded object, then preview its first 5 rows
print(f"Dataset type: {type(dataset)}")
dataset.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset type: <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Company,Item,Calories,Calories from\nFat,Total Fat\n(g),Saturated Fat\n(g),Trans Fat\n(g),Cholesterol\n(mg),Sodium \n(mg),Carbs\n(g),Fiber\n(g),Sugars\n(g),Protein\n(g),Weight Watchers\nPnts
0,McDonald’s,Hamburger,250,80,9,3.5,0.5,25,520,31,2,6,12,247.5
1,McDonald’s,Cheeseburger,300,110,12,6.0,0.5,40,750,33,2,6,15,297.0
2,McDonald’s,Double Cheeseburger,440,210,23,11.0,1.5,80,1150,34,2,7,25,433.0
3,McDonald’s,McDouble,390,170,19,8.0,1.0,65,920,33,2,7,22,383.0
4,McDonald’s,Quarter Pounder® with Cheese,510,230,26,12.0,1.5,90,1190,40,3,9,29,502.0


In [None]:
# 2. Cleaning Dataset

# a. Remove duplicate rows
dataset.drop_duplicates(inplace=True)

# b. Remove every row that has at least one missing value.
#    The former step "c. Replace Empty Values with 0" (`dataset.fillna(0).head(10)`)
#    was removed: its result was neither assigned nor displayed, and after the
#    dropna() above there are no missing values left to fill.
dataset.dropna(inplace=True)

# c. Remove outliers
# Numeric columns to process. The embedded "\n" is part of these header names
# as they are used throughout this notebook, so it is kept verbatim.
columns = ['Calories', 'Sugars\n(g)', 'Sodium \n(mg)', 'Weight Watchers\nPnts']

# Trim leading/trailing whitespace from the column names. str.strip() only
# touches the ends of each name, so a newline in the middle of a name stays
# and the entries of `columns` are unaffected.
dataset.columns = dataset.columns.str.strip()

# Convert the selected columns to numeric; values that cannot be parsed become NaN
dataset[columns] = dataset[columns].apply(pd.to_numeric, errors='coerce')

# Drop the rows in which any of the selected columns could not be parsed
dataset.dropna(subset=columns, inplace=True)

# Inter-quartile range of each selected column
Q1 = dataset[columns].quantile(0.25)
Q3 = dataset[columns].quantile(0.75)
IQR = Q3 - Q1

# Lower and upper fences used by the outlier filter
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is an outlier when at least one selected column lies outside its fences
is_outlier = ((dataset[columns] < lower_bound) | (dataset[columns] > upper_bound)).any(axis=1)
dataset_filtered = dataset[~is_outlier]
# NOTE(review): the later cells of this notebook read `dataset`, not
# `dataset_filtered`, so the outlier filter does not influence the statistics,
# the plots or the model. Confirm whether that is intended.

print("----------------------------- Filtered Dataset -----------------------------")
print(dataset_filtered)

# Check Missing Values
print("----------------------------- Check Missing Values -----------------------------")
print(dataset_filtered.isnull().sum())

----------------------------- Filtered Dataset -----------------------------
         Company                               Item  Calories  \
0     McDonald’s                          Hamburger     250.0   
1     McDonald’s                       Cheeseburger     300.0   
2     McDonald’s                Double Cheeseburger     440.0   
3     McDonald’s                           McDouble     390.0   
4     McDonald’s       Quarter Pounder® with Cheese     510.0   
..           ...                                ...       ...   
513  Burger King    BK® Café Mocha Frappe- 16 fl oz     400.0   
514  Burger King    BK® Café Mocha Frappe- 20 fl oz     510.0   
515  Burger King  BK® Café Caramel Frappe- 12 fl oz     300.0   
516  Burger King  BK® Café Caramel Frappe- 16 fl oz     400.0   
517  Burger King  BK® Café Caramel Frappe- 20 fl oz     500.0   

    Calories from\nFat Total Fat\n(g) Saturated Fat\n(g) Trans Fat\n(g)  \
0                   80              9                3.5           

In [None]:
# 3. Menghitung Mean, Median, dan Standard Deviation untuk Calories, Sugars, Sodium, dan Weight Watchers Points

# Fungsi untuk menghitung mean, median, dan standard deviation dari kolom tertentu
def calculate_statistics(df, column_name):
    """Print and return the mean, median and standard deviation of one column.

    The column is converted to numeric on a local copy; values that cannot be
    parsed are ignored. Unlike the previous version, the caller's DataFrame is
    not modified (no column is overwritten and no rows are dropped), so
    computing statistics has no side effects beyond printing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column_name``.
    column_name : str
        Name of the column to summarise.

    Returns
    -------
    dict
        ``{"mean": ..., "median": ..., "std_dev": ...}`` for the numeric
        values of the column.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    # Work on a derived Series so the input frame stays untouched
    values = pd.to_numeric(df[column_name], errors='coerce').dropna()

    mean = values.mean()
    median = values.median()
    std_dev = values.std()

    print(f"{column_name}")
    print(f"Mean: {mean:.2f}")
    print(f"Median: {median:.2f}")
    print(f"Standard Deviation: {std_dev:.2f}")
    print("--------------------------------------------------------")

    return {"mean": mean, "median": median, "std_dev": std_dev}

# Columns whose summary statistics are reported
columns_to_calculate = [
    'Calories',
    'Sugars\n(g)',
    'Sodium \n(mg)',
    'Weight Watchers\nPnts',
]

# Report the statistics of each listed column in turn
for column in columns_to_calculate:
    calculate_statistics(dataset, column)


Calories
Mean: 311.70
Median: 260.00
Standard Deviation: 242.92
--------------------------------------------------------
Sugars
(g)
Mean: 28.14
Median: 13.00
Standard Deviation: 32.14
--------------------------------------------------------
Sodium 
(mg)
Mean: 409.80
Median: 160.00
Standard Deviation: 510.40
--------------------------------------------------------
Weight Watchers
Pnts
Mean: 334.76
Median: 290.00
Standard Deviation: 251.08
--------------------------------------------------------


In [None]:
# 4. Data Visualization
# Point Range Plot : Fast Food Menu Calories

# Make sure 'Calories' is numeric; values that cannot be parsed become 0
dataset['Calories'] = pd.to_numeric(dataset['Calories'], errors='coerce').fillna(0)

# Minimum and maximum calories of every (Company, Item) pair
calories_range = dataset.groupby(['Company', 'Item'])['Calories'].agg(['min', 'max']).reset_index()

# Keep the first 15 groups. The explicit copy gives an independent frame, so
# adding a column below no longer raises pandas' SettingWithCopyWarning.
calories_range_15 = calories_range.head(15).copy()

# Axis label in the form "Company - Item"
calories_range_15['Item_with_Company'] = calories_range_15['Company'] + ' - ' + calories_range_15['Item']

# Build the figure with Plotly
fig = go.Figure()

# Markers for the minimum calories of each item
fig.add_trace(go.Scatter(
    x=calories_range_15['Item_with_Company'],
    y=calories_range_15['min'],
    mode='markers',
    name='Minimum Calories',
    marker=dict(color='blue', size=10),
    hoverinfo='text',
    text=[f"Min: {min_val}" for min_val in calories_range_15['min']]
))

# Markers for the maximum calories of each item
fig.add_trace(go.Scatter(
    x=calories_range_15['Item_with_Company'],
    y=calories_range_15['max'],
    mode='markers',
    name='Maximum Calories',
    marker=dict(color='red', size=10),
    hoverinfo='text',
    text=[f"Max: {max_val}" for max_val in calories_range_15['max']]
))

# One dashed vertical line per item, joining its minimum and maximum
for item_label, min_calories, max_calories in zip(
    calories_range_15['Item_with_Company'],
    calories_range_15['min'],
    calories_range_15['max'],
):
    fig.add_shape(
        type='line',
        x0=item_label,
        y0=min_calories,
        x1=item_label,
        y1=max_calories,
        line=dict(color='gray', dash='dash')
    )

# Figure layout
fig.update_layout(
    title='Point Range Plot of Fast Food Calories',
    xaxis_title='Fast Food Menu Item',
    yaxis_title='Calories',
    xaxis_tickangle=-45,
    margin=dict(l=40, r=40, t=80, b=120),
    plot_bgcolor='rgba(240, 240, 240, 1)',
    font=dict(family='Arial, sans-serif', size=12),
    width=1000,
    height=600
)

# Render the figure
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Bar Chart : Fast Food Menu with Highest Calories

# Coerce 'Calories' to whole numbers; values that fail to parse count as 0
calories_numeric = pd.to_numeric(dataset['Calories'], errors='coerce')
dataset['Calories'] = calories_numeric.fillna(0).astype(int)

# Total calories per (Company, Item) pair, keeping the 15 largest totals
calories_per_item = dataset.groupby(['Company', 'Item'])['Calories'].sum()
highestcal_food = calories_per_item.nlargest(15).reset_index()
highestcal_food['Company_Item'] = highestcal_food['Company'] + ' - ' + highestcal_food['Item']

# Bar chart of those items, labelled and coloured by their calorie total
bar_options = dict(
    x='Company_Item',
    y='Calories',
    title='Fast Food Menu with Highest Calories',
    labels={'Calories': 'Calories', 'Company_Item': 'Fast Food Menu'},
    text='Calories',
    color='Calories',
    color_continuous_scale=px.colors.sequential.Greens,
)
fig = px.bar(highestcal_food, **bar_options)

# Value labels above the bars, slanted tick labels, larger title fonts
fig.update_traces(textposition='outside')
fig.update_layout(
    xaxis_tickangle=-45,
    title_font=dict(size=24),
    yaxis_title_font=dict(size=18),
    xaxis_title_font=dict(size=18),
    margin=dict(l=40, r=40, t=80, b=40),
)

fig.show()

In [None]:
# Scatter Plot : Fast Food Menu with Highest Sodium and Sugars

sugar_col = 'Sugars\n(g)'
sodium_col = 'Sodium \n(mg)'

# Make sure both nutrient columns are numeric; unparseable values become 0
dataset[[sugar_col, sodium_col]] = dataset[[sugar_col, sodium_col]].apply(pd.to_numeric, errors='coerce').fillna(0)

# Pick one item per company: the one with the highest sugars, using sodium to
# break ties. The previous expression took `idxmax()` of both columns and then
# `.max(axis=1)`, i.e. the larger of two row *labels*, so the chosen row
# depended on its position in the file rather than on its nutrient values.
top_item_per_company = (
    dataset.sort_values([sugar_col, sodium_col], ascending=False)
    .groupby('Company', sort=False)
    .head(1)
)

# Keep at most 6 companies, ranked by the same (sugars, sodium) ordering
sampled_data = top_item_per_company.nlargest(6, [sugar_col, sodium_col])

# Build the scatter plot
fig = px.scatter(sampled_data,
                 x=sugar_col, y=sodium_col,
                 color='Company',
                 size_max=15,
                 text='Item',
                 title='Scatter Plot of Fast Food Menu with Highest Sugars and Sodium',
                 labels={sugar_col: 'Sugars (g)', sodium_col: 'Sodium (mg)'},
                 hover_data=['Company', 'Item'])

# Fixed marker size with the item name above each point
fig.update_traces(marker=dict(size=20), textposition='top center')
fig.update_layout(title_font=dict(size=24), legend=dict(title='Company', font=dict(size=14)),
                  xaxis=dict(title_font=dict(size=18)), yaxis=dict(title_font=dict(size=18)),
                  margin=dict(l=40, r=40, t=80, b=40))

fig.show()


In [None]:
# Box Plot : Fast Food Menu by Weight Watchers Points

points_col = 'Weight Watchers\nPnts'
excluded_companies = ['Pizza Hut', 'Taco Bell']

# Coerce the points column to numeric; values that fail to parse count as 0
dataset[points_col] = pd.to_numeric(dataset[points_col], errors='coerce').fillna(0)

# Leave Pizza Hut and Taco Bell out of the plot
is_excluded = dataset['Company'].isin(excluded_companies)
filtered_dataset = dataset[~is_excluded]

# Total points per (Company, Item) pair, as a flat frame for plotting
data_for_boxplot = filtered_dataset.groupby(['Company', 'Item'], as_index=False)[points_col].sum()

# Interactive box plot, one box per company
fig = px.box(
    data_for_boxplot,
    x='Company',
    y=points_col,
    title='Box Plot of Fast Food Menu by Weight Watchers Points (Excluding Pizza Hut & Taco Bell)',
    labels={points_col: 'Weight Watchers Points', 'Company': 'Fast Food Company'},
    color='Company',
    template='plotly_white'
)

# Font sizes, and a y-axis that starts at 0 and ends 5 above the largest value
y_axis_top = data_for_boxplot[points_col].max() + 5
fig.update_layout(
    title_font_size=22,
    xaxis_title_font_size=18,
    yaxis_title_font_size=18,
    yaxis=dict(range=[0, y_axis_top])
)

# Render the figure
fig.show()


In [None]:
# Random Forest Prediction for Fast Food Nutritions

# Make sure the modelling columns are numeric (unparseable values become NaN)
columns = ['Calories', 'Sugars\n(g)', 'Sodium \n(mg)', 'Weight Watchers\nPnts']
dataset[columns] = dataset[columns].apply(pd.to_numeric, errors='coerce')

# Select 50 random menu items from the dataset (fixed seed for reproducibility)
random_items = dataset.sample(n=50, random_state=42).reset_index(drop=True)

# Feature and target selection
features = random_items[['Sugars\n(g)', 'Sodium \n(mg)', 'Weight Watchers\nPnts']]
target = random_items['Calories']

# Split BEFORE scaling so the scaler never sees the held-out rows. Fitting the
# scaler on all rows (as was done previously) leaks test-set statistics into
# the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.33, random_state=42
)

# Fit the scaler on the training rows only, then apply it everywhere
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(features)  # all sampled rows, used for the chart below

# Train the model with fixed hyperparameters
rf_model = RandomForestRegressor(n_estimators=300, max_depth=10, min_samples_split=5, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on test set
y_pred = rf_model.predict(X_test)

# Evaluation metrics on the held-out rows
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
ia = 1 - (mae / np.mean(y_test))  # "Improvement Accuracy": 1 - MAE / mean of the actual test values

# Print evaluation results as a 57-character-wide table. Each row is
# "| " + label padded to 27 + value padded to 27 + "|". The last label used to
# read "(IA" with the closing "):" missing.
metric_rows = [
    ("Mean Squared Error (MSE):", f"{mse:.2f}"),
    ("R-squared (R²):", f"{r2:.2f}"),
    ("Improvement Accuracy (IA):", f"{ia:.2%}"),
]
print("+" + "-" * 55 + "+")
print("|    Evaluasi Model Random Forest                       |")
print("+" + "-" * 55 + "+")
for metric_label, metric_value in metric_rows:
    print(f"| {metric_label:<27}{metric_value:<27}|")
print("+" + "-" * 55 + "+")


# Prepare data for visualization.
# NOTE(review): predictions are made for all 50 sampled rows, including the
# rows the model was trained on, so the chart will tend to look closer than
# the held-out metrics above suggest.
random_items['Predicted Calories'] = rf_model.predict(X_scaled)

# Combine item and company for x-axis labels (if available)
if 'Item' in random_items.columns and 'Company' in random_items.columns:
    x_labels = random_items['Item'] + ' (' + random_items['Company'] + ')'
else:
    # If 'Item' and 'Company' columns don't exist, use indices as labels
    x_labels = random_items.index.astype(str)

# Create grouped bar plot: actual vs. predicted calories per item
fig = go.Figure()

# Actual Calories
fig.add_trace(go.Bar(
    x=x_labels,  # Use combined labels for x-axis
    y=random_items['Calories'],
    name='Actual Calories',
    marker_color='blue'
))

# Predicted Calories
fig.add_trace(go.Bar(
    x=x_labels,  # Use combined labels for x-axis
    y=random_items['Predicted Calories'],
    name='Predicted Calories',
    marker_color='orange'
))

# Update layout
fig.update_layout(
    title='Perbandingan Kalori Aktual dan Kalori Prediksi',
    xaxis_title='Menu Item (Perusahaan)',
    yaxis_title='Calories',
    barmode='group',
    template='plotly_white'
)

# Show plot
fig.show()


+-------------------------------------------------------+
|    Evaluasi Model Random Forest                       |
+-------------------------------------------------------+
| Mean Squared Error (MSE): 1520.06                     |
| R-squared (R²):           0.98                        |
| Improvement Accuracy (IA  90.53%                      |
+-------------------------------------------------------+
