# DiaBite : Product Based Capstone Project
This Colab notebook preprocesses the datasets and clusters foods for food suggestions, using scikit-learn, pandas, NumPy, and matplotlib.

## Libraries

In [None]:
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install seaborn

%pip install gdown

%pip install scikit-learn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import gdown
import os

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

import json

## Data Gathering

The directory tree of your workspace should be like this:

workspace
```
|-- Data_Preprocessing_and_Clustering.ipynb
|-- dataset/
    |-- diabetes-dataset.csv
    |-- FOOD-DATA-GROUP1.csv
    |-- FOOD-DATA-GROUP2.csv
    |-- FOOD-DATA-GROUP3.csv
    |-- FOOD-DATA-GROUP4.csv
    |-- FOOD-DATA-GROUP5.csv
```

### Download Dataset From Drive (Optional)

Run the following code if you do not have the dataset yet.

In [None]:
# Create the dataset folder if it is missing (no error when it already exists)
os.makedirs("dataset", exist_ok=True)

In [None]:
# Fetch every file of the shared Drive folder (the food CSVs) into ./dataset
gdown.download_folder(
    "https://drive.google.com/drive/folders/1l6UEQH04_Lx6mwklnnvwWHmpk2bo45p9?usp=sharing",
    output="dataset",
    quiet=True,
    use_cookies=False,
)

In [None]:
# Fetch the diabetes CSV. The URL is a "view" share link rather than a direct
# download link, hence fuzzy=True.
gdown.download(
    'https://drive.google.com/file/d/1DuG0W9gF74BvXNuied2YhU1VOvZq53sS/view?usp=drive_link',
    output='dataset/diabetes-dataset.csv',
    quiet=False,
    fuzzy=True,
)

### Gather Diabetes Dataset

In [None]:
# Load the diabetes survey data and list its columns
diabetes_df = pd.read_csv("dataset/diabetes-dataset.csv")
diabetes_df.columns

### Gather Food Dataset(s)

**Method 1**

In [None]:
# One DataFrame per food group file (FOOD-DATA-GROUP1.csv ... FOOD-DATA-GROUP5.csv)
foods_df = [
    pd.read_csv(f"dataset/FOOD-DATA-GROUP{group_no}.csv")
    for group_no in range(1, 6)
]

**Method 2**

In [None]:
directory = 'dataset'

# os.listdir() returns names in arbitrary order, so sort them to keep the row
# order of the combined frame reproducible. Prefix and suffix are both matched
# case-insensitively.
food_files = sorted(
    filename
    for filename in os.listdir(directory)
    if filename.lower().startswith("food-data") and filename.lower().endswith(".csv")
)
if not food_files:
    # pd.concat() on an empty list raises a much less helpful ValueError
    raise FileNotFoundError(f"No FOOD-DATA*.csv files found in '{directory}'")

# Read every matching CSV file
temp_foods_df = [pd.read_csv(os.path.join(directory, filename)) for filename in food_files]

# Concatenate all dataframes; kept as a one-element list so the later cells,
# which loop over foods_df, work for both loading methods
foods_df = [pd.concat(temp_foods_df, ignore_index=True)]

## Data Cleaning


### Cleaning Diabetes Dataset

In [None]:
# Print summary statistics (count, mean, std, quartiles, ...) of the diabetes dataset
print(diabetes_df.describe())

In [None]:
# Check the total number of rows
print("total rows : ", len(diabetes_df))

In [None]:
# Count missing values in the health-indicator columns
checked_columns = [
    'GenHlth', 'HighBP', 'BMI', 'DiffWalk', 'HighChol',
    'Age', 'HeartDiseaseorAttack', 'PhysHlth', 'Stroke', 'MentHlth',
]
missing_values = diabetes_df[checked_columns].isna().sum()
print("Missing values per column:\n", missing_values)

In [None]:
# Handle BMI outliers using the 1.5 * IQR rule
bmi = diabetes_df['BMI']
Q1, Q3 = bmi.quantile(0.25), bmi.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# True where the BMI lies outside [lower_bound, upper_bound]
is_outlier = (bmi < lower_bound) | (bmi > upper_bound)

# Copy of the data without the outlier rows
diabetes_df_filtered = diabetes_df[~is_outlier]

# The median is taken over the full column, before any replacement
median_bmi = bmi.median()

# Overwrite the outliers with that median, leaving other values untouched
diabetes_df['BMI'] = np.where(is_outlier, median_bmi, bmi)

print(diabetes_df.describe())

### Cleaning Food Dataset(s)

In [None]:
# Column groups reused by the derived features below
nutrient_columns = [
    'Carbohydrates', 'Protein', 'Dietary Fiber',
    'Saturated Fats', 'Monounsaturated Fats', 'Polyunsaturated Fats',
    'Vitamin D', 'Magnesium', 'Potassium',
]
micronutrient_columns = ['Vitamin D', 'Magnesium', 'Potassium']

# Add derived nutrition features to every food group, in place.
# Divisions by zero produce inf/NaN here; later cells drop those rows.
for group_idx, group in enumerate(foods_df):
    calories = group['Caloric Value']
    has_fat = 'Fat' in group.columns

    # Feature: total nutrition (row-wise sum of the nutrient columns)
    group['Total_Nutrition'] = group[nutrient_columns].sum(axis=1)

    # Feature: nutrition density (total nutrition per calorie)
    group['Nutrition_Density'] = group['Total_Nutrition'] / calories

    # Feature: glycemic load, only when a glycemic index is available
    if 'Glycemic Index' in group.columns:
        group['Glycemic_Load'] = group['Glycemic Index'] * group['Carbohydrates'] / 100
    else:
        print(f"Dataset {group_idx}: Kolom 'Glycemic Index' tidak tersedia. Skip fitur Glycemic Load.")

    # Feature: macronutrient ratios (4 kcal/g for protein and carbs, 9 kcal/g for fat)
    if has_fat:
        group['Protein_Ratio'] = group['Protein'] * 4 / calories
        group['Carbohydrate_Ratio'] = group['Carbohydrates'] * 4 / calories
        group['Fat_Ratio'] = group['Fat'] * 9 / calories
    else:
        print(f"Dataset {group_idx}: Kolom 'Fat' tidak tersedia. Skip fitur Macronutrient Ratios.")

    # Feature: nutrient-to-calorie ratio
    group['Micronutrient_Sum'] = group[micronutrient_columns].sum(axis=1)
    group['Nutrient_to_Calorie'] = group['Micronutrient_Sum'] / calories

    # Feature: fiber-to-sugar ratio
    group['Fiber_to_Sugar_Ratio'] = group['Dietary Fiber'] / group['Sugars']

    # Feature: saturated fat ratio
    if has_fat:
        group['Saturated_Fat_Ratio'] = group['Saturated Fats'] / group['Fat']
    else:
        print(f"Dataset {group_idx}: Kolom 'Fat' tidak tersedia. Skip fitur Saturated Fat Ratio.")


In [None]:
# Summary statistics for every food group
for food_df in foods_df:
    print(food_df.describe())

In [None]:
for food_df in foods_df:
    # Check whether there are NaN values
    nan_counts = food_df.isna().sum()
    print("Jumlah NaN di setiap kolom:")
    print(nan_counts)

    # Check whether there are inf or -inf values
    has_pos_inf = (food_df == float('inf')).any().any()
    has_neg_inf = (food_df == float('-inf')).any().any()
    print("Apakah ada nilai inf atau -inf?")
    print(has_pos_inf, has_neg_inf)


In [None]:
# Find the columns that contain inf values
for food_df in foods_df:
    column_has_inf = (food_df == float('inf')).any()
    inf_cols = food_df.columns[column_has_inf]
    print(f"Kolom dengan nilai inf: {inf_cols}")


In [None]:
# Drop rows with NaN in any of the derived ratio features
ratio_columns = [
    'Nutrition_Density', 'Protein_Ratio', 'Carbohydrate_Ratio', 'Fat_Ratio',
    'Nutrient_to_Calorie', 'Fiber_to_Sugar_Ratio', 'Saturated_Fat_Ratio',
]
for food_df in foods_df:
    # The Fat-based ratios are only created when the dataset has a 'Fat'
    # column, so restrict the subset to columns that exist; otherwise dropna
    # raises KeyError.
    present_columns = [column for column in ratio_columns if column in food_df.columns]
    if present_columns:
        food_df.dropna(subset=present_columns, inplace=True)


In [None]:
# Drop rows that contain an infinite value
for i, food_df in enumerate(foods_df):
    # np.isinf flags both +inf and -inf; only numeric columns can hold them
    numeric_part = food_df.select_dtypes(include=[np.number])
    has_inf = np.isinf(numeric_part).any(axis=1)
    # .copy() makes the result an independent frame, so later column
    # assignments do not trigger SettingWithCopyWarning
    foods_df[i] = food_df[~has_inf].copy()


In [None]:
# Remove the leftover index columns written by earlier CSV exports.
# errors="ignore" keeps the cell re-runnable and tolerates files that lack
# one of these columns, instead of raising KeyError.
for i, food_df in enumerate(foods_df):
    foods_df[i] = food_df.drop(columns=["Unnamed: 0.1", "Unnamed: 0"], errors="ignore")

In [None]:
# For each food group: plot the Pearson correlation heatmap of all features
# except the food name, then list feature pairs with |correlation| > threshold.
for food_df in foods_df:
    temp_food_df = food_df.drop(columns=["food"])
    correlation_matrix = temp_food_df.corr(method='pearson')

    plt.figure(figsize=(20, 8))
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1)
    plt.title("Heatmap Korelasi Fitur")
    plt.show()

    threshold = 0.8
    feature_names = correlation_matrix.columns
    redundant_features = []
    # Walk the upper triangle only, so each pair is reported once. The indices
    # are named row/col so they no longer shadow the loop over food groups.
    for row in range(len(feature_names)):
        for col in range(row + 1, len(feature_names)):
            corr_value = correlation_matrix.iloc[row, col]
            if abs(corr_value) > threshold:
                redundant_features.append((feature_names[row], feature_names[col], corr_value))

    print("Fitur-fitur dengan korelasi tinggi (redundan):")
    for feature_1, feature_2, corr in redundant_features:
        print(f"{feature_1} - {feature_2}: korelasi = {corr:.2f}")

## Clustering Food Dataset(s)

### Using sklearn K-Means

**Features 1**

In [None]:
# Feature set 1: raw nutrient columns used for clustering
food_indices = [
    'Caloric Value',
    'Carbohydrates',
    'Sugars',
    'Dietary Fiber',
    'Saturated Fats',
    'Monounsaturated Fats',
    'Polyunsaturated Fats',
    'Protein',
    'Vitamin D',
    'Magnesium',
    'Potassium',
]

**Features 2**

In [None]:
# Feature set 2: same as the raw nutrient set but without the three fat columns
food_indices = [
    'Caloric Value', 'Carbohydrates', 'Sugars', 'Dietary Fiber',
    'Protein', 'Vitamin D', 'Magnesium', 'Potassium',
]

**Features 3**

In [None]:
# Feature set 3: raw nutrient columns followed by the engineered ratio features
raw_nutrient_features = [
    'Caloric Value', 'Carbohydrates', 'Sugars', 'Protein', 'Dietary Fiber',
    'Saturated Fats', 'Monounsaturated Fats', 'Polyunsaturated Fats',
    'Vitamin D', 'Magnesium', 'Potassium',
]
engineered_features = [
    'Nutrition_Density',
    'Protein_Ratio', 'Carbohydrate_Ratio', 'Fat_Ratio',
    'Nutrient_to_Calorie', 'Fiber_to_Sugar_Ratio', 'Saturated_Fat_Ratio',
]
food_indices = raw_nutrient_features + engineered_features

In [None]:
# Standardise the selected features of each food group. The same scaler object
# is re-fitted per group, so afterwards it holds the last group's parameters.
scaler = StandardScaler()
foods_scaled = []
for food_df in foods_df:
    foods_scaled.append(scaler.fit_transform(food_df[food_indices]))

In [None]:
# Project each scaled food group onto its first two principal components
pca = PCA(n_components=2)
foods_pca = []
for food_scaled in foods_scaled:
    foods_pca.append(pca.fit_transform(food_scaled))

# Store both components next to the original rows for plotting
for food_df, projected in zip(foods_df, foods_pca):
    food_df['PCA1'] = projected[:, 0]
    food_df['PCA2'] = projected[:, 1]

In [None]:
# Elbow method: record the K-Means SSE (inertia) for k = 1..10 per food group
k_values = range(1, 11)
sses = []

for food_scaled in foods_scaled:
    sse = []
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42).fit(food_scaled)
        sse.append(kmeans.inertia_)
    sses.append(sse)

# One SSE-vs-k curve per food group
for group_no, sse in enumerate(sses, start=1):
    plt.figure(figsize=(8, 5))
    plt.plot(k_values, sse, marker='o')
    plt.xlabel('Jumlah cluster')
    plt.ylabel('SSE')
    plt.title(f'Elbow Food Group {group_no}')
    plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Experiment with several cluster counts and report the mean silhouette score.
# Each line names the food group it belongs to, so results stay
# distinguishable when foods_df holds more than one group.
for k in range(2, 10):
    for i, food_scaled in enumerate(foods_scaled):
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(food_scaled)
        silhouette_avg = silhouette_score(food_scaled, kmeans.labels_)
        print(f"[Food Group {i+1}] For k={k}, the Silhouette Score is {silhouette_avg:.2f}")

In [None]:
# Three clusters, to be read as: suggested, alternative, avoid
optimal_k = 3

# Fit one K-Means model per food group and store each row's label
for food_df, food_scaled in zip(foods_df, foods_scaled):
    kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    clusters = kmeans.fit_predict(food_scaled)
    food_df['Cluster'] = clusters

In [None]:
# Scatter plot of the 2-D PCA projection, coloured and shaped by cluster label
for group_no, food_df in enumerate(foods_df, start=1):
    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        data=food_df,
        x='PCA1',
        y='PCA2',
        hue='Cluster',
        style='Cluster',
        palette='tab10',
        alpha=0.9,
        s=100,
    )

    plt.title(f'PCA Visualization Food Group {group_no}')
    plt.legend(loc='best')
    plt.show()

In [None]:
# Show every group's cluster centres in the original feature units.
for i, food_scaled in enumerate(foods_scaled):
    # The scaler must be fitted on the RAW features: fitting it on the already
    # standardised data makes inverse_transform return values that are still
    # on the standardised scale.
    group_scaler = StandardScaler().fit(foods_df[i][food_indices])

    # Use this group's own model. Refitting on the same data with the same
    # random_state as the labelling step is intended to reproduce the centres
    # behind foods_df[i]['Cluster']; the global `kmeans` only holds the model
    # of the last group fitted.
    group_kmeans = KMeans(n_clusters=optimal_k, random_state=42).fit(food_scaled)

    cluster_centers = group_scaler.inverse_transform(group_kmeans.cluster_centers_)
    cluster_df = pd.DataFrame(cluster_centers, columns=food_indices)
    cluster_df['Cluster'] = range(optimal_k)

    print(cluster_df)

    # Features as rows, clusters as columns
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.heatmap(cluster_df.set_index('Cluster').T, annot=True, cmap='coolwarm', ax=ax)
    plt.title(f'Cluster Centers Food Group {i+1}')
    plt.show()

**Clustering Categorization into Three Categories (Recommended, Safe, and Avoid for Diabetics)**  

Here are the steps for interpreting and analyzing the clustering results more deeply:  

**1. Heatmap Interpretation**  
Based on the heatmap:  
- **Cluster 0**: High values in "Dietary Fiber," "Magnesium," and "Potassium," with low values in "Saturated Fats" and "Sugars." These correspond to healthy foods with high fiber content.  
- **Cluster 1**: Low values in almost all features, except for a slight increase in "Vitamin D." This may represent less nutritious or snack-type foods.  
- **Cluster 2**: Very high values in "Sugars," "Saturated Fats," and other fats. This indicates high-calorie, sugary foods unsuitable for diabetics.  

**2. Cluster Categorization**  
- **Cluster 0 (Recommended)**: Foods with high fiber, low sugar, and low saturated fats. Suitable for everyone, including diabetics.  
- **Cluster 1 (Safe for Diabetics)**: Foods with moderate or neutral nutritional values, without high sugar or fat content. Safe for diabetics but not particularly nutrient-dense.  
- **Cluster 2 (Avoid for Diabetics)**: Foods high in sugar and saturated fats, and low in fiber. These are unsuitable for diabetics as they can elevate blood sugar levels.

In [None]:
# Print the first ten food names of every cluster, per food group
for clust in range(optimal_k):
    print(f"List of cluster {clust}")
    for group_no, food_df in enumerate(foods_df, start=1):
        in_cluster = food_df['Cluster'] == clust
        print(f"\nFood Group {group_no}")
        print(food_df.loc[in_cluster, 'food'].head(10))
        print("\n\n")

### Save The Clustered Data

In [None]:
# Build one DataFrame per cluster label, stacking the matching rows of every
# food group (original row indices are kept). pd.concat always returns a new
# frame, so the results are safe to modify afterwards.
# NOTE(review): each food group is clustered by its own K-Means fit, so label k
# in one group is not guaranteed to describe the same kind of food as label k
# in another group. Confirm that merging by label is intended when foods_df
# holds several groups.
clusters_df = [
    pd.concat([food_df[food_df["Cluster"] == label] for food_df in foods_df])
    for label in range(optimal_k)
]


In [None]:
# Report how many foods ended up in each cluster (numbered from 1 in the output)
for cluster_no, cluster_df in enumerate(clusters_df, start=1):
    print(f"Cluster {cluster_no} length: {len(cluster_df)}")

In [None]:
# Remove the clustering/helper columns before export
helper_columns = [
    'Cluster', 'PCA1', 'PCA2', 'Total_Nutrition', 'Nutrition_Density',
    'Protein_Ratio', 'Carbohydrate_Ratio', 'Fat_Ratio',
    'Nutrient_to_Calorie', 'Fiber_to_Sugar_Ratio', 'Saturated_Fat_Ratio', 'Micronutrient_Sum',
]
for i, cluster_df in enumerate(clusters_df):
    # errors="ignore": the Fat-based ratios only exist when the dataset has a
    # 'Fat' column, and this keeps the cell re-runnable. Reassigning instead of
    # dropping in place also avoids modifying a slice of foods_df.
    clusters_df[i] = cluster_df.drop(columns=helper_columns, errors="ignore")


In [None]:
# Root directory for all exported cluster data
cluster_dir = "clustered_food"
os.makedirs(cluster_dir, exist_ok=True)

# Find the first unused "train<N>" folder name, counting up from 0
num_train = 0
train_folder = f"{cluster_dir}/train{num_train}"
while os.path.exists(train_folder):
    num_train += 1
    train_folder = f"{cluster_dir}/train{num_train}"
os.makedirs(train_folder)

# Write one CSV per cluster: cluster_0.csv, cluster_1.csv, ...
for cluster_no, cluster_df in enumerate(clusters_df):
    cluster_df.to_csv(f"{train_folder}/cluster_{cluster_no}.csv", index=False)

print(f"Cluster data saved in '{cluster_dir}'")
print(f"Unique train folder created: '{train_folder}'")

In [None]:
# Optional: also export the full (un-split) food group tables
raw_dir_base = f"{cluster_dir}/raw"

# Find the first unused "raw<N>" folder name, counting up from 0
num_raw = 0
raw_dir = f"{raw_dir_base}{num_raw}"
while os.path.exists(raw_dir):
    num_raw += 1
    raw_dir = f"{raw_dir_base}{num_raw}"
os.makedirs(raw_dir, exist_ok=True)

# Write one CSV per food group: food_group_1.csv, food_group_2.csv, ...
for group_no, food_df in enumerate(foods_df, start=1):
    food_df.to_csv(f"{raw_dir}/food_group_{group_no}.csv", index=False)

print(f"Food group data saved in '{raw_dir}'")

In [None]:
# Optional
# Shell command: recursively archive the clustered_food directory into clustered_food.zip
!zip -r clustered_food.zip clustered_food

## Finalization

### Extract Tags of Food Cluster

In [None]:
cluster_root = 'clustered_food'

# Use the most recent "train<N>" export rather than a fixed folder: the save
# step numbers its folders from train0, so a hard-coded "train1" does not
# exist after a first run.
train_runs = [
    name for name in os.listdir(cluster_root)
    if name.startswith("train") and name[len("train"):].isdigit()
]
if not train_runs:
    raise FileNotFoundError(f"No train<N> folder found in '{cluster_root}'")
latest_run = max(train_runs, key=lambda name: int(name[len("train"):]))
directory = os.path.join(cluster_root, latest_run)

# Read every cluster CSV; sorted so the row order of combined_df is reproducible
dataframes = [
    pd.read_csv(os.path.join(directory, filename))
    for filename in sorted(os.listdir(directory))
    if filename.startswith("cluster") and filename.endswith(".csv")
]
if not dataframes:
    # pd.concat() on an empty list raises a much less helpful ValueError
    raise FileNotFoundError(f"No cluster*.csv files found in '{directory}'")

# Concatenate all dataframes in the list
combined_df = pd.concat(dataframes, ignore_index=True)

In [None]:
# Join every food name into one space-separated string
combined_food = " ".join(combined_df["food"].tolist())

In [None]:
# Filler words that should not become tags
stop_words = {"with", "in", "on", "side", "eyed", "dish"}

# Split the food names into words and keep each distinct word once.
# A chain of `!=` tests joined with `or` is always True, so it filters nothing;
# a membership test actually removes the stop words. Empty strings (produced by
# consecutive spaces) are dropped as well.
food_tags = {
    word
    for word in combined_food.split(" ")
    if word and word not in stop_words
}

In [None]:
# Display the set of unique tags
food_tags

In [None]:
# Number of unique tags
len(food_tags)

In [None]:
class SetEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``set`` objects as JSON arrays."""

    def default(self, obj):
        """Return a list for sets; defer to the base encoder for anything else.

        The base implementation raises ``TypeError`` for unsupported types.
        """
        if not isinstance(obj, set):
            return super().default(obj)
        return list(obj)

# Serialise the tag set to a JSON string (SetEncoder converts the set to a list) and print it
data_tags = json.dumps(food_tags, cls=SetEncoder)
print(data_tags)

In [None]:
# Write the tags as a real JSON array. Dumping `data_tags` (already a JSON
# string) would encode it a second time, so the file would contain one quoted
# string instead of an array. Sorting makes the file content reproducible,
# since set iteration order is not.
with open('food_tags.json', 'w') as f:
    json.dump(sorted(food_tags), f)

### Get Top 250

In [None]:
# Build a vocabulary limited to 250 tokens (max_features) from the food names
vector_tags = CountVectorizer(max_features=250)
vector_tags.fit_transform(combined_df["food"])

# Tokens of the fitted vocabulary
top_250_tags = vector_tags.get_feature_names_out()
top_250_tags

In [None]:
# Save the selected tags as a JSON array (the array is converted to a plain list first)
with open('top_250_tags.json', 'w') as tags_file:
    json.dump(top_250_tags.tolist(), tags_file)