# Diet Recommendation System

## I. Import Libraries

In [234]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

## II. Get and clean dataset
### A. Store dataset

In [235]:
# Load the recipes table from CSV (path is relative to the notebook's working directory)
df_recipes = pd.read_csv('../Data/foodcom/recipes.csv')

In [236]:
# Preview the first rows to check that the file was parsed as expected
df_recipes.head()

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,39,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_...",...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,"c(""Soak saffron in warm milk for 5 minutes and..."
2,40,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05T19:52:00Z,This is from one of my first Good House Keepi...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"c(""Into a 1 quart Jar with tight fitting lid, ..."
3,41,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"c(""Drain the tofu, carefully squeezing out exc..."
4,42,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19T06:19:00Z,Make and share this Cabbage Soup recipe from F...,"""https://img.sndimg.com/food/image/upload/w_55...",...,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"c(""Mix everything together and bring to a boil..."


### B. Examine columns for df_recipes

In [237]:
col_features = df_recipes.columns
len_features = len(col_features)

# Header line: column count followed by the three left-aligned titles
titles = (("Columns Names:", 30), ("Unique Values:", 20), ("Missing Values/Null:", 20))
header = "".join(title.ljust(width) for title, width in titles)
print(f"df_recipes has {len_features} columns:\n\n" + header)

# One line per column: its name, number of distinct values, number of nulls
for name in col_features:
    unique_count = df_recipes[name].nunique()
    missing_count = df_recipes[name].isnull().sum()
    print(f"{name:<30}{unique_count!s:<20}{missing_count!s:<20}")

df_recipes has 28 columns:

Columns Names:                Unique Values:      Missing Values/Null:
RecipeId                      522517              0                   
Name                          438188              0                   
AuthorId                      57178               0                   
AuthorName                    56793               0                   
CookTime                      490                 82545               
PrepTime                      318                 0                   
TotalTime                     1240                0                   
DatePublished                 245540              0                   
Description                   492838              5                   
Images                        165889              1                   
RecipeCategory                311                 751                 
Keywords                      216569              17237               
RecipeIngredientQuantities    459571             

## III. Prepare the Data
### A. Keep only the columns relevant to diet recommendation

In [238]:
# Identification, timing, ingredient, nutrition and instruction fields.
# The order is significant: later cells address the nine nutrition columns
# by position (6 through 14).
columns = [
    'RecipeId', 'Name',
    'CookTime', 'PrepTime', 'TotalTime',
    'RecipeIngredientParts',
    'Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent',
    'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent',
    'ProteinContent',
    'RecipeInstructions',
]

# Work on an independent copy so df_recipes stays untouched
dataset = df_recipes[columns].copy()

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   RecipeId               522517 non-null  int64  
 1   Name                   522517 non-null  object 
 2   CookTime               439972 non-null  object 
 3   PrepTime               522517 non-null  object 
 4   TotalTime              522517 non-null  object 
 5   RecipeIngredientParts  522517 non-null  object 
 6   Calories               522517 non-null  float64
 7   FatContent             522517 non-null  float64
 8   SaturatedFatContent    522517 non-null  float64
 9   CholesterolContent     522517 non-null  float64
 10  SodiumContent          522517 non-null  float64
 11  CarbohydrateContent    522517 non-null  float64
 12  FiberContent           522517 non-null  float64
 13  SugarContent           522517 non-null  float64
 14  ProteinContent         522517 non-nu

In [239]:
# (rows, columns) after column selection; no rows have been removed yet
dataset.shape

(522517, 16)

### B. Keep only rows whose nutrition values are below the allowed maximums

In [240]:
# Upper limit for each nutrition column; recipes at or above any limit are
# dropped by the filtering cell.
# NOTE(review): units are presumably those of the dataset columns — confirm.
max_Calories = 2000
max_daily_fat = 100
max_daily_Saturatedfat = 13
max_daily_Cholesterol = 300
max_daily_Sodium = 2300
max_daily_Carbohydrate = 325
max_daily_Fiber = 40
max_daily_Sugar = 40
max_daily_Protein = 200

# Order matters: the limits are paired positionally with dataset columns
# 6..14 (Calories ... ProteinContent).
max_list = [
    max_Calories,
    max_daily_fat,
    max_daily_Saturatedfat,
    max_daily_Cholesterol,
    max_daily_Sodium,
    max_daily_Carbohydrate,
    max_daily_Fiber,
    max_daily_Sugar,
    max_daily_Protein,
]

In [241]:
# Nutrition columns (positions 6..14), in the same order as max_list
nutrition_values = dataset.loc[:, 'Calories':'ProteinContent']
# True where a value is strictly below the limit of its column
below_limit = nutrition_values.lt(max_list, axis='columns')
# A recipe is kept only when every nutrient is below its limit
extracted_data = dataset[below_limit.all(axis=1)]

In [242]:
# (rows, columns) remaining after the limit filter
extracted_data.shape

(375703, 16)

### C. Create and Compress new dataset

In [243]:
import gzip
from pathlib import Path

# Persist the filtered recipes as a gzip-compressed CSV.
compressed_path = Path('../Data/compressed_file/dataset.gz')
# gzip.open does not create missing directories; make sure the output folder
# exists so the cell also runs on a fresh checkout.
compressed_path.parent.mkdir(parents=True, exist_ok=True)
with gzip.open(compressed_path, 'wb') as f:
    # index=False: the row labels are not written to the file
    extracted_data.to_csv(f, index=False)
    

## IV. Diet Recommendation System
### A. Preprocess the data
Calculate the correlation matrix of the numeric nutrition columns.

In [244]:
# Pairwise correlation of the nine nutrition columns (positions 6..14)
extracted_data.loc[:, 'Calories':'ProteinContent'].corr()

Unnamed: 0,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent
Calories,1.0,0.767356,0.603317,0.478934,0.501082,0.71164,0.458711,0.180895,0.689447
FatContent,0.767356,1.0,0.767357,0.440515,0.381944,0.223549,0.192142,0.042603,0.468088
SaturatedFatContent,0.603317,0.767357,1.0,0.512186,0.319671,0.176623,0.044003,0.090721,0.388618
CholesterolContent,0.478934,0.440515,0.512186,1.0,0.335843,0.066104,-0.047346,-0.036112,0.675302
SodiumContent,0.501082,0.381944,0.319671,0.335843,1.0,0.294636,0.260479,-0.055518,0.500457
CarbohydrateContent,0.71164,0.223549,0.176623,0.066104,0.294636,1.0,0.580535,0.39012,0.255447
FiberContent,0.458711,0.192142,0.044003,-0.047346,0.260479,0.580535,1.0,0.068758,0.273488
SugarContent,0.180895,0.042603,0.090721,-0.036112,-0.055518,0.39012,0.068758,1.0,-0.120441
ProteinContent,0.689447,0.468088,0.388618,0.675302,0.500457,0.255447,0.273488,-0.120441,1.0


Standardize numerical columns

In [245]:
# Nine nutrition columns (positions 6..14) as a plain array; fitting on an
# array rather than a DataFrame lets the scaler later accept array queries.
nutrition_matrix = extracted_data.loc[:, 'Calories':'ProteinContent'].to_numpy()

# Learn each column's mean and standard deviation, then rescale the data
scaler = StandardScaler()
prep_data = scaler.fit_transform(nutrition_matrix)
prep_data

array([[-0.55093359, -0.91281917, -0.77924852, ...,  0.15672078,
         2.35502102, -0.68338127],
       [ 1.47428542,  1.13139595, -0.0647135 , ...,  3.91055068,
         2.56324444,  1.25158691],
       [-0.92414618, -1.11248669, -1.12222533, ...,  0.4855234 ,
         0.98513013, -0.60183088],
       ...,
       [ 0.49162165,  0.73206091,  1.85024037, ..., -0.61048534,
         1.76322815, -0.56476253],
       [ 0.25704672,  0.03797856,  1.02137974, ..., -0.61048534,
         1.54404561, -0.63148557],
       [-1.40937801, -1.09347074, -1.12222533, ..., -0.82968708,
        -0.94367625, -0.74269064]])

### B. Build Predictor Model
Create Model

In [246]:
# Nearest-neighbour index over the standardized nutrition rows, searched
# exhaustively (brute force) and ranked by cosine distance.
model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(prep_data)

Build pipeline

In [247]:
# Keyword arguments forwarded to model.kneighbors on every call
parameters = dict(
    n_neighbors=5,          # number of meals returned per query
    return_distance=False,  # only the neighbour positions are needed
)

# Wrap kneighbors as a transformer so it can be used as a pipeline step
transformer = FunctionTransformer(func=model.kneighbors, kw_args=parameters)

# New data goes through two steps in order:
#   1. 'std_scaler' - standardization with the scaler fitted above
#   2. 'NN'         - nearest-neighbour lookup
# The pipeline itself is never fitted; it reuses the already-fitted objects.
pipeline = Pipeline(steps=[('std_scaler', scaler), ('NN', transformer)])

### C. Prediction
#### i. Predict 5 Meals

In [248]:
from random import uniform as rnd

# One query in the column order used for training (Calories first, then the
# eight remaining nutrition columns). The eight values below were originally
# drawn with:
#   rnd(10,30), rnd(0,4), rnd(0,30), rnd(0,400),
#   rnd(40,75), rnd(4,10), rnd(0,10), rnd(30,100)
# and are frozen here so the example is reproducible.
query_point = [
    1406,  # meal_calories
    15.638503552043373,
    3.4176276769686678,
    0.031957443995366264,
    138.63306420214982,
    47.99080939985001,
    8.735156296291148,
    5.281426920090141,
    86.46527919210487,
]

# Wrap the query in an outer list so the array is a single row (1 x n_features)
new_data = np.array([query_point])
new_data


array([[1.40600000e+03, 1.56385036e+01, 3.41762768e+00, 3.19574440e-02,
        1.38633064e+02, 4.79908094e+01, 8.73515630e+00, 5.28142692e+00,
        8.64652792e+01]])

In [249]:
# passes new_data through each step of the pipeline sequentially
row_indexes = pipeline.transform(new_data)[0]
# this returns the row index of the predicted meals
row_indexes

array([326925, 270287,  39947, 311027, 265561], dtype=int64)

In [250]:
# Show the name of each recommended meal, one per line
for meal_name in extracted_data.iloc[row_indexes]['Name']:
    print(meal_name)

Pinot Noir Beef Stew
Chicken &amp; Vegetable Pot Pie
Grilled Marsala Chicken & Parmesan Salad
Pan Seared Fish With Mushrooms and Scallions
Italian Chicken Zucchini &amp; Tomatoes


In [251]:
# Final Prediction in dataframe
final_preds = extracted_data.iloc[pipeline.transform(new_data)[0]]
final_preds

Unnamed: 0,RecipeId,Name,CookTime,PrepTime,TotalTime,RecipeIngredientParts,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeInstructions
454947,471699,Pinot Noir Beef Stew,PT2H,PT20M,PT2H20M,"c(""stewing beef"", ""parsnip"", ""onion"", ""garlic ...",519.6,15.1,4.1,72.6,284.8,32.1,4.7,7.1,36.1,"""Heat olive oil in a 3 quart dutch oven over m..."
377990,391612,Chicken &amp; Vegetable Pot Pie,PT35M,PT35M,PT1H10M,"c(""boneless skinless chicken breast"", ""olive o...",387.7,14.3,3.6,48.0,324.0,35.3,4.9,10.4,25.7,"c(""Heat oven to 400°F Cook the chicken in a po..."
58359,62678,Grilled Marsala Chicken & Parmesan Salad,PT20M,PT5M,PT25M,"c(""boneless skinless chicken breast halves"", ""...",1027.0,12.7,2.9,75.0,396.2,33.8,4.3,9.8,33.2,"c(""Place chicken in a bowl with a lid, add mar..."
433028,449063,Pan Seared Fish With Mushrooms and Scallions,PT20M,PT15M,PT35M,"c(""cornstarch"", ""mushrooms"", ""scallions"", ""gar...",909.5,28.5,5.6,93.5,535.9,63.4,5.9,10.1,54.5,"c(""Season fish fillet with salt and pepper. C..."
371711,385183,Italian Chicken Zucchini &amp; Tomatoes,PT20M,PT10M,PT30M,"c(""chicken breasts"", ""angel hair pasta"", ""oliv...",387.8,16.0,4.1,51.9,374.0,33.0,3.3,6.2,23.1,"c(""Slice chicken in 1/2\"" x 2\"" inch slices an..."


#### ii. Getting the Recipes and Instructions of the final meal predictions

In [258]:
# Turn final_preds into a list with one dictionary per recipe
# (column name -> value), i.e. the "records" orientation
final_recipes = final_preds.to_dict(orient="records")

# Inspect the first recipe to see the raw field formats
final_recipes[0]

{'RecipeId': 471699,
 'Name': 'Pinot Noir Beef Stew',
 'CookTime': 'PT2H',
 'PrepTime': 'PT20M',
 'TotalTime': 'PT2H20M',
 'RecipeIngredientParts': 'c("stewing beef", "parsnip", "onion", "garlic cloves", "carrots", "olive oil", "bay leaf", "cornstarch", "water")',
 'Calories': 519.6,
 'FatContent': 15.1,
 'SaturatedFatContent': 4.1,
 'CholesterolContent': 72.6,
 'SodiumContent': 284.8,
 'CarbohydrateContent': 32.1,
 'FiberContent': 4.7,
 'SugarContent': 7.1,
 'ProteinContent': 36.1,
 'RecipeInstructions': '"Heat olive oil in a 3 quart dutch oven over medium high heat.  Dry stew beef and add to Dutch oven.  Brown meat thoroughly; remove from pan and add onions and garlic.  Sauté until translucent.    Add about one half cup of the wine.  Scrape the brown bits from the bottom of the pan.  Add remaining wine, beef, carrots and parsnip cubes to the pan.  Lower heat and simmer for about 10 minutes.  Add broth, bay leaf and Herbs de Provence, stir and cover pan.  Let simmer for about 2 hours.

As we can see, *RecipeIngredientParts* and *RecipeInstructions* are stored as
R-style `c("...", "...")` strings and need to be converted to lists of strings.

In [259]:
import re

# One double-quoted item of an R-style character vector. The `\\.` branch
# lets an item contain backslash-escaped characters (e.g. 1/2\" x 2\") without
# the escaped quote being mistaken for the end of the item.
_R_QUOTED_ITEM = re.compile(r'"((?:[^"\\]|\\.)*)"', re.DOTALL)


def parse_r_string_vector(raw):
    """Split an R-style ``c("a", "b")`` (or single ``"a"``) string into a list.

    Parameters
    ----------
    raw : str or list
        The raw field value. A list is returned unchanged so that re-running
        the cell on already-converted recipes is harmless.

    Returns
    -------
    list of str
        The quoted items in order, with escaped quotes (``\\"``) restored
        to plain ``"``.
    """
    if isinstance(raw, list):
        return raw
    return [item.replace('\\"', '"') for item in _R_QUOTED_ITEM.findall(raw)]


for recipe in final_recipes:
    for field in ('RecipeIngredientParts', 'RecipeInstructions'):
        recipe[field] = parse_r_string_vector(recipe[field])

This is the final list of recipe dictionaries:

In [260]:
# Display all recommended recipes with both fields converted to lists
final_recipes

[{'RecipeId': 471699,
  'Name': 'Pinot Noir Beef Stew',
  'CookTime': 'PT2H',
  'PrepTime': 'PT20M',
  'TotalTime': 'PT2H20M',
  'RecipeIngredientParts': ['stewing beef',
   'parsnip',
   'onion',
   'garlic cloves',
   'carrots',
   'olive oil',
   'bay leaf',
   'cornstarch',
   'water'],
  'Calories': 519.6,
  'FatContent': 15.1,
  'SaturatedFatContent': 4.1,
  'CholesterolContent': 72.6,
  'SodiumContent': 284.8,
  'CarbohydrateContent': 32.1,
  'FiberContent': 4.7,
  'SugarContent': 7.1,
  'ProteinContent': 36.1,
  'RecipeInstructions': ['Heat olive oil in a 3 quart dutch oven over medium high heat.  Dry stew beef and add to Dutch oven.  Brown meat thoroughly; remove from pan and add onions and garlic.  Sauté until translucent.    Add about one half cup of the wine.  Scrape the brown bits from the bottom of the pan.  Add remaining wine, beef, carrots and parsnip cubes to the pan.  Lower heat and simmer for about 10 minutes.  Add broth, bay leaf and Herbs de Provence, stir and cove