<a href="https://colab.research.google.com/github/BehzadBarati/Ingredient-Maps/blob/main/FlavorGraph_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Author: Behzad Barati

Abstract:

*   This notebook finds the 5 nearest neighbors of selected ingredients based on their FlavorGraph embeddings.
___
Input:

*   Embedding of ingredients based on FlavorGraph method
*   List of ingredients and their related IDs
*   List of desired ingredients whose neighbors are to be found

Output:

*   5 nearest neighbors of the desired ingredients.
___


# Import Libraries

In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive 
from heapq import nsmallest

# Load data

In [2]:
# Mount Google Drive into the Colab runtime at /content/gdrive.
# The data files loaded in the following cells are addressed via the relative
# path "gdrive/MyDrive/..." (assumes the working directory is /content — TODO confirm).
# NOTE(review): an earlier comment referred to a `recipe_tomato` dataset; no such
# name is used in the cells visible here.

drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
# Load the FlavorGraph ingredient embeddings from a pickle file.
# The result is a dictionary: keys are ingredient node ids (stored as strings),
# values are the embedding vectors.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load this
# file if it comes from a trusted source.

DATA_PATH = "gdrive/MyDrive/Projects/Ingredient-Maps/Phase1"
# The context manager guarantees the file handle is closed after loading,
# even if unpickling raises.
with open(DATA_PATH+'/FlavorGraph-embedding.pickle','rb') as infile:
    embedding = pickle.load(infile)
print('Number of Embeddings : ', len(embedding))
print('Type of dataset : ', type(embedding))
print('5 first keys of dataset : ', list(embedding.keys())[:5])

Number of Embeddings :  8298
Type of dataset :  <class 'dict'>
5 first keys of dataset :  ['2261', '4204', '7593', '1608', '5156']


In [18]:
# Quick visual check: view the embedding dictionary as a Series
# (index = node-id keys, values = embedding vectors).
pd.Series(embedding)

2261    [0.003940486, -0.17388155, 0.2085642, 0.007552...
4204    [-0.2226859, -0.10033325, 0.09181008, 0.086264...
7593    [-0.19326355, 0.3246424, 0.27423382, 0.0514823...
1608    [-0.13715972, -0.120983526, 0.12212634, 0.2044...
5156    [-0.088117406, -0.1916365, -0.039621796, 0.058...
                              ...                        
6624    [0.02057158, 0.45763093, 0.022356123, 0.511349...
5988    [0.19450483, 0.020848732, 0.6074205, -0.520402...
3762    [0.00054995867, 0.19704723, 0.2801421, 0.30073...
5288    [-0.47191662, -0.16478768, 0.4419558, -0.28081...
4926    [0.012703506, 0.3463704, 0.36302552, -0.468503...
Length: 8298, dtype: object

In [4]:
# Load the node table: one row per ingredient with its name, node id,
# node type and hub flag.

DATA_PATH = "gdrive/MyDrive/Projects/Ingredient-Maps/Phase1"
NER_codes = pd.read_csv(DATA_PATH + '/nodes_191120.csv')

print('Number of Ingredients : ', NER_codes.shape[0])
print('Type of dataset : ', type(NER_codes))
print('first 5 ingredients:')
NER_codes.head()

Number of Ingredients :  8298
Type of dataset :  <class 'pandas.core.frame.DataFrame'>
first 5 ingredients:


Unnamed: 0,node_id,name,id,node_type,is_hub
0,0,1%_fat_buttermilk,,ingredient,no_hub
1,1,1%_fat_cottage_cheese,,ingredient,no_hub
2,3,10%_cream,,ingredient,no_hub
3,4,100%_bran,,ingredient,no_hub
4,5,10_inch_flour_tortilla,,ingredient,no_hub


In [5]:
# Load the 15 most frequent tomato-soup ingredients.
# The CSV has no header row: column 0 is the ingredient name, column 1 its count.

DATA_PATH = "gdrive/MyDrive/Projects/Ingredient-Maps/Phase1"
soup_ingredients = pd.read_csv(DATA_PATH + '/output2.csv', header=None)
print('Number of Ingredients : ', soup_ingredients.shape[0])
print('Type of dataset : ', type(soup_ingredients))
print('15 most frequent ingredients:')
soup_ingredients.iloc[:15]

Number of Ingredients :  15
Type of dataset :  <class 'pandas.core.frame.DataFrame'>
15 most frequent ingredients:


Unnamed: 0,0,1
0,tomato,1586
1,salt,1463
2,garlic,1062
3,onion,1009
4,sugar,800
5,olive_oil,663
6,butter,661
7,flour,517
8,chicken_broth,428
9,basil,424


# looking at queried ingredients

In [6]:
# Inspect the queried ingredients in the node table to see their node type
# and whether each one is a hub.

soup_list = soup_ingredients[0].tolist()
NER_codes.loc[NER_codes['name'].isin(soup_list)]

Unnamed: 0,node_id,name,id,node_type,is_hub
277,294,baking_soda,,ingredient,no_hub
308,328,basil,,ingredient,hub
727,781,butter,,ingredient,hub
974,1041,celery,,ingredient,hub
1080,1153,chicken_broth,,ingredient,no_hub
2242,2396,flour,,ingredient,hub
2611,2793,garlic,,ingredient,hub
3926,4204,milk,,ingredient,hub
4171,4472,olive_oil,,ingredient,no_hub
4175,4478,onion,,ingredient,hub


# define some helper functions

In [7]:
# Look up the available node information for a list of ingredient names

def get_NER(lis):
    """Return the rows of NER_codes whose 'name' is one of the names in `lis`.

    Names that are not present in NER_codes are silently ignored; the rows
    keep their original index and order from NER_codes.
    """
    name_matches = NER_codes['name'].isin(lis)
    return NER_codes.loc[name_matches]

In [8]:
# Map an ingredient name to its embedding vector

def get_embed(name):
    """Return the embedding stored for the ingredient called `name`.

    The name is resolved to its node_id through NER_codes (first match wins),
    and the id, converted to a string, is used as the key into `embedding`.
    Raises IndexError when the name is not present in NER_codes.
    """
    matching_ids = NER_codes.loc[NER_codes['name'] == name, 'node_id']
    node_id = matching_ids.values[0]
    return embedding[str(node_id)]

In [9]:
# Map a node id back to the ingredient name

def get_name(id):
    """Return the name of the node whose node_id equals `id`.

    `id` may be an int or a numeric string (it is passed through int()).
    Raises IndexError when no row of NER_codes has that node_id.
    """
    node_id = int(id)
    names = NER_codes.loc[NER_codes['node_id'] == node_id, 'name']
    return names.values[0]

In [10]:
# Return the k nearest neighbors of an ingredient in an embedding dictionary

def neighbor(embd_all, ingre, k):
    '''
    Find the k nearest neighbors of an ingredient by Euclidean distance.

    input:
        embd_all -- dictionary of embeddings, keyed by node id as a string
        ingre    -- name of the queried ingredient (looked up in NER_codes)
        k        -- number of neighbors to return
    output:
        list of the k neighbor ids (keys of embd_all), nearest first;
        the queried ingredient itself is never included.

    Raises IndexError if `ingre` is not a name in NER_codes and KeyError if
    its node id has no entry in embd_all.
    '''

    # Resolve the name to its key and read the vector from the dictionary that
    # was passed in, so the function really works on `embd_all` rather than on
    # whatever the global `embedding` happens to be.
    ingre_id = str(NER_codes[NER_codes['name'] == ingre]['node_id'].values[0])
    point = embd_all[ingre_id]

    # Exclude the query by key instead of assuming it sorts first: another
    # node with an identical vector would otherwise tie at distance 0 and
    # could push the query itself into the returned list.
    distances = {
        key: np.linalg.norm(val - point)
        for key, val in embd_all.items()
        if key != ingre_id
    }

    return nsmallest(k, distances, key=distances.get)

In [11]:
# For every queried ingredient, print its name followed by the names of its
# 5 nearest neighbors in the embedding space.
for ingred in soup_list:
    neighbor_names = [get_name(neigh) for neigh in neighbor(embedding, ingred, 5)]
    print(ingred)
    print(neighbor_names)

tomato
['Methialdol', '2-Dodecanone', '5-Hydroxymethylfurfural', 'gamma-Ionone', '2-sec-Butylthiazole']
salt
['Heptanoic_Acid', 'Thiamine_Hydrochloride', 'thiamine', 'chocolate_spread', 'CID_644104']
garlic
['33368-82-0', '2,5-Dimethylthiophene', 'Diallyl_trisulfide', 'S-allyl-L-cysteine', '2,6-Di-tert-butyl-4-methylphenol']
onion
['227456-27-1', '2-Mercapto-2-Methyl-1-Pentanol', 'UNII-H5E892YJGG', 'Allyl_propyl_sulfide', '(Z)-Hex-4-enal']
sugar
['betaine', 'Thiamine_Hydrochloride', 'Heptanoic_Acid', 'thiamine', 'CID_644104']
olive_oil
['acorn', 'bluefish', 'sockeye_salmon', 'cuttlefish', 'spearmint']
butter
['Glycerides,_palm-oil_mono-_and_di-,_hydrogenated,_3-oxooctanoates', 'Glycerides,_palm-oil_mono-_and_di-,_hydrogenated,_3-oxododecanoates', 'cis-4-Heptenal', 'milk', 'egg']
flour
['Heptanoic_Acid', 'thiamine', 'CID_644104', 'chocolate_spread', 'fudge']
chicken_broth
['summer_savory', 'celery_rib', 'jerusalem_artichoke', 'pheasant', 'chayote']
basil
['oregano', '3-Hexanone', 'mulbe

In [12]:
# Names of the 4 nearest neighbors of 'tomato'
l = [get_name(i) for i in neighbor(embedding, 'tomato', 4)]

In [13]:
# Display the list of 'tomato' neighbor names built in the previous cell
l

['Methialdol', '2-Dodecanone', '5-Hydroxymethylfurfural', 'gamma-Ionone']

In [14]:
# Look up the node information (node id, type, hub flag) for a hand-picked list of ingredient names
get_NER(['trail_mix', 'baking_powder', 'black_currant', 'whey', 'vanilla_extract'])

Unnamed: 0,node_id,name,id,node_type,is_hub
276,292,baking_powder,,ingredient,no_hub
453,483,black_currant,,ingredient,hub
6064,6478,trail_mix,,ingredient,hub
6219,6644,vanilla_extract,,ingredient,no_hub
6387,6821,whey,,ingredient,hub
