In [1]:
# ----- Libraries ----- #
import pandas as pd
import re
import sys
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.tag import UnigramTagger
from nltk.corpus import brown
import webcolors
from IPython.display import display

import matplotlib.pyplot as plt
import math

from sklearn.neighbors import NearestNeighbors

sys.path.append("..")
from ADA_JEX2017.Project.Functions.str_functions import *
from ADA_JEX2017.Project.Functions.pre_process import *






In [3]:
# Relative path of the project's Functions folder (not used by the cells shown here)
path = '../ADA_JEX2017/Project/Functions/'

# ----- Load the recipe dataset: our ';'-separated CSV export of
# ----- 'recipeInfo_WestWhiteHorvitz_WWW2013' ----- #
data_file = './recipeInfo/recipeInfo_WestWhiteHorvitz_WWW2013_v2.csv'
raw_data = pd.read_csv(data_file, sep=';')

In [4]:
# ----- Load the previously created lists of units, techniques and ingredients ----- #
def _read_lines(file_name):
    '''Return the lines of a text file as a list, without their trailing newline.'''
    with open(file_name, 'r') as f:
        return [line.rstrip('\n') for line in f]


units_list = _read_lines('units_list.txt')
techniques_list = _read_lines('technique_list.txt')
ingredient_list = _read_lines('ingredient_list.txt')

# ----- Initialize the lemmatizer and normalise the ingredient names ----- #
# The lemmatizer reduces each word to its base form so that the data is more homogeneous.
# WordNet lookups are case-sensitive, so each name is lower-cased BEFORE being
# lemmatized; otherwise a capitalised plural would be left unlemmatized and could
# never match the lower-cased, lemmatized tokens of the recipes.
lemmatizer = WordNetLemmatizer()
ingredient_list = [lemmatizer.lemmatize(token.lower()) for token in ingredient_list]

In [5]:
# ----- Keep only the columns we need and discard every row containing a NaN ----- #
kept_columns = ['title', 'ingredients_list', 'ingredients_bag-of-words']
ingr_dataframe = (
    raw_data[kept_columns]
    .copy()
    .dropna()
    .reset_index(drop=True)
)
display(ingr_dataframe.head())

# Lower-case the ingredients list string so that later matching ignores case
ingr_dataframe['ingredients_list'] = ingr_dataframe['ingredients_list'].str.lower()

# Draft subset that can be used to try out new functions quickly:
#ingr_data_reduced=ingr_dataframe.head(100)

Unnamed: 0,title,ingredients_list,ingredients_bag-of-words
0,Easy Light Chocolate Milkshake Recipe,"put one half cup of milk, 4 tablespoons of cho...","! , . 4 ? and are blend chocolate cup enjoy ex..."
1,Lamb Stew Recipe : : Recipes : Food Network,5 yellow onions|2 turnips|5 carrots|1 stalk fe...,", 1 1\/2 1\/4 1\/8 1Â 1\/2 2 3 5 and anise app..."
2,Chocolate Bread Pudding Recipe : Paula Deen : ...,"1 (1-pound) loaf French or Italian bread, cube...",", -lrb- -rrb- 1 1-pound 1\/2 1\/4 1Â 1\/2 2 3 ..."
3,Snowball Cookies II Recipe,1/2 cup powdered sugar|1/3 cup butter or marga...,1 1\/2 1\/3 2 almond baking bisquick butter ch...
4,Old Fashioned Butterscotch Pie Recipe #31698,1 1/2 cup brown sugar|1 cup water|3 eggs|4 tab...,1 1\/2 1\/4 1Â 1\/2 3 4 brown butter cream cup...


In [6]:
# ----- Pre-process the text of the ingredient list ----- # (see pre_process.py)
# For some ingredients the quantity is given twice, the second time as a volume or
# a mass within parentheses; the function below is applied to keep only the wanted quantity.
# NOTE(review): presumably this call also adds the 'Recipe_preporcess' column that the
# following cells read - confirm in pre_process.py.
fun_add_preprocess(ingr_dataframe,units_list)

In [7]:
def fun_extract_ingredients(one_receipe,ingredients_list,techniques_list,units_list,to_gram=True):
    '''Extract the ingredients of one recipe with their quantity and cooking techniques.

    The recipe string is split into elements on '|' when it contains one, otherwise
    on ', '. Each element is tokenised, lemmatized and matched against the three
    vocabularies; an element in which no known ingredient is found is rejected.

    Parameters
    ----------
    one_receipe : str
        Ingredient section of a single recipe.
    ingredients_list : iterable of str
        Known ingredient names, made of one or two words.
    techniques_list : iterable of str
        Known cooking technique words.
    units_list : iterable of str
        Known unit words.
    to_gram : bool, default True
        When True, the quantity string is passed through ``fun_unit_corrector``
        before being stored.

    Returns
    -------
    dic_ingre : dict
        Ingredient name -> quantity (the space-joined number/unit tokens, or the
        result of ``fun_unit_corrector`` when ``to_gram`` is True). If an ingredient
        appears in several elements, the last one wins.
    dic_tec : dict
        Ingredient name -> space-joined techniques found in the same element.
    wasted_ingr : list of str
        Rejected elements, as their space-joined tokens.
    wasted_number : int
        Number of rejected elements.
    '''
    lemmatizer = WordNetLemmatizer()

    # Every token is looked up in the three vocabularies: use sets so that each
    # membership test is O(1) instead of a scan of the whole list.
    known_ingredients = set(ingredients_list)
    known_techniques = set(techniques_list)
    known_units = set(units_list)

    separator = '|' if '|' in one_receipe else ', '

    dic_ingre = {}
    dic_tec = {}
    wasted_ingr = []
    for elem in one_receipe.split(separator):
        # skip elements carrying HTML character references ('&#...'), seen in some recipes
        if '&#' in elem:
            continue
        # split into words, treating hyphens as word separators
        elem_list = elem.replace('-', ' ').split(' ')
        # keep only alphanumerics (plus '/', '.' and ' ') in each word
        elem_list = [re.sub('[^0-9a-zA-Z/. ]+', '', x) for x in elem_list]
        # keep only the base form of each word
        lemmas = [lemmatizer.lemmatize(token) for token in elem_list]
        # separate digits stuck to letters ('2cups' -> '2', 'cups'); empty words vanish here
        check = [piece
                 for lemma in lemmas
                 for piece in re.findall(r'[A-Za-z]+|[\d./]+', lemma)
                 if piece]

        techniques = []
        units = []
        one_ingr = None
        no_unit = True
        no_number = True
        for word in check:
            if word in known_techniques:            # a cooking technique
                techniques.append(word)
            elif word in known_ingredients:         # a one-word ingredient (last match wins)
                one_ingr = word
            elif no_number and re.search(r'\d', word):   # first token containing a digit
                units.append(word)
                no_number = False
            elif no_unit and word in known_units:   # first known unit word
                units.append(word)
                no_unit = False

        # a two-word ingredient takes precedence over a one-word match
        for biword in nltk.bigrams(check):
            candidate = ' '.join(biword)
            if candidate in known_ingredients:
                one_ingr = candidate

        if one_ingr is None:
            # no known ingredient in this element: record it as rejected
            wasted_ingr.append(' '.join(check))
            continue

        # complete a missing number and/or unit with the special quantity '1 unit'
        if not units:
            units.extend(['1', 'unit'])
        elif no_unit:
            units.append('unit')
        elif no_number:
            # NOTE(review): the default number is appended AFTER the unit ('cup 1');
            # confirm that fun_unit_corrector accepts this order.
            units.append('1')

        units = ' '.join(units)
        if to_gram:
            units = fun_unit_corrector(units)

        dic_ingre[one_ingr] = units
        dic_tec[one_ingr] = ' '.join(techniques)

    # one entry is recorded per rejected element, so the count is the list length
    wasted_number = len(wasted_ingr)
    return dic_ingre,dic_tec,wasted_ingr,wasted_number

In [None]:
# !!!!! ----- Test cell: inspect the extraction result for one specific recipe ----- !!!!!! #
receipe = ingr_dataframe.loc[35013, 'Recipe_preporcess']
print(receipe)
dic_ingr, dictec, wasted, wasted_numb = fun_extract_ingredients(
    receipe, ingredient_list, techniques_list, units_list, to_gram=False)
dic_ingr

In [8]:
# ----- Run the extraction on the whole data frame: one {ingredient: quantity} dict per recipe ----- #

ingr_data_reduced = ingr_dataframe.head(100)
not_used_ingr = []
wastes = 0
all_dic = []
for recipe in ingr_dataframe['Recipe_preporcess']:
    # to_gram=True also sends each quantity through the unit correction
    # (meant to express every quantity in the same unit, grams)
    dic_ingre, dictec, wasted_ingr, wasted_number = fun_extract_ingredients(
        recipe, ingredient_list, techniques_list, units_list, to_gram=True)
    all_dic.append(dic_ingre)

    # The rejected elements and their count were collected here and plotted, in order
    # to add by hand the important ingredients missing from our ingredient list:
    #not_used_ingr.append(wasted_ingr)
    #wastes=wastes+wasted_number

# ----- One row per recipe, one column per ingredient, quantities as values ----- #
ingredients_frame = pd.DataFrame(data=all_dic)
display(ingredients_frame.head(5))

# ----- Number of distinct ingredients, then number of recipes using each of them ----- #
print('There are : ', len(ingredients_frame.columns), 'ingredients')
ingred_used = {
    column: sum(ingredients_frame[column].value_counts())
    for column in list(ingredients_frame)
}

Unnamed: 0,acidulated water,acorn squash,ale,aleppo pepper,allspice,almond,almond extract,almond milk,amaranth,amaretti,...,worcestershire sauce,wrap,yam,yeast,yellow lentil,yoghurt,yogurt,yuzu,zest,zucchini
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,29.625,
2,,,,,,,10.0,,,,...,,,,,,,,,,
3,,,,,,,5.0,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


There are :  804 ingredients


In [None]:
# Quantity recorded for 'carrot' in the recipe at index 1
ingredients_frame.loc[1, 'carrot']

In [None]:
# Position of the first 'u' in each 'carrot' quantity written as text (-1 when there is none)
ingredients_frame['carrot'].astype(str).str.find('u')

## Converting the quantities given in units to grams for each ingredient

In [None]:
# Load the saved array 'match_ingred.npy'.
# NOTE(review): presumably the ingredient names that were matched during web scraping - confirm.
match_ingred=np.load('match_ingred.npy')
# A later cell compares these matches with the ingredients for which more than 0.2 of the
# quantities are expressed in 'u' units, i.e. the ingredients that pose a real problem.
len(match_ingred)

In [None]:
# For every ingredient column:
#   * in a copy of the frame, replace the quantities containing a 'u' by the mean of the
#     numeric quantities of that column (1 when the column has no numeric quantity);
#   * record the share of its recorded (non-NaN) quantities that contain a 'u'.
ratio_u_ingred = {}
test_ingredients_frame = ingredients_frame.copy()
for ingred in list(ingredients_frame):
    column = ingredients_frame[ingred]

    # non-numeric quantities become NaN and are ignored by mean()
    mean = pd.to_numeric(column, errors='coerce').mean()
    if math.isnan(mean):
        mean = 1

    # NaN is rendered as 'nan', which contains no 'u', so missing values are never flagged
    has_u = column.astype(str).str.contains('u')

    # single .loc assignment: chained indexing (frame[col][mask] = ...) is not
    # guaranteed to write into the frame
    test_ingredients_frame.loc[has_u, ingred] = mean

    # ratio of 'u' quantities among the recorded ones; 0.0 for a column without any
    # recorded quantity instead of a division by zero
    recorded = column.notna()
    ratio_u_ingred[ingred] = has_u[recorded].mean() if recorded.any() else 0.0

In [None]:
# Ingredients, in alphabetical order, for which more than 0.2 of the quantities contain a 'u'
ratio_u = sorted(name for name, share in ratio_u_ingred.items() if share > 0.2)
ingred_tocheck = ratio_u
# ... and those among them that are absent from match_ingred
remaining_untreated = [name for name in ingred_tocheck if name not in match_ingred]
len(ingred_tocheck)

In [None]:
# Load the web-scraped dataframe (tab-separated), drop its entirely empty columns and
# index it by its first, unnamed column.
# `sep` is passed by keyword: recent pandas versions accept only the path positionally.
unit_scraped = pd.read_csv('bbc_scraped.csv', sep='\t').dropna(how='all', axis=1)
unit_scraped = unit_scraped.set_index('Unnamed: 0')
#unit_scraped.count(axis=0)

In [None]:
# Load the list we created (';'-separated), indexed by its first, unnamed column
unit_created = pd.read_csv('BBC_unit.csv', sep=';').set_index('Unnamed: 0')

In [None]:
# Build ingred_unit: for each row of unit_scraped, the mean of the values parsed from
# the columns whose label contains no known unit word.
# NOTE(review): presumably rows are ingredients, column labels are quantity
# descriptions and the values are weights in grams - confirm against the scraper.
DEFAULT_UNIT_GRAMS = 200   # value used when nothing usable is found for a row
# labels containing one of these words are skipped; built once, not once per cell
skipped_label_words = set(units_list) | {'slice', 'nlea'}

ingred_unit = {}
for index, row in unit_scraped.iterrows():
    units = []
    for quant, raw_value in row.dropna().items():
        label_words = [q for q in quant.replace(',', '').lower().split(' ') if len(q) > 0]
        if any(q in skipped_label_words for q in label_words):
            continue
        # keep the digits only.
        # NOTE(review): this also removes a decimal point ('12.5' -> '125') - confirm
        # that the scraped values are whole numbers.
        digits = re.sub('[^0-9]', '', raw_value)
        if digits:   # a value without any digit is ignored instead of crashing float()
            units.append(float(digits))
    ingred_unit[index] = np.mean(units) if units else np.float64(DEFAULT_UNIT_GRAMS)
    #print(list(row.dropna().index))

In [None]:
# Display the mapping built in the previous cell (row label of unit_scraped -> mean value)
ingred_unit

In [None]:
# Preview the first rows of the list loaded from 'BBC_unit.csv'
unit_created.head()

In [None]:
# Add the '1 Unit' value of every row of unit_created to ingred_unit. An entry that
# already exists is overwritten, and 'weirdo' is printed to signal the collision.
for index, row in unit_created.iterrows():
    if index in ingred_unit.keys():
        print('weirdo')
    ingred_unit.update({index: row['1 Unit']})
    

In [None]:
# Number of entries in ingred_unit once the two sources are merged
len(ingred_unit)

In [None]:
# Replace every quantity still containing a 'u' by <its number> * <value of one unit>,
# where the value of one unit comes from ingred_unit, or FALLBACK_UNIT_GRAMS when the
# ingredient is not in ingred_unit.
# The update is done with a single .loc assignment per column: the chained form
# frame.iloc[i][col] = value assigns to a temporary row and is not guaranteed to
# modify the frame. Working column by column is also much faster than cell by cell.
FALLBACK_UNIT_GRAMS = 200
for k, ingr in enumerate(list(ingredients_frame), start=1):
    if k % 10 == 0:
        print(k)   # progress: number of columns handled so far

    as_text = ingredients_frame[ingr].astype(str)
    # a 'u' found after the first character (NaN becomes 'nan', which has none)
    in_units = as_text.str.find('u') > 0
    if not in_units.any():
        continue

    # keep the digits only.
    # NOTE(review): this also removes '.' and '/' ('1.5' -> '15', '1/2' -> '12') -
    # confirm that the quantities left in units are whole numbers.
    counts = as_text[in_units].str.replace('[^0-9]', '', regex=True).astype(float)
    ingredients_frame.loc[in_units, ingr] = counts * ingred_unit.get(ingr, FALLBACK_UNIT_GRAMS)