In [1]:
import numpy as np
import pandas as pd

In [2]:
ingredients = pd.read_csv('Source1Ingredients.csv')
ingredients2 = pd.read_csv('Source2Ingredients.csv')

In [3]:
from tqdm import tqdm
from stop_words import get_stop_words
stop_words = get_stop_words('en')


def isIngredient(word):
    if len(word) < 2:
        return False
    
    if isNumberInside(word):
        return False
    
    if isPreposition(word):
        return False
    
    if isNoisy(word):
        return False
    
    if isArticle(word):
        return False
    
    if isAction(word):
        return False

    if hasStrange(word):
        return False
    
    
    return True
    
    
def hasStrange(string):
    symbols = ['<', '/', '>', '&', ':', ';', '"']
    for s in symbols:
        if s in string:
            return True
    
    return False
def isAction(string):
    if string[-2:] == 'ed' or string[-2:] == 'ly' or string[-3:] == 'ing':
        return True
    return False
    
def isNumberInside (string):
    for caracter in string:
        if caracter.isnumeric():
            return True
    return False

def isPreposition (string):
    prepositions = ["aboard",'choose', 'almost',"about","above","across","after","against","along","amid","among","anti","around","as","at","before","behind","below","beneath","beside","besides","between","beyond","but","by","concerning","considering","despite","down","during","except","excepting","excluding","following","for","from","in","inside","into","like","minus","near","of","off","on","onto","opposite","outside","over","past","per","plus","regarding","round","save","since","than","through","to","toward","towards","under","underneath","unlike","until","up","upon","versus","via","with","within","without"]
    if string in prepositions:
        return True
    return False

def isNoisy (string):
    noisy = ['whole','baby', 'small','pound','finely','white','freshly','chopped', 'ounce', 'cup', 'ground', 'fresh', 'teaspoon', 'large', 'sliced', 'black', '<strong>for', "cup","tablespoon", "teaspoon", "tablespoon", "ml","if","tbsp","tbsb","tb","tbp","and","tsp","level","extra","cut","or","you","leave","plain","selection","medium","clear","few","price","yellow","new","half", "while", "when","very", "use", "two","three"]
    if string in noisy or string in stop_words:
        return True
    return False

def isArticle (string):
    articles = ["the","a","one","some","few"]
    if string in articles:
        return True
    return False
    
corpus = []
for ing in tqdm(ingredients['ingredient']):
    for word in ing.split():
        _aux = word.replace('(','').replace(')','').replace(',','').replace("'", "").replace('.', '').lower()
        if _aux[-1] == 's':
            _aux = _aux[:-1]
        if isIngredient(_aux):
            corpus.append(_aux)    

100%|█████████████████████████████████████████████████████████████████████████| 22760/22760 [00:00<00:00, 27714.86it/s]


In [4]:
print ("Total words in corpus: ", len(corpus))

Total words in corpus:  52349


In [5]:
set_corpus = set(corpus)

In [6]:
dict_corpus = dict.fromkeys(set_corpus, 0)
for elem in corpus:
    dict_corpus[elem] += 1

In [7]:
import operator
sorted_x = sorted(dict_corpus.items(), key=operator.itemgetter(1), reverse= True)

In [8]:
list_ingr = []
list_num = []
for ingr, num in sorted_x:
    list_ingr.append(ingr)
    list_num.append(num)

In [9]:
d = {'Ingredient': list_ingr, 'numTimes': list_num}
df = pd.DataFrame(data=d)
df_reduced = df[df['numTimes'] >10]

In [10]:
len(set_corpus)

1811

In [11]:
recepies_dict = {}
for _id, _ing in tqdm(zip(ingredients['id'], ingredients['ingredient'])):
    if not _id in recepies_dict:
        recepies_dict[_id] = []
    for word in _ing.split():
        _aux = word.replace('(','').replace(')','').replace(',','').replace("'", "").replace('.', '').lower()
        if _aux[-1] == 's':
            _aux = _aux[:-1]
        if _aux in set_corpus:
            recepies_dict[_id].append(_aux)
    
#recepies_dict

22760it [00:00, 107073.76it/s]


In [12]:
ids = list(set(ingredients['id']))# + list(set(ingredients2['id']))

In [23]:
matrix = pd.DataFrame(np.full((len(ids), len(set_corpus)), False, dtype=bool), index=ids, columns=set_corpus)

In [24]:
for key in tqdm(recepies_dict):
    for ingr in recepies_dict[key]:
        matrix[ingr][key] = True

100%|█████████████████████████████████████████████████████████████████████████████| 2467/2467 [00:04<00:00, 521.87it/s]


In [25]:
matrix

Unnamed: 0,chiller,beefsteak,serrano,azera,stroganoff,lea,ancho,gressingham,ye,camembert,...,made,grapefruit,toffee,nib,dominican,snowman,brioche,core,taj,wrapper
crispy-sesame-pork,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
baked-mexican-eggs,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
pasta-with-broad-beans-cheese,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
cheats-lasagne,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
chilli-prawn-skewers,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
nectarine-tiramisu,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
prawn-linguine,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
tombstone-biscuits,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
victoria-sponge-cake,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
jerk-grilled-sea-bream,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
