In [1]:
import numpy as np
import pandas as pd

In [2]:
ingredients = pd.read_csv('Source1Ingredients.csv')
ingredients2 = pd.read_csv('Source2Ingredients.csv')

In [3]:
from tqdm import tqdm
from stop_words import get_stop_words
stop_words = get_stop_words('en')


def isIngredient(word):
    if len(word) < 2:
        return False
    
    if isNumberInside(word):
        return False
    
    if isPreposition(word):
        return False
    
    if isNoisy(word):
        return False
    
    if isArticle(word):
        return False
    
    if isAction(word):
        return False

    if hasStrange(word):
        return False
    
    
    return True
    
    
def hasStrange(string):
    symbols = ['<', '/', '>', '&', ':', ';', '"']
    for s in symbols:
        if s in string:
            return True
    
    return False
def isAction(string):
    if string[-2:] == 'ed' or string[-2:] == 'ly' or string[-3:] == 'ing':
        return True
    return False
    
def isNumberInside (string):
    for caracter in string:
        if caracter.isnumeric():
            return True
    return False

def isPreposition (string):
    prepositions = ["aboard",'choose', 'almost',"about","above","across","after","against","along","amid","among","anti","around","as","at","before","behind","below","beneath","beside","besides","between","beyond","but","by","concerning","considering","despite","down","during","except","excepting","excluding","following","for","from","in","inside","into","like","minus","near","of","off","on","onto","opposite","outside","over","past","per","plus","regarding","round","save","since","than","through","to","toward","towards","under","underneath","unlike","until","up","upon","versus","via","with","within","without"]
    if string in prepositions:
        return True
    return False

def isNoisy (string):
    noisy = ['whole','baby', 'small','pound','finely','white','freshly','chopped', 'ounce', 'cup', 'ground', 'fresh', 'teaspoon', 'large', 'sliced', 'black', '<strong>for', "cup","tablespoon", "teaspoon", "tablespoon", "ml","if","tbsp","tbsb","tb","tbp","and","tsp","level","extra","cut","or","you","leave","plain","selection","medium","clear","few","price","yellow","new","half", "while", "when","very", "use", "two","three"]
    if string in noisy or string in stop_words:
        return True
    return False

def isArticle (string):
    articles = ["the","a","one","some","few"]
    if string in articles:
        return True
    return False
    
corpus = []
for ing in tqdm(ingredients['ingredient']):
    for word in ing.split():
        _aux = word.replace('(','').replace(')','').replace(',','').replace("'", "").replace('.', '').lower()
        if _aux[-1] == 's':
            _aux = _aux[:-1]
        if isIngredient(_aux):
            corpus.append(_aux)    

100%|█████████████████████████████████████████████████████████████████████████| 22760/22760 [00:00<00:00, 27514.95it/s]


In [4]:
for ing in tqdm(ingredients2['ingredient']):
    try:
        for word in ing.split():
            _aux = word.replace('(','').replace(')','').replace(',','').replace("'", "").replace('.', '').lower()
            if _aux[-1] == 's':
                _aux = _aux[:-1]
            if isIngredient(_aux):
                corpus.append(_aux)
    except:
        pass
        #print (ing)

100%|███████████████████████████████████████████████████████████████████████| 143165/143165 [00:05<00:00, 25500.93it/s]


In [5]:
print ("Total words in corpus: ", len(corpus))

Total words in corpus:  359826


In [6]:
set_corpus = set(corpus)

In [7]:
dict_corpus = dict.fromkeys(set_corpus, 0)
for elem in corpus:
    dict_corpus[elem] += 1

In [8]:
import operator
sorted_x = sorted(dict_corpus.items(), key=operator.itemgetter(1), reverse= True)

In [9]:
list_ingr = []
list_num = []
for ingr, num in sorted_x:
    list_ingr.append(ingr)
    list_num.append(num)

In [10]:
d = {'Ingredient': list_ingr, 'numTimes': list_num}
df = pd.DataFrame(data=d)
df_reduced = df[df['numTimes'] >10]

In [11]:
len(set_corpus)

6815

In [12]:
recepies_dict = {}
for _id, _ing in tqdm(zip(ingredients['id'], ingredients['ingredient'])):
    if not _id in recepies_dict:
        recepies_dict[_id] = []
    for word in _ing.split():
        _aux = word.replace('(','').replace(')','').replace(',','').replace("'", "").replace('.', '').lower()
        if _aux[-1] == 's':
            _aux = _aux[:-1]
        if _aux in set_corpus:
            recepies_dict[_id].append(_aux)
    
#recepies_dict

22760it [00:00, 108089.85it/s]


In [13]:
ids = list(set(ingredients['id'])) + list(set(ingredients2['id']))

In [14]:
matrix = pd.DataFrame(np.full((len(ids), len(set_corpus)), False, dtype=bool), index=ids, columns=set_corpus)

In [15]:
for key in tqdm(recepies_dict):
    for ingr in recepies_dict[key]:
        matrix[ingr][key] = True

100%|█████████████████████████████████████████████████████████████████████████████| 2467/2467 [00:18<00:00, 130.83it/s]


In [16]:
matrix

Unnamed: 0,kelt,crisper,tealeave,hibiscus-lime,cilantro-arugula,mam,velvet,gulden,lillet,messy,...,reese,witout,rooster,port,manila,flanken-style,oronoco,aliseo,scapes*,tablesopon
hot-lemon-souffles,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
spanish-style-omelette,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
madras-egg-curry,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
poached-apricots,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
slow-cooked-duck-with-cherry-sauce,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
mushroom-lentil-ragu-with-spaghetti,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
turkey-mushroom-pie,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
summer-beef-salad-with-horseradish-dressing,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
masterclass-hollandaise-sauce,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
dairy-free-poppy-seed-and-banana-pancakes,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
matrix.to_csv("RecommenderRecepieIngredient")