In [1]:
import pandas as pd
import numpy as np
import math
import json
import yaml
import operator

from itertools import combinations
from collections import Counter as ctr

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from tqdm import tqdm
import ast

In [2]:
index_food_tfidf = pd.read_csv('baseline_food_tfidfindex.csv', names=['word', 'tf-idf'])
index_food_tfidf["tf-idf"] = [ast.literal_eval(x) for x in index_food_tfidf['tf-idf']]

In [3]:
index_food_tfidf

Unnamed: 0,word,tf-idf
0,hygiene,{190115: 4.45537654793089}
1,experimented,"{679: 5.145062365603624, 1085: 2.5725311828018..."
2,barbecued,"{198: 3.3949605237266374, 1968: 10.18488157117..."
3,satiety,{90918: 5.940502063907854}
4,smith,"{549: 2.457108648186269, 1012: 3.6856629722794..."
5,ambition,"{68379: 4.20537654793089, 114787: 16.821506191..."
6,throughout,"{79: 4.5553498790121045, 2414: 3.0368999193414..."
7,brittle,"{919: 3.8642291670758935, 3493: 5.796343750613..."
8,lake,"{86: 4.695526820029015, 292: 1.565175606676338..."
9,administrator,"{163612: 5.60716873057452, 165746: 4.205376547..."


In [10]:
data_combined = pd.read_csv('recipes.csv')

In [11]:
data_combined.head()

Unnamed: 0.1,Unnamed: 0,name,id,content
0,0,arriba baked winter squash mexican style,137739,60-minutes-or-less time-to-make course main-in...
1,1,a bit different breakfast pizza,31490,30-minutes-or-less time-to-make course main-in...
2,2,all in the kitchen chili,112140,time-to-make course preparation main-dish chil...
3,3,alouette potatoes,59389,60-minutes-or-less time-to-make course main-in...
4,4,amish tomato ketchup for canning,44061,weeknight time-to-make course main-ingredient ...


In [28]:
topick = clean('guinness beef stew')
topick

['guin', 'beef', 'stew']

In [31]:
w_dict = index_food_tfidf.loc[index_food_tfidf.word == 'stew', 'tf-idf'].item()

In [12]:
def snippet (doc_id, q):
    q = clean(q)
    sent_doc = pd.DataFrame()
    cosine_df = pd.DataFrame()
    
    #Grab Full doc from corpus, since I dont keep periods
    doc = data_combined.loc[data_combined.index == doc_id, 'content'].item()
    
    #Tokenize into sentences and clean
    sent_doc['sent'] = sent_tokenize(doc)
    sent_doc['clean'] = [clean(x) for x in sent_doc.sent]
    
    #Create vector from query and sentence
    cosine = []
    for x in sent_doc.clean:
        x_set = set(x)
        q_set = set(q)
        vector = x_set.union(q_set)
    
    #Calculate vectore values for both
        q_v = []
        s_v = []
        for w in vector:
            if w in q_set:
                q_v_temp = index_food_tfidf.loc[index_food_tfidf.word == w]['tf-idf'].item().get(doc_id)
                if(q_v_temp == None):
                    q_v.append(0)
                else:
                    q_v.append(q_v_temp)
            else:
                q_v.append(0)
            if w in x_set:
                s_v_temp = index_food_tfidf.loc[index_food_tfidf.word == w]['tf-idf'].item().get(doc_id)
                if(s_v_temp == None):
                    s_v.append(0)
                else:
                    s_v.append(s_v_temp)
            else:
                s_v.append(0)
    
    #Calcumate cosine simularity
        c = 0
        for i in range(len(vector)):
            c += q_v[i] * s_v[i]
        cosine.append(c / math.sqrt((math.pow(sum(q_v), 2)) * (math.pow(sum(s_v), 2))))
    cosine_df['sim'] = cosine
    cosine_df = cosine_df.sort_values(by=['sim'], ascending =False).reset_index()
    sent_pos = list(cosine_df[0:3]['index'])
    

    snip = []
    snip.append(data_combined.name[doc_id])
    for x in sent_pos:
        snip.append(sent_doc.sent[x])
    return snip

In [13]:
lemmatizer = WordNetLemmatizer()
stops = stopwords.words('english')
coll_stops = ["prepare", "course", "dietary", "easy", "pepper", "recipe", "salt"]
stops.extend(coll_stops)
ps = PorterStemmer()

stops.extend(coll_stops)

def clean(doc):
    doc_low = doc.lower().replace("–", " ")
    words = word_tokenize(doc_low)
    words = [lemmatizer.lemmatize(w).strip() for w in words if not w in stops and w.isalpha()]
    words = [ps.stem(w) for w in words]
    return words

In [14]:
def get_candidate_resources(query):
    clean_query = clean(query)
    CR = []
    word_key_list = []
    if len(clean_query) == 1:
        only_word = clean_query[0]    #sw = single word
        sw_dict = index_food_tfidf.loc[index_food_tfidf.word == only_word, 'tf-idf'].item()
        CR.extend(list(sw_dict.keys()))
        if len(CR) > 50:
            return CR[:50]
        else: return CR
    for w in clean_query:
        w_dict = index_food_tfidf.loc[index_food_tfidf.word == w, 'tf-idf'].item()
        word_key_list.append(list(w_dict.keys()))
    CR.extend(set(word_key_list[0]).intersection(*word_key_list[1:]))
    if len(clean_query) == 2:
        if len(CR)<50:
            first_word = clean_query[0]
            second_word = clean_query[1]
            first_dict = index_food_tfidf.loc[index_food_tfidf.word == first_word, 'tf-idf'].item()
            second_dict = index_food_tfidf.loc[index_food_tfidf.word == second_word, 'tf-idf'].item()
            CR.extend(list(first_dict.keys()))
            CR.extend(list(second_dict.keys()))
            if len(set(CR))>50:
                return CR[:50]
            else: return set(CR)
        else:
            if len(CR)>50: return CR[:50]
            else: return CR
    elif len(CR)<50:     #if the list of candidate resources is less than 50
        combs = combinations(clean_query, len(query)-1)     #use n-1 terms from the query
        for comb in list(combs):        #for combination in combinations
            word_key_list = []
            for w in list(comb):        #for word in n-1 combination
                w_dict = index_food_tfidf.loc[index_food_tfidf.word == w, 'tf-idf'].item()
                word_key_list.append(list(w_dict.keys()))
            CR.extend(set(word_key_list[0]).intersection(*word_key_list[1:])) #find the intersection between all n-1 query terms
            CR = set(CR)
            if len(set(CR))<50:
                continue
            elif len(set(CR))>50:
                CR = set(CR)[:49]
                break
            else: break
    return CR[:50]

In [15]:
def relevance_ranking(q, cand_resources):
    clean_query = clean(q)
    rel_docs = {}
    rel_score = 0
    if len(clean_query)==1:
        w = clean_query[0]
        tfidf_dict = index_food_tfidf.loc[index_food_tfidf.word == w, 'tf-idf'].item()
        for d in cand_resources:
            result = tfidf_dict[d]
            rel_docs[d] = result
    else:
        for d in cand_resources:
            for word in clean_query:
                tfidf_dict = index_food_tfidf.loc[index_food_tfidf.word == word, 'tf-idf'].item()
                if tfidf_dict.get(d):
                    result = tfidf_dict[d]
                    rel_score += result
            rel_docs[d] = rel_score
    sorted_rel_docs = sorted(rel_docs.items(), key=operator.itemgetter(1), reverse=True)
    results = [i[0] for i in sorted_rel_docs[:5]]
    return results

In [32]:
q = "black forest cake"
r = get_candidate_resources(q)

In [33]:
len(r)

13

In [34]:
ranked_resources = relevance_ranking(q, r)

In [35]:
ranked_resources

[126335, 51709, 130706, 191469, 41320]

In [38]:
q = 'black forest cake'
snippet(126335, q)

ValueError: can only convert an array of size 1 to a Python scalar

In [39]:
snippet(51709, q)

ValueError: can only convert an array of size 1 to a Python scalar

In [159]:
snippet(217772, q)

['turducken roulade',
 'time-to-make course main-ingredient cuisine preparation occasion north-american main-dish poultry american cajun southern-united-states holiday-event thanksgiving meat 4-hours-or-less this is a roulade version of the cajun specialty turducken.',
 'i am also posting recipes for the cajun cornbread stuffing and the cajun rice dressing used in this recipe.',
 'boneless turkey breast boneless duck breast boneless chicken breast cornbread stuffing mix rice dressing bacon']

In [160]:
snippet(212645, q)

['the turducken of thanksgiving sides',
 'now your turducken thanksgiving just got real!',
 '60-minutes-or-less time-to-make course main-ingredient preparation occasion yams-sweet-potatoes low-protein stuffings-dressings side-dishes potatoes vegetables holiday-event dietary thanksgiving low-in-something mashed-potatoes mashed sweet potatoes with pecans centered by perfectly mashed potatoes in a stuffing crust.',
 'this side dish is easy to put together and simply delicious!']

In [156]:
#snippet(41320, q)