# Load Data

In [79]:
import json
import pandas as pd
import string
import spacy
from spacy.tokenizer import Tokenizer
from textblob import Word
from textblob import TextBlob
from spacy import displacy
import csv
import nltk
import rltk

In [80]:
# Load spaCy's small English pipeline; `nlp` is used by later cells to parse
# sentences (noun chunks, lemmas, matcher input) and for its stop-word list.
nlp = spacy.load('en_core_web_sm')

# Load food ontology

In [81]:
import jsonlines

In [91]:
# Collect lower-cased food labels from the ontology dump (one JSON object per
# line). Each record contributes its "subclass" and "class" labels.
# NOTE(review): no explicit encoding is passed and undecodable bytes are
# silently dropped (errors="ignore").
with open("food_class.jl",errors="ignore") as f:
    food_name = set()
    for entry in jsonlines.Reader(f):
        # (last URI path segment, human-readable label) for both sides.
        pairs = [
            (entry[uri_key]["value"].split("/")[-1], entry[label_key]["value"])
            for uri_key, label_key in (
                ("subclass", "subclassLabel"),
                ("class", "classLabel"),
            )
        ]
        for uri_tail, label in pairs:
            # Skip labels that merely repeat the identifier
            # (presumably entities without a real label - TODO confirm).
            if uri_tail != label:
                food_name.add(label.lower())

food_name = list(food_name)
len(food_name)

16625

In [92]:
# Frequency of every space-separated token across the ontology food names.
food_token = {}
for name in food_name:
    for part in name.split(" "):
        if part not in food_token:
            food_token[part] = 0
        food_token[part] += 1

# Load food recipes

In [7]:
# Read the recipe collection from disk. Only iteration over `recipes` is used
# afterwards (to extend the food-name list).
# Presumably a dict keyed by recipe title or a list of titles - TODO confirm.
with open("recipes.json","r") as f:
    recipes = json.load(f)

In [8]:
# Append the lower-cased recipe entries to the ontology names. `food_name` is
# a plain list at this point, so duplicates are not removed here.
food_name += [_.lower() for _ in recipes]
len(food_name)

46254

In [9]:
"shrimp" in food_name

False

In [111]:
# token count
# Count how often each space-separated token occurs across all food names.
fre_tokens = {}
for name in food_name:
    for word in name.split(" "):
        if word in fre_tokens:
            fre_tokens[word] += 1
        else:
            fre_tokens[word] = 1

# Remove the pipeline's stop words from the table. The keys to delete are
# collected first so the dict is not mutated while it is being iterated.
stopword = nlp.Defaults.stop_words
for word in [candidate for candidate in fre_tokens if candidate in stopword]:
    del fre_tokens[word]

In [180]:
# Probe: list the named entities the pipeline assigns to a short sentence
# that mentions a food.
list(nlp("pizza is good").ents)

[]

In [112]:
# Inspect one token's frequency. Plain indexing raises KeyError for a token
# that never occurred in a food name or was removed as a stop word.
fre_tokens["dish"]

48

In [59]:
"""with open("food_name.txt", "w", errors="replace") as f:
    for food in food_name:
        tokens = food.split(" ")
        
        if len(tokens)==1:
            if tokens[0]:
                print(True)
                f.write(f"{tokens[0]} S\n")
                f.write("\n")
            
        else:
            count = 0
            len_tokens = len(tokens)
            
            while count < len_tokens:
                if not token:
                    print(True)
                    continue
                if count == 0:
                    f.write(f"{tokens[count]} B\n")
                elif count == len_tokens-1:
                    f.write(f"{tokens[count]} I\n")
                else:
                    f.write(f"{tokens[count]} E\n")
                count += 1
            f.write("\n")
"""
"a"

'a'

In [60]:
# Re-loads the same 'en_core_web_sm' pipeline that was already bound to `nlp`
# earlier in this file.
nlp = spacy.load('en_core_web_sm')

In [175]:
# Inspect one token's frequency. Plain indexing raises KeyError for a token
# that never occurred in a food name or was removed as a stop word.
fre_tokens["moscow"]

1

# Build and evaluate the food extractor

In [93]:
def _parse_tagged_lines(lines):
    """Group "token tag" lines into ``[sentence, foods]`` pairs.

    Each non-blank line must hold exactly one token and one tag separated by
    a single space; a blank line ends the current sentence.

    Tag handling:
      * ``O`` - token is outside any food mention; a pending mention is dropped.
      * ``U`` - token is a complete single-token food mention.
      * ``E`` - token ends the pending mention, which is then recorded.
      * any other tag - token is appended to the pending mention.

    Args:
        lines: iterable of text lines, with or without a trailing newline.

    Returns:
        A list of ``[sentence_text, [food, ...]]`` where ``sentence_text`` is
        the sentence's tokens joined by single spaces.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            space-separated fields.
    """
    sents = []
    sent, foods, food = [], [], []
    for line in lines:
        # Strip the newline up front so the final line of a file that lacks a
        # trailing newline is tagged the same way as every other line.
        line = line.rstrip("\n")
        if not line:
            sents.append([" ".join(sent), foods])
            # Also reset the pending mention so an unterminated one cannot
            # leak into the next sentence.
            sent, foods, food = [], [], []
            continue

        token, tag = line.split(" ")
        if tag == "O":
            food = []
        elif tag == "U":
            foods.append(token)
            food = []
        elif tag == "E":
            food.append(token)
            foods.append(" ".join(food))
            food = []
        else:
            food.append(token)
        sent.append(token)

    # Flush the last sentence when the input does not end with a blank line;
    # previously that sentence was silently discarded.
    if sent:
        sents.append([" ".join(sent), foods])
    return sents


with open("train2", "r") as f:
    sents = _parse_tagged_lines(f)

In [94]:
# Gold food mentions per sentence: the second element of every
# [sentence, foods] pair in `sents`.
truth = [_[1] for _ in sents]

In [95]:
# get data chunks
# Noun chunks the pipeline finds in each training sentence, one list per
# sentence, kept as returned (no post-processing is applied yet).
noun_chunks = [list(nlp(text).noun_chunks) for text, _ in sents]

In [123]:
from spacy.matcher import Matcher

# Token-pattern matcher that proposes food-phrase candidates.
# NOTE(review): `matcher.add(name, None, pattern)` is the spaCy v2 call form;
# spaCy v3 expects `matcher.add(name, [pattern])` - confirm the installed version.
matcher = Matcher(nlp.vocab)

# Zero or more adjectives followed by one or more nouns / proper nouns,
# e.g. "fresh seed bread" or a bare "espresso".
# Previously this pattern was overwritten by the ADP pattern before being
# registered, so both rule names ended up with the same noun-ADP-noun pattern.
pattern = [{"POS":"ADJ","OP":"*"},{"POS":{"IN": ["PROPN", "NOUN"]}, "OP":"+"}]
matcher.add("NOUN_COMPOUND", None, pattern)

# Nouns joined by an adposition, e.g. "shrimp in creole sauce".
pattern = [{"POS":{"IN": ["PROPN", "NOUN"]}, "OP":"+"},{"POS":"ADP"},{"POS":{"IN": ["PROPN", "NOUN"]}, "OP":"+"}]
matcher.add("NOUN_ADP_NOUN", None, pattern)

In [124]:
def greedy_compound(matches, nlp_sent):
    """Merge matcher spans into maximal lemma-joined phrases.

    Every token covered by at least one match is marked; each maximal run of
    consecutive marked tokens becomes one phrase built from the tokens'
    lemmas, so overlapping or touching matches collapse into a single phrase.

    Args:
        matches: iterable of ``(match_id, start, end)`` triples with ``end``
            exclusive, indexing into ``nlp_sent``.
        nlp_sent: sized iterable of tokens exposing a ``lemma_`` attribute.

    Returns:
        List of phrases (lemmas joined by single spaces) in sentence order.
    """
    covered = len(nlp_sent) * [False]
    for _, start, end in matches:
        covered[start:end] = [True] * (end - start)

    tokens = []
    hold = []
    for status, token in zip(covered, nlp_sent):
        if status:
            hold.append(token.lemma_)
        elif hold:
            # An uncovered token closes the phrase collected so far.
            tokens.append(" ".join(hold))
            hold = []

    # A phrase that runs up to the last token has nothing after it to close
    # it, so flush it here; previously it was silently lost.
    if hold:
        tokens.append(" ".join(hold))

    return tokens

In [125]:
# Demo: run the matcher on one sample sentence. `matches` is later unpacked
# as (match_id, start, end) triples by greedy_compound.
matches = matcher(nlp("all they need is a bakery with fresh seed bread, espresso, and italian pastries and i will be 400 pounds."))

In [126]:
# Demo: merge the matched spans into candidate phrases.
# NOTE(review): the sentence is parsed again here; `matches` came from a
# separate parse of the same text.
couple = greedy_compound(matches, nlp("all they need is a bakery with fresh seed bread, espresso, and italian pastries and i will be 400 pounds."))
couple

['bakery', 'fresh seed bread', 'espresso', 'italian pastry', 'pound']

In [127]:
# Spot-check a single candidate.
# NOTE(review): in file order this call appears before `check_food` is
# defined; it only works if the defining cell has been executed first.
check_food("pound", food_name, fre_tokens, threshod=10)

False

In [128]:
def check_food(token, food_name, fre_tokens, threshod = 10):
    """Decide whether a candidate phrase should be treated as a food.

    A candidate is accepted when it appears verbatim in ``food_name``, or
    when the mean frequency of its space-separated words (looked up in
    ``fre_tokens``, unknown words counting as 0) is strictly greater than
    ``threshod``.

    Args:
        token: candidate phrase.
        food_name: container of known food names (membership-tested).
        fre_tokens: mapping from word to its frequency.
        threshod: minimum mean word frequency (exclusive).

    Returns:
        True if the candidate is accepted, otherwise False.
    """
    if token not in food_name:
        scores = [fre_tokens.get(part, 0) for part in token.split(" ")]
        mean_score = sum(scores) / len(scores)
        return mean_score > threshod
    return True

def hybrid_similarity(m ,n):
    """Levenshtein similarity of two strings, snapped to 1 for near matches.

    Returns 1 when ``rltk.levenshtein_similarity(m, n)`` is above 0.7,
    otherwise returns that similarity unchanged.
    """
    score = rltk.levenshtein_similarity(m, n)
    return 1 if score > 0.7 else score

def check_accuracy(truth_line,predict_line):
    """Flag which gold phrases are matched by at least one prediction.

    Each prediction is compared with every gold phrase that is still
    unmatched. The two phrases are split on spaces and compared as word sets
    using ``rltk.hybrid_jaccard_similarity`` with ``hybrid_similarity`` as the
    word-level function; a score of 0.5 or more marks the gold phrase as found.

    Args:
        truth_line: list of gold phrases for one sentence.
        predict_line: list of predicted phrases for the same sentence.

    Returns:
        List of booleans aligned with ``truth_line``.
    """
    res = [False] * len(truth_line)

    for predict in predict_line:
        predicted_words = predict.split(" ")
        for idx, gold in enumerate(truth_line):
            if res[idx]:
                # This gold phrase was already matched by a prediction.
                continue
            score = rltk.hybrid_jaccard_similarity(
                set(predicted_words),
                set(gold.split(" ")),
                function=hybrid_similarity,
            )
            if score >= 0.5:
                res[idx] = True

    return res

In [129]:
# Run the extractor over every training sentence and keep, per sentence, the
# lower-cased text together with the candidates accepted as food.
test_food = []
for text, _gold in sents:
    lowered = text.lower()
    doc = nlp(lowered)

    # Candidate phrases come from the merged matcher spans.
    candidates = greedy_compound(matcher(doc), doc)

    # Keep only the candidates that pass the food check.
    accepted = [
        candidate
        for candidate in candidates
        if check_food(candidate, food_name, fre_tokens, threshod=10)
    ]
    test_food.append([lowered, accepted])

In [130]:
# Share of gold food mentions that were matched by some prediction.
total = sum(map(len, truth))
count = sum(
    sum(check_accuracy(gold, predicted))
    for gold, (_, predicted) in zip(truth, test_food)
)
count/total

0.688212927756654

In [131]:
# Print gold mentions and predictions one under the other for each sentence.
for gold, (_, predicted) in zip(truth, test_food):
    print(gold)
    print(predicted)

['pork', 'garlic chicken platter', 'camarones enchilados', 'shrimp in creole sauce', 'mango milkshake', 'iced tea']
['pork', 'garlic chicken platter', 'shrimp in creole sauce', 'iced tea']
['mojo de ajo garlic sauce']
['side order of mojo de ajo garlic sauce', 'dish']
['raspberry iced tea']
['raspberry iced tea']
['picadillo a la criolla ground beef', 'cuban style extra lean ground beef with potatoes', 'tomato-based sauce', 'beef']
['la criolla ground beef cuban style extra lean ground beef with potato', 'tomato', 'sauce', 'perfect beef']
['bistec picadito steak strips', 'sirloin steak', 'bell peppers', 'onions', 'olive oil', 'tomato sauce']
['bistec picadito steak strip strip of sirloin steak', 'onion', 'olive oil', 'tomato sauce']
['camarones al ajillo shrimp in garlic sauce']
['camarone al ajillo shrimp', 'garlic sauce']
['smoothies', 'cuban soft drinks']
['cuban soft drink']
['moros , platanos fritos y ensalada verde', 'beans', 'rice', 'plantains', 'mixed green salad']
['bean', 'ri

In [119]:
# Each line of the file is parsed as a separate JSON document.
with open("RESTAURANT.json","r", errors="replace") as f:
    data = [json.loads(raw_line) for raw_line in f]

In [120]:
# Number of records loaded into `data`.
len(data)

7644

In [163]:
# Extract food mentions from every restaurant's reviews.
# Result: predict_dict[url] -> list of [lower-cased sentence, accepted foods].
predict_dict = {}
count = 0
for restaurant in data:
    review_text = " ".join(restaurant["review_txt"])

    per_sentence = []
    for raw_sentence in nltk.sent_tokenize(review_text):
        lowered = raw_sentence.lower()
        doc = nlp(lowered)

        # Candidate phrases come from the merged matcher spans.
        candidates = greedy_compound(matcher(doc), doc)

        # Keep only the candidates that pass the food check.
        accepted = [
            candidate
            for candidate in candidates
            if check_food(candidate, food_name, fre_tokens, threshod=10)
        ]
        per_sentence.append([lowered, accepted])

    predict_dict[restaurant["url"]] = per_sentence

    # In-place progress counter (carriage return rewrites the same line).
    print("\r",count, end="")
    count += 1

 7643

In [164]:
# Inspect the per-sentence [sentence, foods] predictions stored for one
# restaurant page URL.
predict_dict["https://www.tripadvisor.com/Restaurant_Review-g32655-d10177638-Reviews-or40-Whole_Foods_Market-Los_Angeles_California.html"]

[['whole foods markets are good and this one is tops.', ['whole food market']],
 ['i love their hearst meats and local fish market.',
  ['hearst meat', 'local fish market']],
 ['we create great meals when you get such great quality from them.', []],
 ["yes it's pricier than most but the value is there.", []],
 ['same with fruit, vegetables,...eggs and cheese.',
  ['fruit', 'egg', 'cheese']],
 ["they're doing a good job during covid19 keeping things sanitized, masked but they need to control the shoppers a bit better by perhaps one way arrows.",
  []],
 ["we're all over the place.", []],
 ['everyone that works there is masked and gloves.', []],
 ['if visiting dtla go here instead of ralphs.', []],
 ['thanks whole foods for the nice shopping during this gloomy era.more we stocked up here on fruit, water and cookies for our hotel room.',
  ['thank whole food', 'fruit', 'water', 'cookie']],
 ['there is a hot buffet section that looks tempting  and facilities in the next door bar and on the

In [166]:
# Persist the predictions as JSON: {url: [[sentence, [food, ...]], ...]}.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used for the output file.
with open("rest_food.json", "w") as f:
    json.dump(predict_dict, f, indent=1)