In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
import jsonlines
import rltk

In [4]:
with open("food_class.jl",errors="ignore") as f:
    wr = jsonlines.Reader(f)
    
    food_class_dict = dict()
    idx_name = dict()
    for item in wr:
        id_1 = item["subclass"]["value"].split("/")[-1]
        name1 = item["subclassLabel"]["value"]
        
        id_2 = item["class"]["value"].split("/")[-1]
        name2 =item["classLabel"]["value"]
        if id_1 != name1 and id_2 != name2:
            food_class_dict[name1.lower()] = id_1
            food_class_dict[name2.lower()] = id_2
            idx_name[id_1] = name1.lower()
            idx_name[id_2] = name2.lower()

In [5]:
food_class_dict["food"]

'Q2095'

In [6]:
# token count
fre_tokens = dict()
for food in food_class_dict:
    tokens = food.split(" ")
    
    for token in tokens:
        temp = fre_tokens.get(token,set())
        temp.add(food_class_dict[food])
        fre_tokens[token] = temp
        
# remove stop word
stopword = nlp.Defaults.stop_words
for key in list(fre_tokens.keys()):
    if key in nlp.Defaults.stop_words:
        del fre_tokens[key]

In [7]:
fre_tokens["noodle"]

{'Q1051265',
 'Q10892305',
 'Q15014447',
 'Q15932455',
 'Q17038368',
 'Q17052769',
 'Q192874',
 'Q19610131',
 'Q2221627',
 'Q28224412',
 'Q3275558',
 'Q391082',
 'Q47096879',
 'Q47149231',
 'Q4939235',
 'Q5299471',
 'Q61972328',
 'Q65547070',
 'Q67440705',
 'Q697498',
 'Q835336',
 'Q872828',
 'Q98768411',
 'Q98826752'}

In [8]:
idx_name["Q98826752"]

'noodle dish'

In [9]:
def greedy_compound(matches, nlp_sent):
    tokens = []
    sent = len(nlp_sent)*[False]
    for _, start, end in matches:
        sent[start:end] = [True]*(end-start)
    print(sent)
    
    hold = []
    for status, token in zip(sent, nlp_sent):
        if status:
            hold.append(token.lemma_)
        elif status or hold:
            tokens.append(" ".join(hold))
            hold = []
        elif not (status or hold):
            continue
    if hold:
        tokens.append(" ".join(hold))
    # tokens is greedy token
    return tokens

In [10]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
pattern = [{"POS":"ADJ","OP":"*"},{"POS":{"IN": ["PROPN", "NOUN"]}, "OP":"+"}]
matcher.add("NOUN_COMPOUND", None, pattern)

#pattern = [{"POS":{"IN": ["PROPN", "NOUN"]}, "OP":"+"},{"POS":"ADP"},{"POS":{"IN": ["PROPN", "NOUN"]}, "OP":"+"}]
#matcher.add("NOUN_ADP_NOUN", None, pattern)

# Input sentence

In [269]:
# Put sentence here
input_sent = "I want to have Japanese noodle and pepperoni pizza""
sent_nlp= nlp(input_sent)
matches = matcher(sent_nlp)

SyntaxError: EOL while scanning string literal (<ipython-input-269-8ea48bfe482f>, line 2)

In [12]:
couple = greedy_compound(matches, sent_nlp)
couple

[False, False, False, False, True, True]


['japanese noodle']

In [13]:
def check_food(token, food_name, fre_tokens, threshod = 10):
    if token in food_name:
        return True
    
    words = token.split(" ")
    
    words_score = [fre_tokens.get(word,0) for word in words]
    if sum(words_score)/len(words_score)>threshod:
        return True
    
    return False

def hybrid_similarity(m ,n):
    similarity = rltk.levenshtein_similarity(m,n)
    
    if similarity > 0.7:
        similarity = 1
        return similarity
    else:
        return similarity

In [14]:
foods = []
for item in couple:
    if item in food_class_dict:
        foods.append(item)
        continue
    
    food_token = item.split(" ")
    
    candits = set()
    for token in food_token:
        if token in fre_tokens:
            for _ in fre_tokens[token]:
                candits.add(idx_name[_])
            
    highest = (0,"")
    for candit in candits:
        s1 = food_token
        s2 = candit.split(" ")
        similarity = rltk.hybrid_jaccard_similarity(set(s1), set(s2),function=hybrid_similarity)
        
        if similarity > highest[0]:
            highest = (similarity, candit)
            
    if highest[-1]:
        foods.append(highest[-1])

In [15]:
id_s = []

for food in foods:
    id_s.append(food_class_dict[food])

In [16]:
foods,id_s

(['japanese noodles'], ['Q17116319'])

In [17]:
food_class_dict["soft drink"]

'Q147538'

In [267]:
def search_food(id_):
    # search food by the id of class in food ontology 
    id_ = "wd:"+id_

    sparql = SPARQLWrapper("http://localhost:3030/food/query")
    sparql.setQuery(f"""
        PREFIX my_ns: <http://dsci558.org/myfakenamespace#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
        PREFIX wd: <http://www.wikidata.org/entity/> 
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 

        SELECT ?rest (group_concat(distinct(?name); separator=", ") as ?f)
        WHERE {{
        {{
        {{?food rdfs:subClassOf {id_} .}}
        UNION
        {{?food rdfs:subClassOf/rdfs:subClassOf {id_} .}}
        UNION
        {{?food rdfs:subClassOf/rdfs:subClassOf/rdfs:subClassOf {id_} .}}
        UNION
        {{?food rdfs:subClassOf/rdfs:subClassOf/rdfs:subClassOf/rdfs:subClassOf {id_} .}}
        UNION
        {{?food rdfs:subClassOf/rdfs:subClassOf/rdfs:subClassOf/rdfs:subClassOf/rdfs:subClassOf {id_} .}}
        }}
        {{?rest my_ns:hasFood [a ?food;
        rdfs:label ?name]}}
        UNION
        {{?rest my_ns:hasFood [a {id_};
        rdfs:label ?name]}}
         }} group by ?rest order by desc(count(?name))
         limit 200
    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    
    combine = [ (_["rest"]["value"],_["f"]["value"]) for _ in results["results"]["bindings"]]
    return combine
#uri_s = [_["rest"]["value"] for _ in results["results"]["bindings"]]

In [268]:
search_food("Q17116319")

[('https://www.tripadvisor.com/Restaurant_Review-g32655-d3800202-Reviews-or130-Tsujita_LA-Los_Angeles_California.html',
  'tsukemen, udon noodle, ramen, japanese noodle, japanese style noodle, japanese noodle bar'),
 ('https://www.tripadvisor.com/Restaurant_Review-g32655-d1022582-Reviews-or200-Daikokuya-Los_Angeles_California.html',
  'tsukemen, tonkotsu ramen, ramen, japanese noodle'),
 ('https://www.tripadvisor.com/Restaurant_Review-g32655-d3321000-Reviews-or10-Cha_Cha_Chili-Los_Angeles_California.html',
  'japanese noodle'),
 ('https://www.tripadvisor.com/Restaurant_Review-g32655-d4178694-Reviews-or130-Tatsu_Ramen-Los_Angeles_California.html',
  'japanese noodle'),
 ('https://www.tripadvisor.com/Restaurant_Review-g32655-d12655622-Reviews-Dosanko_Ramen-Los_Angeles_California.html',
  'deluxe shio butter raman, red miso raman, white miso raman'),
 ('https://www.tripadvisor.com/Restaurant_Review-g32655-d4019757-Reviews-or70-Marugame_Monzo-Los_Angeles_California.html',
  'hot sansai udo

In [251]:
def search_res_with_food(food_uri):

    sparql_ = SPARQLWrapper("http://localhost:3030/rest_hotel/query")
    sparql_.setQuery(f"""
        PREFIX my_ns: <http://dsci558.org/myfakenamespace#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
        PREFIX wd: <http://www.wikidata.org/entity/> 
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 

        SELECT ?name ?rate ?price ?location (max(?value) as ?v)
        {{ {food_uri} my_ns:name ?name ;
        my_ns:rating ?rate ;
        my_ns:price ?price ;
        my_ns:location ?location ;
        a my_ns:restaurant ;
        my_ns:hasFood [my_ns:name ?food_name;
        rdf:value ?value]
        FILTER (?food_name != "food")
        }}
        group by ?name ?rate ?price ?location limit 100
    """)
    sparql_.setReturnFormat(JSON)
    results = sparql_.query().convert()
    
    count = results["results"]["bindings"][0]["v"]["value"]
    
    sparql_.setQuery(f"""
        PREFIX my_ns: <http://dsci558.org/myfakenamespace#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
        PREFIX wd: <http://www.wikidata.org/entity/> 
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 

        SELECT ?food_name
        {{ {food_uri} 
        my_ns:hasFood [my_ns:name ?food_name;
        rdf:value {count}]
        }}
        limit 1
    """)
    
    sparql_.setReturnFormat(JSON)
    results2 = sparql_.query().convert()
    
    data = results["results"]["bindings"][0]
    data["food"] = results2["results"]["bindings"][0]["food_name"]
    
    res = dict()
    
    for item in data:
        res[item] = data[item]["value"]
    
    del res["v"]
    return res

In [252]:
search_res_with_food("<https://www.tripadvisor.com/Restaurant_Review-g32655-d348188-Reviews-or30-Guido_s-Los_Angeles_California.html>")

{'name': "Guido's",
 'rate': '4.0',
 'price': '$$ - $$$',
 'location': '14556 Polk St, Los Angeles, CA 90025',
 'food': 'cioppino'}

In [243]:
test = search_food('Q17116319')[0]

In [244]:
test

'http://www.wikidata.org/entity/Q11271930'

In [15]:
def search_res_with_food(food_uri):
    sparql = SPARQLWrapper("http://localhost:3030/rest_hotel/query")
    sparql.setQuery("""
        PREFIX my_ns: <http://dsci558.org/myfakenamespace#> 
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
        PREFIX schema: <https://schema.org/> 
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 
        SELECT ?rest
        WHERE { ?rest a my_ns:restaurant ;
        my_ns:hasFood/m}
    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

results

{'head': {'vars': ['s', 'p', 'o']},
 'results': {'bindings': [{'s': {'type': 'bnode', 'value': 'b0'},
    'p': {'type': 'uri',
     'value': 'http://dsci558.org/myfakenamespace#hasSent'},
    'o': {'type': 'literal',
     'value': 'this area of la seems pretty safe with lots of folks walking around and touring.more downtown area little tokyo understandably has an abundance noodle bars.'}},
   {'s': {'type': 'bnode', 'value': 'b0'},
    'p': {'type': 'uri', 'value': 'http://dsci558.org/myfakenamespace#name'},
    'o': {'type': 'literal', 'value': 'abundance noodle bar'}},
   {'s': {'type': 'bnode', 'value': 'b0'},
    'p': {'type': 'uri',
     'value': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#value'},
    'o': {'type': 'literal',
     'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
     'value': '1'}}]}}