# My project


In [95]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import json
import networkx as nx
import pandas as pd
import rdflib
from collections import defaultdict, Counter
import locale
_ = locale.setlocale(locale.LC_ALL, '')
from _plotly_future_ import v4_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import os
import numpy as np

import re

#NER
from transformers import pipeline, set_seed
from huggingface_hub import notebook_login
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import create_optimizer
from transformers import TFAutoModelForTokenClassification
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback

import editdistance
import difflib
from difflib import SequenceMatcher


## 1. Dataset


### 1.1 Load the data

In [2]:
# Build an empty in-memory RDF graph and fill it from the local dataset file.
graph = rdflib.Graph()
# NOTE(review): the file carries an .nt extension but is read with the Turtle
# parser — confirm which serialization the dataset really uses.
# As the cell's last expression, the value returned by parse() is what gets displayed.
graph.parse('./dataset/14_graph.nt', format='turtle')

<Graph identifier=N7354d8ffbdfa49d79973a1c9d9b87217 (<class 'rdflib.graph.Graph'>)>

### 1.2 Graph Statistics

In [3]:
# Namespace prefixes used in the graph; indexing one (e.g. WD['Q5']) yields the full URI.
WD = Namespace('http://www.wikidata.org/entity/')          # entities (Q-ids)
WDT = Namespace('http://www.wikidata.org/prop/direct/')    # direct properties (P-ids)
SCHEMA = Namespace('http://schema.org/')
DDIS = Namespace('http://ddis.ch/atai/')

In [171]:
# Pre-saved Wikidata identifiers, keyed by their human-readable English label.
# Both dicts are plain module-level names; the former `global` statement was a
# no-op at module scope and has been dropped.

# Property label -> P-id. Several labels may map to the same property
# (e.g. 'cast' and 'cast member' are both P161).
P_values = {
    # properties of films
    'director': 'P57',
    'cast': 'P161',
    'producer': 'P162',   # was listed twice in the literal; one entry is enough
    'genre': 'P136',
    'character': 'P674',
    'screenwriter': 'P58',
    'filming location': 'P915',
    'IMDB Id': 'P345',
    'image': 'P18',
    'publication date': 'P577',
    'MPA film rating': 'P1657',
    'logo image': 'P154',
    'country of origin': 'P495',
    'cast member': 'P161',
    'film editor': 'P1040',
    'production designer': 'P2554',
    'costume designer': 'P2515',
    'composer': 'P86',
    'distributed by': 'P750',
    'production company': 'P272',
    'box office': 'P2142',
    'review score': 'P444',
    'nominated for': 'P1411',

    # properties of people
    'sex or gender': 'P21',
    'country of citizenship': 'P27',
    'name in native language': 'P1559',
    'birth name': 'P1477',
    'date of birth': 'P569',
    'place of birth': 'P19',
    'father': 'P22',
    'mother': 'P25',
    'sibling': 'P3373',
    'spouse': 'P26',
    'child': 'P40',
    'occupation': 'P106',
}

# Class label -> Q-id.
Q_values = {
    'fictional human': 'Q15632617',
    'film': 'Q11424',
    'human': 'Q5',
#     'Wikidata property for items about films':'Q22965162',
#     'Wikidata property related to creative works' : 'Q18618644',
    'Wikidata property related to movies and television shows': 'Q107395292',
    'Wikidata property for items about people': 'Q18608871',
}

### 1.3 External Resource Statistics

In [65]:

# Set of non-empty lines from the IMDb top-250 file.
# The context manager closes the handle, which the bare open().read() did not.
# NOTE(review): this path starts with '../dataset' while the graph is loaded
# from './dataset' — confirm which one is right.
with open('../dataset/imdb-top-250.t') as top250_file:
    top250 = set(top250_file.read().split('\n')) - {''}


### 1.4 Literal Statistics

In [66]:
# Maps root class URIs in the WD namespace to a short category name.
roots = {
    WD[qid]: category
    for qid, category in (
        ('Q8242', 'literature'),
        ('Q5', 'human'),
        ('Q483394', 'genre'),
        ('Q95074', 'character'),
        ('Q11424', 'film'),
        ('Q15416', 'tv'),
        ('Q618779', 'award'),
        ('Q27096213', 'geographic'),
        ('Q43229', 'organisation'),
        ('Q34770', 'language'),
        ('Q7725310', 'series'),
        ('Q47461344', 'written work'),
    )
}

## 3. SPARQL query examples

P57 - director of film  <br>
P31 - instance of <br> 
P279 - subclass of <br>

Q11424 - film <br>
animated feature film (Q29168811) <br>
anime film (Q20650540) <br>


In [67]:
def find_entity_given_label(entity_label, entity_type="none"):
    """Return the first graph node whose English ``rdfs:label`` equals ``entity_label``.

    Parameters
    ----------
    entity_label : object
        Label to look up; converted with ``str()`` and matched as an ``@en`` literal.
    entity_type : str, optional
        Q-id (e.g. ``'Q11424'``) the node must be an instance of, directly or
        through ``wdt:P279`` subclasses. The default ``"none"`` disables the
        type restriction.

    Returns
    -------
    The first result term of the query, or ``-1`` when nothing matches.
    """
    # Literal.n3() emits the quoted, language-tagged literal with quotes and
    # backslashes escaped, so labels containing '"' no longer corrupt the query.
    label_term = Literal(str(entity_label), lang="en").n3()

    if entity_type == "none":
        type_pattern = ""
    else:
        type_pattern = "?entity wdt:P31/wdt:P279* wd:{} .".format(entity_type)

    query_content = """PREFIX ddis: <http://ddis.ch/atai/>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        PREFIX schema: <http://schema.org/>

        SELECT ?entity WHERE {{
            ?entity rdfs:label {} .
            {}
        }} """.format(label_term, type_pattern)

    res = list(graph.query(query_content))
    if len(res) > 0:
        return res[0][0]
    return -1

# Smoke checks: an untyped lookup of a property label, two lookups restricted
# to films (Q11424), and another untyped property-label lookup.
print(find_entity_given_label("MPAA film rating") )


print(find_entity_given_label("Forrest Gump",'Q11424') )
print(find_entity_given_label("Weathering with You",'Q11424') )
print(find_entity_given_label("director"))



http://www.wikidata.org/entity/P1657
http://www.wikidata.org/entity/Q134773
http://www.wikidata.org/entity/Q59692464
http://www.wikidata.org/prop/direct/P57


In [68]:
def query_something_about_movie(p_val, label):
    """Query one property of the film(s) carrying the given English label.

    Parameters
    ----------
    p_val : str
        Property id without prefix (e.g. ``'P57'``); used as ``wdt:<p_val>``.
    label : object
        Film label; converted with ``str()`` and matched as an ``@en`` literal.

    Returns
    -------
    list
        All result rows of the query (possibly empty).

    The generated query is printed before it is executed.
    """
    # Literal.n3() yields the quoted "..."@en form with special characters
    # escaped, so titles that contain a double quote keep the query valid.
    label_term = Literal(str(label), lang="en").n3()

    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer WHERE {{
     ?movie rdfs:label {} .
     ?movie wdt:P31/wdt:P279* wd:Q11424 .
     ?movie wdt:{} ?answer
    }} """.format(label_term, p_val)

    print(query_content)
    return list(graph.query(query_content))
      
# Example: look up the director property for the film labelled 'Forrest Gump'
# and print every result row.
a = query_something_about_movie(P_values['director'], 'Forrest Gump' )    
  
for i in a:
    print(i)
    

PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer WHERE {
     ?movie rdfs:label "Forrest Gump"@en .
     ?movie wdt:P31/wdt:P279* wd:Q11424 .
     ?movie wdt:P57 ?answer
    } 
(rdflib.term.URIRef('http://www.wikidata.org/entity/Q187364'),)


In [69]:
def get_label_of_Qval(q_val):
    """Fetch every ``rdfs:label`` attached to the node with URI ``q_val``.

    ``q_val`` is placed between angle brackets in the query, so it must be a
    full URI. The query text is printed, then all result rows are returned
    as a list.
    """
    sparql_template = """PREFIX ddis: <http://ddis.ch/atai/> 
                        PREFIX wd: <http://www.wikidata.org/entity/> 
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                        PREFIX schema: <http://schema.org/> 

                        SELECT ?label WHERE {{
                         <{}> rdfs:label ?label .
                         
                        }} """
    sparql_text = sparql_template.format(q_val)

    print(sparql_text)
    label_rows = graph.query(sparql_text)
    return list(label_rows)

# Example: print the label rows stored for entity Q187364.
a = get_label_of_Qval('http://www.wikidata.org/entity/Q187364')
  
for i in a:
    print(i)
    
    

PREFIX ddis: <http://ddis.ch/atai/> 
                        PREFIX wd: <http://www.wikidata.org/entity/> 
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                        PREFIX schema: <http://schema.org/> 

                        SELECT ?label WHERE {
                         <http://www.wikidata.org/entity/Q187364> rdfs:label ?label .
                         
                        } 
(rdflib.term.Literal('Robert Zemeckis', lang='en'),)


In [70]:
def find_something_about_an_entity(entity_URI, relation_URI):
    """Return the first object of the triple ``<entity_URI> <relation_URI> ?res``.

    Parameters
    ----------
    entity_URI, relation_URI
        Full URIs (strings or URI terms). Either may be the ``-1`` "not found"
        sentinel that the lookup helpers in this notebook return.

    Returns
    -------
    The first matching object term, or ``-1`` when an input is the sentinel or
    the graph holds no such triple. Only the first result is returned even if
    several objects exist.
    """
    # A failed upstream lookup arrives as -1; answer "not found" directly
    # instead of querying for the meaningless IRI <-1>.
    if entity_URI == -1 or relation_URI == -1:
        return -1

    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?res WHERE {{
        <{}> <{}> ?res
        
    }} """.format(entity_URI, relation_URI)

    res = list(graph.query(query_content))
    if len(res) > 0:
        return res[0][0]
    return -1
# Example: object of the P57 (director) property for entity Q134773.
a = find_something_about_an_entity('http://www.wikidata.org/entity/Q134773','http://www.wikidata.org/prop/direct/P57') 

# The helper returns a single term (or -1), not a list of rows, so it is printed directly.
# for elements in a[0]:
#     print(elements)
print(a)

http://www.wikidata.org/entity/Q187364


In [74]:
def write_list_to_file(list_name, file_name):
    """Write each item of ``list_name`` to ``file_name``, one per line (UTF-8).

    Items are formatted with an f-string, so non-string items are written via
    their string form. An existing file is overwritten.
    """
    with open(file_name, 'w', encoding="utf-8") as out_file:
        out_file.writelines(f'{entry}\n' for entry in list_name)
        
def read_list_from_file(file_name):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Only the line terminator is removed from each line. The previous
    ``line[:-1]`` slice also removed the last real character of a final line
    that had no trailing newline. Blank lines are kept as empty strings.
    """
    with open(file_name, 'r', encoding="utf-8") as filehandle:
        return [line.rstrip('\n') for line in filehandle]

def save_file_with_all_movies(write_file_pathname):
    """Write the label of every film in the graph to ``write_file_pathname``.

    A film is any node that is an instance of Q11424, directly or through
    ``wdt:P279`` subclasses. Labels are written one per line via
    ``write_list_to_file``.
    """
    # The doubled braces are sent to the engine as written (no .format call here).
    movie_label_query = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?label WHERE {{
        ?movie rdfs:label ?label .
        ?movie wdt:P31/wdt:P279* wd:Q11424 .
        
    }}
    """

    label_rows = list(graph.query(movie_label_query))
    movie_labels = [str(row[0]) for row in label_rows]
    write_list_to_file(movie_labels, write_file_pathname)
    
# Dump all film labels to disk, then load them back as `movies_list`.
# NOTE(review): the dump is regenerated on every run and needs the
# save_files/ directory to exist already.
save_file_with_all_movies("save_files/all_movies_list.txt")

# The empty-list assignment is immediately overwritten by the read below.
movies_list = []
movies_list = read_list_from_file("save_files/all_movies_list.txt")
#24384
print(len(movies_list))
print(movies_list[:5])

27816
['Jan Dara', 'Moondram Pirai', "Buffalo Bill and the Indians, or Sitting Bull's History Lesson", 'What We Wanted', 'Wanted: Dead or Alive']


In [75]:
def save_file_with_all_humans(write_file_pathname):
    """Write the label of every human in the graph to ``write_file_pathname``.

    A human is any node that is an instance of Q5, directly or through
    ``wdt:P279`` subclasses. Labels are written one per line via
    ``write_list_to_file``.
    """
    # The doubled braces are sent to the engine as written (no .format call here).
    human_label_query = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?label WHERE {{
        ?person rdfs:label ?label .
        ?person wdt:P31/wdt:P279* wd:Q5 .
        
    }}
    """

    label_rows = list(graph.query(human_label_query))
    human_labels = [str(row[0]) for row in label_rows]
    write_list_to_file(human_labels, write_file_pathname)
    
# Dump all human labels to disk, then load them back as `humans_list`.
save_file_with_all_humans("save_files/all_humans_list.txt")
# The empty-list assignment is immediately overwritten by the read below.
humans_list = []
humans_list = read_list_from_file("save_files/all_humans_list.txt")
#100157
print(len(humans_list))
print(humans_list[:5])


100157
['Viktor Krištof', 'Yuji Nomi', 'Béatrice Thiriet', 'Oleg Kapanets', 'Ram Lee']


In [182]:
# Q107395292: our KG does not contain these entities, but the live Wikidata does;
# all_properties_list.json is the result of running this same query against Wikidata.
def save_file_with_all_movies_and_tv_shows_properties(write_file_pathname):
    """Write English labels of up to 20 instances of Q107395292 to a file.

    The raw result rows are printed first; the labels (first column) are then
    written one per line via ``write_list_to_file``.
    """
    property_label_query = """PREFIX ddis: <http://ddis.ch/atai/> 
        PREFIX wd: <http://www.wikidata.org/entity/> 
        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
        PREFIX schema: <http://schema.org/> 

        SELECT ?label ?entity WHERE {{
            ?entity rdfs:label ?label .
            ?entity wdt:P31/wdt:P279* wd:Q107395292 .
        
        filter (lang(?label) = "en")

        }}
        LIMIT 20
        """

    result_rows = list(graph.query(property_label_query))
    print(result_rows)
    property_labels = [str(row[0]) for row in result_rows]
    write_list_to_file(property_labels, write_file_pathname)
        
# save_file_with_all_movies_and_tv_shows_properties("save_files/all_properties_list.txt")
# properties_list = []
# properties_list = read_list_from_file("save_files/all_properties_list.txt")
# #100157
# print(len(properties_list))
# print(properties_list[:5])

501

In [98]:
def string_similarity_score(a, b):
    """Return the ``SequenceMatcher`` similarity ratio of ``a`` and ``b`` (0.0 to 1.0)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def subtract_strings(input_str, substring):
    """Return ``input_str`` with every occurrence of ``substring`` removed.

    The text is split on ``substring`` and the pieces are glued back together,
    so an empty ``substring`` raises ``ValueError`` (from ``str.split``).
    """
    return "".join(input_str.split(substring))

def find_closest_match_in_a_List(word, target_list):
    """Find the item of ``target_list`` closest to ``word``, ignoring case.

    Matching uses ``difflib.get_close_matches`` on lower-cased strings with a
    cutoff of 0.6.

    Returns
    -------
    dict or int
        ``{'res': item, 'res_ind': index, 'score': ratio}`` for the best match,
        where ``item`` keeps its original casing, ``index`` is its first
        position in ``target_list`` and ``score`` is the case-sensitive
        ``SequenceMatcher`` ratio between ``word`` and ``item``.
        ``-1`` when nothing reaches the cutoff.
    """
    lowered_items = [item.lower() for item in target_list]
    close = difflib.get_close_matches(word.lower(), lowered_items, n=1, cutoff=0.6)
    if not close:
        return -1

    # Take the first position of the winning lower-cased string and stop there.
    # The former scan rebound its comparison value to the matched string, so
    # later items were compared against that string's first character and a
    # one-character entry could silently replace the real match.
    res_ind = lowered_items.index(close[0])
    res = target_list[res_ind]

    # Score is deliberately computed on the original casing, as callers compare
    # these values against each other.
    score = difflib.SequenceMatcher(None, word, res).ratio()
    return {'res': res, 'res_ind': res_ind, 'score': score}
# Examples: a misspelt film title against the film list, then a name written
# without its accent against the human list and against the film list.
print(find_closest_match_in_a_List('BuffaloBill and the Indians', movies_list))
print(find_closest_match_in_a_List('Beatrice Thiriet', humans_list))
print(find_closest_match_in_a_List('Beatrice Thiriet', movies_list))



# print(subtract_strings(questions_df[1]['query'],questions_df[1]['ner'][0]['word']))


{'res': "Buffalo Bill and the Indians, or Sitting Bull's History Lesson", 'res_ind': 2, 'score': 0.6067415730337079}
{'res': 'Béatrice Thiriet', 'res_ind': 2, 'score': 0.9375}
{'res': 'Triple Threat', 'res_ind': 10650, 'score': 0.6206896551724138}


In [181]:
import json

# Load the externally exported property list (records with 'label' and
# 'entity' keys). The context manager closes the file, which the bare open()
# never did.
with open('all_movie_properties_list.json', encoding='utf-8') as properties_file:
    all_movie_properties_list = json.load(properties_file)

# Extend the label -> P-id table: strip the entity namespace from each URI so
# only the bare id (e.g. 'P345') is stored.
for item in all_movie_properties_list:
    P_values[item['label']] = subtract_strings(item['entity'],'http://www.wikidata.org/entity/' )

'P345'

In [185]:
# Number of property labels currently known.
print(len(P_values))

{'director': 'P57', 'cast': 'P161', 'producer': 'P162', 'genre': 'P136', 'character': 'P674', 'screenwriter': 'P58', 'filming location': 'P915', 'IMDB Id': 'P345', 'image': 'P18', 'publication date': 'P577', 'MPA film rating': 'P1657', 'logo image': 'P154', 'country of origin': 'P495', 'cast member': 'P161', 'film editor': 'P1040', 'production designer': 'P2554', 'costume designer': 'P2515', 'composer': 'P86', 'distributed by': 'P750', 'production company': 'P272', 'box office': 'P2142', 'review score': 'P444', 'nominated for': 'P1411', 'sex or gender': 'P21', 'country of citizenship': 'P27', 'name in native language': 'P1559', 'birth name': 'P1477', 'date of birth': 'P569', 'place of birth': 'P19', 'father': 'P22', 'mother': 'P25', 'sibling': 'P3373', 'spouse': 'P26', 'child': 'P40', 'occupation': 'P106', 'test_key': 'test_val'}


In [166]:
# Example: resolve a relation phrase to its direct-property URI.
relation_list = list(P_values.keys())
# Returns -1 when nothing is close enough; the subscript on the next line
# assumes a match was found.
final_relation_res = find_closest_match_in_a_List('MPA film rating', relation_list)
p_val = P_values[final_relation_res['res']]
rdflib.term.URIRef(WDT[p_val])

rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P1657')

In [100]:
# Example: look up 'Forrest Gump' restricted to the film class.
find_entity_given_label('Forrest Gump', Q_values['film'])

rdflib.term.URIRef('http://www.wikidata.org/entity/Q134773')

# Processing question


In [118]:
# Sample user questions used to exercise the pipeline: factual lookups,
# image requests and recommendation requests.
sample_questions = ["Who is the director of Good Will Hunting?", "Who directed The Bridge on the River Kwai?", 
                    "Who is the director of Star Wars: Episode VI - Return of the Jedi?", "Who is the screenwriter of The Masked Gang: Cyprus?",
                    "What is the MPAA film rating of Weathering with You?", "What is the genre of Good Neighbors?", "Show me a picture of Halle Berry.",
                    "What does Julia Roberts look like?", "Let me know what Sandra Bullock looks like.", "Recommend movies similar to Hamlet and Othello.",
                    "Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?",
                    "Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.",
                    "Can you tell me the publication date of Tom Meets Zizou?", "Who is the executive producer of X-Men: First Class?",
                    "Who is the director of Batman 1989?", "What is the box office of The Princess and the Frog?",
                   "What is the birthplace of Christopher Nolan?"]

In [119]:
# One record per question. Despite the `_df` suffix this is a plain list of
# dicts, not a DataFrame. 'type' and 'entity' are filled in by later cells.
questions_df = [{"query": s, "type" : "", "entity":[]}for s in sample_questions]
questions_df

[{'query': 'Who is the director of Good Will Hunting?',
  'type': '',
  'entity': []},
 {'query': 'Who directed The Bridge on the River Kwai?',
  'type': '',
  'entity': []},
 {'query': 'Who is the director of Star Wars: Episode VI - Return of the Jedi?',
  'type': '',
  'entity': []},
 {'query': 'Who is the screenwriter of The Masked Gang: Cyprus?',
  'type': '',
  'entity': []},
 {'query': 'What is the MPAA film rating of Weathering with You?',
  'type': '',
  'entity': []},
 {'query': 'What is the genre of Good Neighbors?', 'type': '', 'entity': []},
 {'query': 'Show me a picture of Halle Berry.', 'type': '', 'entity': []},
 {'query': 'What does Julia Roberts look like?', 'type': '', 'entity': []},
 {'query': 'Let me know what Sandra Bullock looks like.',
  'type': '',
  'entity': []},
 {'query': 'Recommend movies similar to Hamlet and Othello.',
  'type': '',
  'entity': []},
 {'query': 'Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend so

## Pattern Matching

## Named Entity Recognition


In [120]:
# These names are already imported in the notebook's first cell.
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# NOTE(review): the checkpoint name suggests a CamemBERT (French) NER model,
# while the questions are in English — confirm this is the intended model.
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")

# aggregation_strategy="simple" is what yields the grouped 'entity_group' /
# 'word' records seen in the outputs below.
ner_pipeline = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")


In [121]:
# Example: NER output for the box-office question.
ner_pipeline(sample_questions[15])

[{'entity_group': 'MISC',
  'score': 0.99671656,
  'word': 'The Princess and the Frog',
  'start': 25,
  'end': 51}]

In [122]:
# One {'label': 'MISC', 'word': <title>} record per film title. The
# comprehension avoids leaving the loop temporaries behind as notebook globals.
fine_tune_data = [{"label": 'MISC', 'word': movie_title} for movie_title in movies_list]

# Peek at one record (requires more than 100 film titles).
fine_tune_data[100]

{'label': 'MISC', 'word': 'The Man Who Copied'}

In [123]:
# Tokenizer and model for part-of-speech tagging.
tokenizer_POS = AutoTokenizer.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
# Bind only `model_POS`. The previous chained assignment also rebound the
# global `model`, silently replacing the NER model loaded earlier.
model_POS = AutoModelForTokenClassification.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")

In [124]:


# POS-tagging pipeline built on the POS checkpoint loaded above.
pos_tagging_pipe = pipeline(
    "token-classification", model=model_POS, aggregation_strategy="simple", tokenizer = tokenizer_POS
)

# Run POS tagging and NER on every question, dropping NER hits with a score
# below 0.55.
for i in range (len(questions_df)):
    questions_df[i]["pos"] = pos_tagging_pipe(questions_df[i]["query"])
    ner_results = ner_pipeline(questions_df[i]["query"])
    # Filter into a new list. Deleting from the list while enumerating it
    # skipped the element right after each deletion, so consecutive
    # low-confidence hits could survive.
    confident_results = [ner_res for ner_res in ner_results if ner_res['score'] >= 0.55]
    for _ in range(len(ner_results) - len(confident_results)):
        print('deleted')
    questions_df[i]["ner"] = confident_results
questions_df[4]

deleted


{'query': 'What is the MPAA film rating of Weathering with You?',
 'type': '',
 'entity': [],
 'pos': [{'entity_group': 'PRON',
   'score': 0.9991217,
   'word': 'what',
   'start': 0,
   'end': 4},
  {'entity_group': 'AUX',
   'score': 0.99638736,
   'word': 'is',
   'start': 5,
   'end': 7},
  {'entity_group': 'DET',
   'score': 0.99949145,
   'word': 'the',
   'start': 8,
   'end': 11},
  {'entity_group': 'PROPN',
   'score': 0.9596777,
   'word': 'mpaa',
   'start': 12,
   'end': 16},
  {'entity_group': 'NOUN',
   'score': 0.9975196,
   'word': 'film rating',
   'start': 17,
   'end': 28},
  {'entity_group': 'ADP',
   'score': 0.93028975,
   'word': 'of',
   'start': 29,
   'end': 31},
  {'entity_group': 'VERB',
   'score': 0.958319,
   'word': 'weathering',
   'start': 32,
   'end': 42},
  {'entity_group': 'ADP',
   'score': 0.9970561,
   'word': 'with',
   'start': 43,
   'end': 47},
  {'entity_group': 'PRON',
   'score': 0.99925786,
   'word': 'you',
   'start': 48,
   'end': 51

In [125]:
# Show the NER hits kept for each question (here `i` is a question record, not an index).
for i in questions_df:
    print(i['ner'])

[{'entity_group': 'MISC', 'score': 0.9957466, 'word': 'Good Will Hunting', 'start': 22, 'end': 40}]
[{'entity_group': 'LOC', 'score': 0.8970907, 'word': 'The Bridge on the River Kwai', 'start': 12, 'end': 41}]
[{'entity_group': 'MISC', 'score': 0.99650544, 'word': 'Star Wars: Episode VI - Return of the Jedi', 'start': 22, 'end': 65}]
[{'entity_group': 'MISC', 'score': 0.9925139, 'word': 'The Masked Gang: Cyprus', 'start': 26, 'end': 50}]
[{'entity_group': 'ORG', 'score': 0.9493264, 'word': 'MPAA', 'start': 11, 'end': 16}, {'entity_group': 'MISC', 'score': 0.96832675, 'word': 'Weathering with You?', 'start': 31, 'end': 52}]
[{'entity_group': 'MISC', 'score': 0.9957307, 'word': 'Good Neighbors', 'start': 20, 'end': 35}]
[{'entity_group': 'PER', 'score': 0.9893521, 'word': 'Halle Berry', 'start': 20, 'end': 32}]
[{'entity_group': 'PER', 'score': 0.99877614, 'word': 'Julia Roberts', 'start': 9, 'end': 23}]
[{'entity_group': 'PER', 'score': 0.9949758, 'word': 'Sandra Bullock', 'start': 16, 

In [126]:
# Only the last expression of a cell is displayed, so the result of the first
# call is computed and discarded.
find_closest_match_in_a_List('Weathering with You?', movies_list)
find_closest_match_in_a_List('MPAA', movies_list)

{'res': 'Paa', 'res_ind': 20448, 'score': 0.2857142857142857}

In [127]:
# Find the type of question by keyword matching.
# A classifier could be used for this instead.
def find_type(formulated_question_df):
    """Classify a question as ``'images'``, ``'recommendation'`` or ``'search'``.

    Parameters
    ----------
    formulated_question_df : dict
        Question record whose ``'pos'`` entry is a list of dicts, each with a
        ``'word'`` string.

    Returns
    -------
    str
        ``'images'`` if any image keyword occurs, otherwise ``'recommendation'``
        if any recommendation keyword occurs, otherwise ``'search'``.
    """
    keywords_images = ('image', 'picture', 'look', 'looks')
    keywords_recommendation = ('similar', 'recommend', 'recommendations')

    # An aggregated POS group can span several words (e.g. 'film rating'), so
    # compare against the individual lower-cased words as well as the raw
    # group text. Exact-group matching alone missed keywords inside a group.
    query_words = set()
    for pos_item in formulated_question_df['pos']:
        group_text = pos_item['word']
        query_words.add(group_text)
        query_words.update(group_text.lower().split())

    if any(keyword in query_words for keyword in keywords_images):
        return "images"
    if any(keyword in query_words for keyword in keywords_recommendation):
        return "recommendation"
    return "search"

# Add the type for all questions
# Store the detected type on every question record (requires 'pos' to be filled in already).
for i in range (len(questions_df)):
    questions_df[i]['type'] = find_type(questions_df[i])
    
#     print(questions_df[i]['query']) 
#     print(questions_df[i]['type']) 
#     print("______")

In [128]:
# Inspect one record after type detection.
questions_df[4]

{'query': 'What is the MPAA film rating of Weathering with You?',
 'type': 'search',
 'entity': [],
 'pos': [{'entity_group': 'PRON',
   'score': 0.9991217,
   'word': 'what',
   'start': 0,
   'end': 4},
  {'entity_group': 'AUX',
   'score': 0.99638736,
   'word': 'is',
   'start': 5,
   'end': 7},
  {'entity_group': 'DET',
   'score': 0.99949145,
   'word': 'the',
   'start': 8,
   'end': 11},
  {'entity_group': 'PROPN',
   'score': 0.9596777,
   'word': 'mpaa',
   'start': 12,
   'end': 16},
  {'entity_group': 'NOUN',
   'score': 0.9975196,
   'word': 'film rating',
   'start': 17,
   'end': 28},
  {'entity_group': 'ADP',
   'score': 0.93028975,
   'word': 'of',
   'start': 29,
   'end': 31},
  {'entity_group': 'VERB',
   'score': 0.958319,
   'word': 'weathering',
   'start': 32,
   'end': 42},
  {'entity_group': 'ADP',
   'score': 0.9970561,
   'word': 'with',
   'start': 43,
   'end': 47},
  {'entity_group': 'PRON',
   'score': 0.99925786,
   'word': 'you',
   'start': 48,
   'en

In [129]:
# Decide which NER hits of a question are real entities (films or humans).
def get_entities_from_nlp_results(formulated_question_df):
    """Fill ``formulated_question_df['entity']`` from its NER results.

    Each NER hit tagged MISC, LOC, ORG or PER is fuzzy-matched against the
    film list, the human list and the known property labels, in that order.
    The hit is kept as an entity when the strictly best-scoring match comes
    from the films or humans and scores above 0.5. A hit that matches a
    property label best is treated as a relation phrase and skipped.

    The entity list is rebuilt on every call, so calling this twice on the
    same record no longer duplicates entries.

    Returns
    -------
    dict or int
        The same (mutated) record, or ``-1`` when no entity was found. The
        record's ``'entity'`` list is set either way.
    """
    entity_groups = ('MISC', 'LOC', 'ORG', 'PER')
    found_entities = []

    for ner_res in formulated_question_df['ner']:
        if ner_res['entity_group'] not in entity_groups:
            continue

        word = str(ner_res['word'])
        best_score = 0
        is_entity = False
        # (candidate list, whether a win from this list counts as an entity)
        sources = (
            (movies_list, True),
            (humans_list, True),
            (list(P_values.keys()), False),
        )
        for candidates, counts_as_entity in sources:
            match = find_closest_match_in_a_List(word, candidates)
            # Strict comparison: an earlier source wins ties.
            if match != -1 and best_score < match['score']:
                best_score = match['score']
                is_entity = counts_as_entity

        if is_entity and best_score > 0.5:
            found_entities.append(ner_res['word'])

    formulated_question_df['entity'] = found_entities
    if len(found_entities) == 0:
        return -1
    return formulated_question_df

# Extract entities for every question. The helper mutates the record in place
# and returns -1 when nothing is found, so print from the record itself
# rather than subscripting the return value (which would fail on -1).
for test_q_df in questions_df:
    get_entities_from_nlp_results(test_q_df)
    print(test_q_df['entity'])

['Good Will Hunting']
['The Bridge on the River Kwai']
['Star Wars: Episode VI - Return of the Jedi']
['The Masked Gang: Cyprus']
['Weathering with You?']
['Good Neighbors']
['Halle Berry']
['Julia Roberts']
['Sandra Bullock']
['Hamlet', 'Othello']
['The Lion King', 'Pocahontas', 'The Beauty and the Beast']
['Nightmare on Elm Street', 'Friday the 13th', 'Halloween']
['Tom Meets Zizou']
['X-Men', ': First Class']
['Batman']
['The Princess and the Frog']
['Christopher Nolan']


In [140]:
# The full title matches exactly, although NER split it into two pieces above.
res = find_closest_match_in_a_List('X-Men: First Class', movies_list)
res

{'res': 'X-Men: First Class', 'res_ind': 24226, 'score': 1.0}

In [144]:
def get_predicate_from_nlp(formulated_question):
    """Guess which known property label a question is asking about.

    The detected entity strings are removed from the query text, the rest is
    POS-tagged, and every group whose tag is not punctuation, adposition,
    adjective, determiner, auxiliary or pronoun is fuzzy-matched against the
    keys of ``P_values``.

    Returns the matched key for the first group that matches anything, or
    ``-1`` when none does. The entity-stripped text is printed as a side effect.
    """
    remaining_text = str(formulated_question['query'])
    for entity_text in formulated_question['entity']:
        remaining_text = subtract_strings(remaining_text, str(entity_text))

    print(remaining_text)

    # Tags that cannot carry the relation.
    ignored_tags = ['PUNCT', 'ADP', 'ADJ', 'DET', 'AUX', 'PRON']
    candidate_words = [
        tagged['word']
        for tagged in pos_tagging_pipe(remaining_text)
        if tagged['entity_group'] not in ignored_tags
    ]

    matches = []
    for candidate in candidate_words:
        match = find_closest_match_in_a_List(candidate, list(P_values.keys()))
        if match != -1:
            matches.append(match)

    # First matching group wins, in question order.
    return matches[0]['res'] if matches else -1
        

# Print the guessed predicate for every factual ('search') question.
for i in range (len(questions_df)):
    if questions_df[i]['type'] == 'search':
        res = get_predicate_from_nlp(questions_df[i])
        print(res)

Who is the director of ?
director
Who directed ?
director
Who is the director of ?
director
Who is the screenwriter of ?
screenwriter
What is the MPAA film rating of 
MPA film rating
What is the genre of ?
genre
Can you tell me the publication date of ?
publication date
Who is the executive producer of ?
producer
Who is the director of  1989?
director
What is the box office of ?
box office
What is the birthplace of ?
birth name


In [151]:
# Probe: with the words reordered, the phrase matches 'place of birth'.
find_closest_match_in_a_List('placebirth', list(P_values.keys()))

{'res': 'place of birth', 'res_ind': 28, 'score': 0.8333333333333334}

Who directed ?


In [170]:
def deal_with_KG_query(entity, relation):
    """Answer "<relation> of <entity>" from the knowledge graph.

    Parameters
    ----------
    entity : str
        Free-text entity name; fuzzy-matched against the film and human lists.
    relation : str
        Free-text relation phrase; fuzzy-matched against the keys of ``P_values``.

    Returns
    -------
    The first object found for the resolved (entity, relation) pair, or ``-1``
    when the entity, the relation or the triple cannot be resolved.
    """
    # --- Resolve the entity ---
    movie_res = find_closest_match_in_a_List(entity, movies_list)
    human_res = find_closest_match_in_a_List(entity, humans_list)
    match_list = [match['res'] for match in (movie_res, human_res) if match != -1]

    # Choose between the best film and the best human candidate.
    final_entity_res = find_closest_match_in_a_List(entity, match_list)

    entity_type = 'none'
    if final_entity_res == -1:
        # No fuzzy match: fall back to an untyped lookup of the raw text.
        entity_label = entity
    else:
        entity_label = final_entity_res['res']
        if human_res != -1 and entity_label == human_res['res']:
            entity_type = 'human'
        elif movie_res != -1 and entity_label == movie_res['res']:
            entity_type = 'film'

    # Q_values has no 'none' key. .get() maps the untyped case onto the
    # lookup helper's own "none" default instead of raising KeyError.
    entity_URI = find_entity_given_label(entity_label, Q_values.get(entity_type, 'none'))
    if entity_URI == -1:
        return -1

    # --- Resolve the relation ---
    final_relation_res = find_closest_match_in_a_List(relation, list(P_values.keys()))
    if final_relation_res != -1:
        p_val = P_values[final_relation_res['res']]
        relation_URI = rdflib.term.URIRef(WDT[p_val])
    else:
        # Unknown phrase: try it as a label present in the graph itself.
        relation_URI = find_entity_given_label(relation)
    if relation_URI == -1:
        return -1

    return find_something_about_an_entity(entity_URI, relation_URI)
    
    
    
 
# res = deal_with_KG_query('Forest Gump','directed')
# print(res)

# res = deal_with_KG_query('Tom Meets Zizou','tell publication date')
# print(res)

# End-to-end examples. Each prints the answer term, or -1 when unresolved.
res = deal_with_KG_query('Good Neighbors','genre')
print(res)

res = deal_with_KG_query('Weathering with You','MPA film rating')
print(res)

# The relation is passed with a trailing space here.
res = deal_with_KG_query('The Masked Gang: Cyprus','screenwriter ')
print(res)



http://www.wikidata.org/entity/Q1135802
http://www.wikidata.org/entity/Q18665349
-1


In [34]:
# main function to answer image questions
def handle_image_questions(formulated_question_df):
    """Return the image (P18) value for the first entity of an image question.

    Parameters
    ----------
    formulated_question_df : dict
        Question record with ``'type'`` and ``'ner'`` filled in; its
        ``'entity'`` list is (re)computed here.

    Returns
    -------
    The first P18 object of the resolved entity, or ``-1`` when the question
    is not of type ``'images'``, no entity is detected, or nothing is found.
    """
    if formulated_question_df['type'] != 'images':
        return -1

    # The helper returns the record itself (or -1), not a list of names, so
    # read the first name from the record's 'entity' list.
    # Multiple entities could be handled here, but it is not necessary.
    if get_entities_from_nlp_results(formulated_question_df) == -1:
        return -1
    name = formulated_question_df['entity'][0]

    movie_res = find_closest_match_in_a_List(name, movies_list)
    human_res = find_closest_match_in_a_List(name, humans_list)
    match_list = [match['res'] for match in (movie_res, human_res) if match != -1]

    final_entity_res = find_closest_match_in_a_List(name, match_list)

    entity_type = 'none'
    if final_entity_res == -1:
        # Fall back to the raw name (the old code referenced an undefined `entity`).
        entity_label = name
    else:
        entity_label = final_entity_res['res']
        # Guard the -1 sentinel before subscripting either candidate.
        if human_res != -1 and entity_label == human_res['res']:
            entity_type = 'human'
        elif movie_res != -1 and entity_label == movie_res['res']:
            entity_type = 'film'

    # Q_values has no 'none' key. .get() maps the untyped case onto the
    # lookup helper's own "none" default instead of raising KeyError.
    entity_URI = find_entity_given_label(entity_label, Q_values.get(entity_type, 'none'))
    if entity_URI == -1:
        return -1

    # Return Image
    return find_something_about_an_entity(entity_URI, WDT['P18'])
    
    
# Run the image handler on every question; anything that is not an image question prints -1.
for q in questions_df:
    res = handle_image_questions(q)
    
    print(res)


-1
-1
-1
-1
-1
-1
https://commons.wikimedia.org/wiki/File:Halle_Berry_by_Gage_Skidmore_2.jpg
https://commons.wikimedia.org/wiki/File:Julia_Roberts_(43838880775).jpg
https://commons.wikimedia.org/wiki/File:Sandra_Bullock,_The_Heat,_London,_2013_(crop).jpg
-1
-1
-1
-1
-1
-1
-1
-1
