# My project


In [1]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import json
import networkx as nx
import pandas as pd
import rdflib
from collections import defaultdict, Counter
import locale
_ = locale.setlocale(locale.LC_ALL, '')
from _plotly_future_ import v4_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import os
import numpy as np
import random
import re
import operator

#NER
from transformers import pipeline, set_seed
from huggingface_hub import notebook_login
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import create_optimizer
from transformers import TFAutoModelForTokenClassification
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback

import editdistance
import difflib
from difflib import SequenceMatcher


# Embeddings
from sklearn.metrics import pairwise_distances

# Crowd Data
from pandasql import sqldf

## 1. Dataset


### 1.1 Load the data

In [2]:
graph = rdflib.Graph()
graph.parse('./dataset/14_graph.nt', format='turtle')

<Graph identifier=Nac8fd2977d8d4f05b50b59a36d785c62 (<class 'rdflib.graph.Graph'>)>

### 1.2 Graph Statistics

In [3]:
# prefixes used in the graph
WD = Namespace('http://www.wikidata.org/entity/')
WDT = Namespace('http://www.wikidata.org/prop/direct/')
SCHEMA = Namespace('http://schema.org/')
DDIS = Namespace('http://ddis.ch/atai/')

IMDB = Namespace('https://www.imdb.com/name/')

In [4]:
# pre-saved P values for Wikidata movies graph
global P_values, Q_values
P_values = {
    'director':'P57',
    'cast':'P161',
    'producer':'P162',
    'genre':'P136',
    'character':'P674',
    'screenwriter':'P58',   
    'filming location':'P915',
    'IMDB Id':'P345',
    'image':'P18',
    'publication date': 'P577',
    'MPA film rating' : 'P1657',
    'logo image' : 'P154',
    'country of origin' : 'P495',
    'cast member': 'P161',
    'film editor' : 'P1040',
    'production designer': 'P2554',
    'costume designer' : 'P2515',
    'composer' : 'P86',
    'producer' : 'P162',
    'distributed by' : 'P750',
    'production company': 'P272',
    'box office' : 'P2142',
    'review score' : 'P444',
    'nominated for' : 'P1411',
    
    
    'sex or gender': 'P21',
    'country of citizenship' : 'P27',
    'name in native language':'P1559',
    'birth name' : 'P1477',
    'date of birth':'P569',
    'place of birth':'P19',
    'father':'P22',
    'mother':'P25',
    'sibling':'P3373',
    'spouse':'P26',
    'child':'P40',
    'occupation':'P106',
    
}

Q_values = {
    'fictional human':'Q15632617',
    'film':'Q11424',
    'human':'Q5',
#     'Wikidata property for items about films':'Q22965162',
#     'Wikidata property related to creative works' : 'Q18618644',
    'Wikidata property related to movies and television shows' : 'Q107395292',
    'Wikidata property for items about people' : 'Q18608871',
    
    
 
    
    
}

### 1.3 External Resource Statistics

In [5]:

top250 = set(open('../dataset/imdb-top-250.t').read().split('\n')) - {''}


### 1.4 Literal Statistics

In [6]:
roots = {
    WD['Q8242']:        'literature',
    WD['Q5']:           'human',
    WD['Q483394']:      'genre',
    WD['Q95074']:       'character',
    WD['Q11424']:       'film',
    WD['Q15416']:       'tv',
    WD['Q618779']:      'award',
    WD['Q27096213']:    'geographic',
    WD['Q43229']:       'organisation',
    WD['Q34770']:       'language',
    WD['Q7725310']:     'series',
    WD['Q47461344']:    'written work',
}

## 3. SPARQL query examples

P57 - director of film  <br>
P31 - instance of <br> 
subclass of (P279) <br

Q11424 - film <br>
animated feature film (Q29168811) <br>
anime film (Q20650540) <br>


In [7]:
def find_entity_given_label(entity_label, entity_type="none"):
    
    entity_label = "\"" + str(entity_label) + "\"@en"
    
    if entity_type == 'film':
        entity_type = 'Q11424'
    elif entity_type == 'human':
        entity_type = 'Q5'
    
  
    if entity_type == "none":
        query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
        PREFIX wd: <http://www.wikidata.org/entity/> 
        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
        PREFIX schema: <http://schema.org/> 

        SELECT ?entity WHERE {{
            ?entity rdfs:label {}
        }} """.format(entity_label)
    else:
        query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
        PREFIX wd: <http://www.wikidata.org/entity/> 
        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
        PREFIX schema: <http://schema.org/> 

        SELECT ?entity WHERE {{
            ?entity rdfs:label {} .
            ?entity wdt:P31/wdt:P279* wd:{} .
        }} """.format(entity_label, entity_type)
        
    
#     print(query_content)
    res =  list(graph.query(query_content))
    if len(res)>0:
        return res[0][0]
    else:
        return -1

print(find_entity_given_label("MPAA film rating") )


print(find_entity_given_label("Forrest Gump",'film') )
print(find_entity_given_label("Weathering with You",'Q11424') )
print(find_entity_given_label("director"))



http://www.wikidata.org/entity/P1657
http://www.wikidata.org/entity/Q134773
http://www.wikidata.org/entity/Q59692464
http://www.wikidata.org/prop/direct/P57


In [8]:
def query_something_about_movie(p_val, label):
    
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer WHERE {{
     ?movie rdfs:label "{}"@en .
     ?movie wdt:P31/wdt:P279* wd:Q11424 .
     ?movie wdt:{} ?answer
    }} """.format(label, p_val)
    
    print(query_content)
    return list(graph.query(query_content))
      
a = query_something_about_movie(P_values['director'], 'Forrest Gump' )    
  
for i in a:
    print(i)
    

PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer WHERE {
     ?movie rdfs:label "Forrest Gump"@en .
     ?movie wdt:P31/wdt:P279* wd:Q11424 .
     ?movie wdt:P57 ?answer
    } 
(rdflib.term.URIRef('http://www.wikidata.org/entity/Q187364'),)


In [9]:
def find_all_film_genres():
    
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer ?label WHERE {{
     ?answer rdfs:label ?label .
     ?answer wdt:P31 wd:Q201658 .
    }} """
    
#     print(query_content)
    return list(graph.query(query_content))
      
a = find_all_film_genres()    

print(len(a))
# name_list = [str(x[1]) for x in a]
# name_list
a

213


[(rdflib.term.URIRef('http://www.wikidata.org/entity/Q1108032'),
  rdflib.term.Literal('Mumblecore', lang='en')),
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q11298267'),
  rdflib.term.Literal('gambling film', lang='en')),
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q1190502'),
  rdflib.term.Literal('surrealist cinema', lang='en')),
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q1740789'),
  rdflib.term.Literal('poliziotteschi', lang='en')),
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q21858363'),
  rdflib.term.Literal('psychedelic film', lang='en')),
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q25230421'),
  rdflib.term.Literal('Gothic romance film', lang='en')),
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q3072024'),
  rdflib.term.Literal('autobiographical film', lang='en')),
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q4075563'),
  rdflib.term.Literal('outlaw biker film', lang='en')),
 (rdflib.term.URIRef('http://www.wikid

In [10]:
def find_movies_of_genre(genre_qval, n=3):
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer ?label ?rating 
    WHERE {{
     ?answer rdfs:label ?label .
     ?answer wdt:P136 wd:{} .
     ?answer ddis:rating ?rating .
    }} 
    
    """.format(genre_qval)
    
    search_res = list(graph.query(query_content))

      
    search_res.sort(key=lambda x: float(x[2]), reverse=True)
    return search_res[:n]
find_movies_of_genre('Q200092')

[(rdflib.term.URIRef('http://www.wikidata.org/entity/Q186341'),
  rdflib.term.Literal('The Shining', lang='en'),
  rdflib.term.Literal('8.4', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal'))),
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q210756'),
  rdflib.term.Literal('The Thing', lang='en'),
  rdflib.term.Literal('8.1', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal'))),
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q607179'),
  rdflib.term.Literal("Rosemary's Baby", lang='en'),
  rdflib.term.Literal('8.0', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))]

In [11]:
def get_label_of_Qval(q_val):
    
    query_content =  """PREFIX ddis: <http://ddis.ch/atai/> 
                        PREFIX wd: <http://www.wikidata.org/entity/> 
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                        PREFIX schema: <http://schema.org/> 

                        SELECT ?label WHERE {{
                         <{}> rdfs:label ?label .
                         
                        }} """.format(q_val)
    
    print(query_content)
    return list(graph.query(query_content))

a = get_label_of_Qval('http://www.wikidata.org/entity/Q187364')
  
for i in a:
    print(i)
    
    

PREFIX ddis: <http://ddis.ch/atai/> 
                        PREFIX wd: <http://www.wikidata.org/entity/> 
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                        PREFIX schema: <http://schema.org/> 

                        SELECT ?label WHERE {
                         <http://www.wikidata.org/entity/Q187364> rdfs:label ?label .
                         
                        } 
(rdflib.term.Literal('Robert Zemeckis', lang='en'),)


In [12]:
def find_something_about_an_entity(entity_URI, relation_URI):
    
   
    
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?res ?label WHERE {{
        <{}> <{}> ?res .
        OPTIONAL{{?res rdfs:label ?label .}}
        
    }} """.format(entity_URI, relation_URI)

    res =  list(graph.query(query_content))

    if len(res) > 0:
        return res[0][0], res[0][1] 
    else:
        return -1, -1
a, b = find_something_about_an_entity('http://www.wikidata.org/entity/Q134773','http://www.wikidata.org/prop/direct/P57') 
print(a)
print(b)
a, b = find_something_about_an_entity(WD['Q134773'],WDT['P57']) 
print(a)
print(b)
a, b = find_something_about_an_entity('http://www.wikidata.org/entity/Q1033016', WDT['P345'])
print(a)
print(b)

http://www.wikidata.org/entity/Q187364
Robert Zemeckis
http://www.wikidata.org/entity/Q187364
Robert Zemeckis
nm0000932
None


In [13]:
def write_list_to_file(list_name, file_name):
    with open(file_name, 'w', encoding="utf-8") as filehandle:
        for listitem in list_name:
            filehandle.write(f'{listitem}\n')
        
def read_list_from_file(file_name):
    res_list = []
    with open(file_name, 'r', encoding="utf-8") as filehandle:
        for line in filehandle:
            curr_place = line[:-1]
            res_list.append(curr_place)
    return res_list

def save_file_with_all_movies(write_file_pathname):
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?label WHERE {{
        ?movie rdfs:label ?label .
        ?movie wdt:P31/wdt:P279* wd:Q11424 .
        
    }}
    """
    

    res = list(graph.query(query_content))
    res_list = []
    for i in res:
        res_list.append(str(i[0]))
    write_list_to_file(res_list, write_file_pathname)
    
save_file_with_all_movies("save_files/all_movies_list.txt")

movies_list = []
movies_list = read_list_from_file("save_files/all_movies_list.txt")
#24384
print(len(movies_list))
print(movies_list[:5])

27816
['Jan Dara', 'Moondram Pirai', "Buffalo Bill and the Indians, or Sitting Bull's History Lesson", 'What We Wanted', 'Wanted: Dead or Alive']


In [14]:
def save_file_with_all_humans(write_file_pathname):
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?label WHERE {{
        ?person rdfs:label ?label .
        ?person wdt:P31/wdt:P279* wd:Q5 .
        
    }}
    """
    

    res = list(graph.query(query_content))
    res_list = []
    for i in res:
        res_list.append(str(i[0]))
    write_list_to_file(res_list, write_file_pathname)
    
save_file_with_all_humans("save_files/all_humans_list.txt")
humans_list = []
humans_list = read_list_from_file("save_files/all_humans_list.txt")
#100157
print(len(humans_list))
print(humans_list[:5])


100157
['Viktor Krištof', 'Yuji Nomi', 'Béatrice Thiriet', 'Oleg Kapanets', 'Ram Lee']


In [15]:
# Q107395292 - Our KG does not have these entities, but the actual Wikidata does, all_properties_list.json is result of this same query from wikidata
def save_file_with_all_movies_and_tv_shows_properties(write_file_pathname):
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
        PREFIX wd: <http://www.wikidata.org/entity/> 
        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
        PREFIX schema: <http://schema.org/> 

        SELECT ?label ?entity WHERE {{
            ?entity rdfs:label ?label .
            ?entity wdt:P31/wdt:P279* wd:Q107395292 .
        
        filter (lang(?label) = "en")

        }}
        LIMIT 20
        """
        


    res = list(graph.query(query_content))
    print(res)
    res_list = []
    for i in res:
        res_list.append(str(i[0]))
    write_list_to_file(res_list, write_file_pathname)
        
# save_file_with_all_movies_and_tv_shows_properties("save_files/all_properties_list.txt")
# properties_list = []
# properties_list = read_list_from_file("save_files/all_properties_list.txt")
# #100157
# print(len(properties_list))
# print(properties_list[:5])

In [16]:
def string_similarity_score(a, b):
    return SequenceMatcher(None, a, b).ratio()

def subtract_strings(input_str, substring):
    output_string = ""
    str_list = input_str.split(substring)
    for element in str_list:
        output_string += element
    return output_string

def find_closest_match_in_a_List(word, target_list):
    res = difflib.get_close_matches(word.lower(), [item.lower() for item in target_list], n=1, cutoff = 0.6)
    res_ind = -1
    
    if len(res)!=0:
        for i in range(len(target_list)):
            if (target_list[i].lower()) == res[0]:
                res_ind = i
                res = target_list[i]
    else:
        return -1
    
#     print(res)
#     print(res_ind)
    return {'res':res, 'res_ind':res_ind, 'score' :string_similarity_score(word, res) }
print(find_closest_match_in_a_List('BuffaloBill and the Indians', movies_list))
print(find_closest_match_in_a_List('Beatrice Thiriet', humans_list))
print(find_closest_match_in_a_List('Beatrice Thiriet', movies_list))



# print(subtract_strings(questions_df[1]['query'],questions_df[1]['ner'][0]['word']))


{'res': "Buffalo Bill and the Indians, or Sitting Bull's History Lesson", 'res_ind': 2, 'score': 0.6067415730337079}
{'res': 'Béatrice Thiriet', 'res_ind': 2, 'score': 0.9375}
{'res': 'Triple Threat', 'res_ind': 10650, 'score': 0.6206896551724138}


In [17]:
import json
  
# Opening JSON file
f = open('save_files/all_movie_properties_list.json')
  
# returns JSON object as 
# a dictionary
all_movie_properties_list = json.load(f)


for i,item in enumerate(all_movie_properties_list):
    all_movie_properties_list[i]['entity'] = subtract_strings(all_movie_properties_list[i]['entity'],'http://www.wikidata.org/entity/' )
    P_values[all_movie_properties_list[i]['label']] = all_movie_properties_list[i]['entity']

print(len(P_values))
print(len(all_movie_properties_list))
print(all_movie_properties_list[10] ) 

416
501
{'label': 'GECD film ID', 'entity': 'P3367'}


In [18]:
relation_list = list(P_values.keys())
final_relation_res = find_closest_match_in_a_List('MPA film rating', relation_list)
p_val = P_values[final_relation_res['res']]
rdflib.term.URIRef(WDT[p_val])

rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P1657')

In [19]:
find_entity_given_label('Forrest Gump', Q_values['film'])

rdflib.term.URIRef('http://www.wikidata.org/entity/Q134773')

# Processing question


In [54]:
sample_questions = ["Who is the director of Good Will Hunting?", "Who directed The Bridge on the River Kwai?", 
                    "Who is the director of Star Wars: Episode VI - Return of the Jedi?", "Who is the screenwriter of The Masked Gang: Cyprus?",
                    "What is the MPAA film rating of Weathering with You?", "What is the genre of Good Neighbors?", "Show me a picture of Halle Berry.",
                    "What does Julia Roberts look like?", "Let me know what Sandra Bullock looks like.", "Recommend movies similar to Hamlet and Othello.",
                    "Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?",
                    "Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.",
                    "Can you tell me the publication date of Tom Meets Zizou?", "Who is the executive producer of X-Men: First Class?",
                    "Who is the director of Batman 1989?", "What is the box office of The Princess and the Frog?",
                   "What is the birthplace of Christopher Nolan?", "Can you recommend me some horror films?", "Who is the director of Spider-Man: Far from Home?"]

In [55]:
questions_df = [{"query": s, "type" : "", "entity":[]}for s in sample_questions]


## Pattern Matching

## Name Entity Recognition


In [56]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")

ner_pipeline = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")


In [57]:
ner_pipeline(sample_questions[16])

[{'entity_group': 'PER',
  'score': 0.99767727,
  'word': 'Christopher Nolan',
  'start': 25,
  'end': 43}]

In [58]:
tokenizer_POS = AutoTokenizer.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
model_POS = model = AutoModelForTokenClassification.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")

In [59]:


# Replace this with your own checkpoint"
pos_tagging_pipe = pipeline(
    "token-classification", model=model_POS, aggregation_strategy="simple", tokenizer = tokenizer_POS
)

# POS tagging for all of the questions
for i in range (len(questions_df)):
    questions_df[i]["pos"] = pos_tagging_pipe(questions_df[i]["query"])
    questions_df[i]["ner"] = ner_pipeline(questions_df[i]["query"])
    for j,ner_res in enumerate(questions_df[i]["ner"]):
        if ner_res['score'] < 0.55:
            print('deleted ' + str (ner_res['word']))
            del questions_df[i]["ner"][j]
            
questions_df[17]

deleted film


{'query': 'Can you recommend me some horror films?',
 'type': '',
 'entity': [],
 'pos': [{'entity_group': 'AUX',
   'score': 0.9989091,
   'word': 'can',
   'start': 0,
   'end': 3},
  {'entity_group': 'PRON',
   'score': 0.9995284,
   'word': 'you',
   'start': 4,
   'end': 7},
  {'entity_group': 'VERB',
   'score': 0.99945956,
   'word': 'recommend',
   'start': 8,
   'end': 17},
  {'entity_group': 'PRON',
   'score': 0.99959165,
   'word': 'me',
   'start': 18,
   'end': 20},
  {'entity_group': 'DET',
   'score': 0.9992525,
   'word': 'some',
   'start': 21,
   'end': 25},
  {'entity_group': 'NOUN',
   'score': 0.98111904,
   'word': 'horror films',
   'start': 26,
   'end': 38},
  {'entity_group': 'PUNCT',
   'score': 0.9996611,
   'word': '?',
   'start': 38,
   'end': 39}],
 'ner': []}

In [60]:
for i in questions_df:
    print(i['ner'])

[{'entity_group': 'MISC', 'score': 0.9957466, 'word': 'Good Will Hunting', 'start': 22, 'end': 40}]
[{'entity_group': 'LOC', 'score': 0.8970907, 'word': 'The Bridge on the River Kwai', 'start': 12, 'end': 41}]
[{'entity_group': 'MISC', 'score': 0.99650544, 'word': 'Star Wars: Episode VI - Return of the Jedi', 'start': 22, 'end': 65}]
[{'entity_group': 'MISC', 'score': 0.9925139, 'word': 'The Masked Gang: Cyprus', 'start': 26, 'end': 50}]
[{'entity_group': 'ORG', 'score': 0.9493264, 'word': 'MPAA', 'start': 11, 'end': 16}, {'entity_group': 'MISC', 'score': 0.96832675, 'word': 'Weathering with You?', 'start': 31, 'end': 52}]
[{'entity_group': 'MISC', 'score': 0.9957307, 'word': 'Good Neighbors', 'start': 20, 'end': 35}]
[{'entity_group': 'PER', 'score': 0.9893521, 'word': 'Halle Berry', 'start': 20, 'end': 32}]
[{'entity_group': 'PER', 'score': 0.99877614, 'word': 'Julia Roberts', 'start': 9, 'end': 23}]
[{'entity_group': 'PER', 'score': 0.9949758, 'word': 'Sandra Bullock', 'start': 16, 

In [61]:
find_closest_match_in_a_List('Weathering with You?', movies_list)
find_closest_match_in_a_List('MPAA', movies_list)

{'res': 'Paa', 'res_ind': 20448, 'score': 0.2857142857142857}

In [62]:
# Find the type of question by keyword matching
# Can use calssifier for this
def find_type(formulated_question_df):
    keywords_images = [ 'image', 'picture', 'look', 'looks' ]
    keywords_recommendation = ['similar', 'recommend', 'recommendations']
    res_type = ''
    query_list = []
    for i in formulated_question_df['pos']:
        query_list.append(i['word']) 
    if any(word in query_list for word in keywords_images):
        res_type = "images"
    elif any(word in query_list for word in keywords_recommendation):
        res_type = "recommendation"
    else :
        res_type = "search"
            
    return res_type

# Add the type for all questions
for i in range (len(questions_df)):
    questions_df[i]['type'] = find_type(questions_df[i])
    
#     print(questions_df[i]['query']) 
#     print(questions_df[i]['type']) 
#     print("______")

In [63]:
questions_df[17]

{'query': 'Can you recommend me some horror films?',
 'type': 'recommendation',
 'entity': [],
 'pos': [{'entity_group': 'AUX',
   'score': 0.9989091,
   'word': 'can',
   'start': 0,
   'end': 3},
  {'entity_group': 'PRON',
   'score': 0.9995284,
   'word': 'you',
   'start': 4,
   'end': 7},
  {'entity_group': 'VERB',
   'score': 0.99945956,
   'word': 'recommend',
   'start': 8,
   'end': 17},
  {'entity_group': 'PRON',
   'score': 0.99959165,
   'word': 'me',
   'start': 18,
   'end': 20},
  {'entity_group': 'DET',
   'score': 0.9992525,
   'word': 'some',
   'start': 21,
   'end': 25},
  {'entity_group': 'NOUN',
   'score': 0.98111904,
   'word': 'horror films',
   'start': 26,
   'end': 38},
  {'entity_group': 'PUNCT',
   'score': 0.9996611,
   'word': '?',
   'start': 38,
   'end': 39}],
 'ner': []}

In [64]:
# find the entity for a question of type images
def get_entities_from_nlp_results(formulated_question_df):
    
    res_entites = []
    
    # Look for Entity in NER results
    
    for i ,ner_res in enumerate(formulated_question_df['ner']):
        is_entity = False
        closest_match = {'res': '', 'res_ind': -1, 'score': 0}
        if ner_res['entity_group'] == 'MISC' or ner_res['entity_group'] == 'LOC' or ner_res['entity_group'] == 'ORG'or ner_res['entity_group'] == 'PER':
            
            # Find the best match in movies list
            
            movies_res = find_closest_match_in_a_List(str(ner_res['word']), movies_list)
            if movies_res != -1:                         
                if closest_match['score'] < movies_res['score']:
                    closest_match = movies_res
                    is_entity = True
            
            # Find the best match in humans list
            
            human_res = find_closest_match_in_a_List(str(ner_res['word']), humans_list)
            if human_res != -1:                              
                if closest_match['score'] < human_res['score']:
                    closest_match = human_res
                    is_entity = True
            
            # Find the best match in predicate list
            
            pred_res = find_closest_match_in_a_List(str(ner_res['word']), list(P_values.keys()))
            if pred_res != -1:                              
                if closest_match['score'] < pred_res['score']:
                    closest_match = pred_res
                    is_entity = False
            
                
        if is_entity and closest_match['score'] > 0.5:
            res_entites.append(ner_res['word'])
            
#             del formulated_question_df['ner'][i]
            is_entity = False            
    
    # Worst case scenario, no entities found
    if len(res_entites) == 0:
        
        potential_words = [ner_res['word'] for ner_res in formulated_question_df['ner'] ]
        for item in formulated_question_df['pos']:
            if item['entity_group'] in ['NOUN', 'PROPN']:
                potential_words.append(item['word'])
        
        # Type Search - means we are looking for a movie name 
        if formulated_question_df['type'] == 'search':
            closest_match = {'res': '', 'res_ind': -1, 'score': 0}
            
            for pot_word in potential_words:
                # Look for the best match of all potential words in the movie list
                movies_res = find_closest_match_in_a_List(str(pot_word), movies_list)
                if movies_res != -1:                         
                    if closest_match['score'] < movies_res['score']:
                        closest_match = movies_res
                        closest_match['res'] = pot_word
            
            if closest_match['score'] > 0:
                res_entites.append(closest_match['res'])
        
        # Type Image - means we are looking for a person
        elif formulated_question_df['type'] == 'image':
            closest_match = {'res': '', 'res_ind': -1, 'score': 0}
            
            for pot_word in potential_words:
                # Look for the best match of all potential words in the movie list
                movies_res = find_closest_match_in_a_List(str(pot_word), humans_list)
                if movies_res != -1:                         
                    if closest_match['score'] < movies_res['score']:
                        closest_match = movies_res
                        closest_match['res'] = pot_word
            
            if closest_match['score'] > 0:
                res_entites.append(closest_match['res'])
        
        elif formulated_question_df['type'] in ['recommendation', 'recommendation_genre']:
            genres = find_all_film_genres()
            genres_list = [str(x[1]) for x in genres]
            
            closest_match = {'res': '', 'res_ind': -1, 'score': 0}
            
            for pot_word in potential_words:
                # Look for the best match of all potential words in the genre list
                pot_word = pot_word.replace("movies", "")
                pot_word = pot_word.replace("Movies", "")
                pot_word = pot_word.replace("movie", "")
                pot_word = pot_word.replace("Movie", "")
                genre_res = find_closest_match_in_a_List(str(pot_word), genres_list)
                if genre_res != -1:                         
                    if closest_match['score'] < genre_res['score']:
                        closest_match = genre_res
                        
            
            
            if closest_match['score'] > 0:
                res_entites.append(closest_match['res'])
                formulated_question_df['type'] = 'recommendation_genre'
               
                
        
    
    # Handle entities split into 2
    
    if len(res_entites) > 1 and formulated_question_df['type'] != 'recommendation':
        concat = ''
        for item in res_entites:
            concat = concat + str(item)
        res_entites.append(concat)

        final_res = ''
        best_score = 0
        
#         print(res_entites)
        
        for i,res in enumerate(res_entites):
            temp = find_closest_match_in_a_List(res, movies_list)
            if temp != -1:
               # if scores are the similar, take the longer one
                if temp['score'] - best_score <= 0.08:
                    if len(temp['res']) > len(final_res):
                        final_res = res
                        
                        best_score = temp['score']  
                # if you find a better match for an entity, take that
                elif temp['score'] > best_score:
                    final_res = res
                    
                    best_score = temp['score'] 
                    
    
        
        formulated_question_df['entity'] = [final_res]
        
    else:
        
        formulated_question_df['entity'] = res_entites

    
    
    return formulated_question_df

for test_q_df in questions_df:
    test_q_df = get_entities_from_nlp_results(test_q_df)
    print(test_q_df['entity'])


['Good Will Hunting']
['The Bridge on the River Kwai']
['Star Wars: Episode VI - Return of the Jedi']
['The Masked Gang: Cyprus']
['Weathering with You?']
['Good Neighbors']
['Halle Berry']
['Julia Roberts']
['Sandra Bullock']
['Hamlet', 'Othello']
['The Lion King', 'Pocahontas', 'The Beauty and the Beast']
['Nightmare on Elm Street', 'Friday the 13th', 'Halloween']
['Tom Meets Zizou']
['X-Men: First Class']
['Batman']
['The Princess and the Frog']
['Christopher Nolan']
['horror film']
['Spider-Man: Far from Home']


In [65]:
def get_predicate_from_nlp(formulated_question):
    string_res = str(formulated_question['query'])
    
    # process on all of the words that are not entity
    
    for entity_res in formulated_question['entity']:
        string_res = subtract_strings(string_res, str(entity_res))
        
        
#     print(string_res)
    pos_res = pos_tagging_pipe(string_res)
#     print(pos_res)
    # process on all of the words that are not the following
    
    other_pos_tags = ['PUNCT', 'ADP', 'DET', 'AUX', 'PRON']
    res_list = []
    for item in pos_res:
        if (item['entity_group']) not in other_pos_tags:
            res_list.append( string_res[item['start']:item['end']])
    concat = ''
    if len(res_list) > 1:
        for item in res_list:
            concat = concat + ' ' + str(item)
        res_list.append(concat)
#     print(res_list) 
    final_res = ''
    best_score = 0
    for i,res in enumerate(res_list):
        temp = find_closest_match_in_a_List(res, list(P_values.keys()))
        if temp != -1 :
            if temp['score'] - best_score <= 0.08:
                if len(temp['res']) > len(final_res):
                    final_res = temp['res']
                    best_score = temp['score']
            
            elif temp['score'] > best_score:
                final_res = temp['res']
                best_score = temp['score'] 
    
        
    return final_res
        

for i in range (len(questions_df)):
    if questions_df[i]['type'] == 'search':
        res = get_predicate_from_nlp(questions_df[i])
        print(res)
# question = questions_df[2]
# print(question['query'])
# print(question['entity'])
# res = get_predicate_from_nlp(question)
# print(res)

director
director
director
screenwriter
MPA film rating
genre
publication date
executive producer
director
box office
birth name
director


In [66]:
find_something_about_an_entity('http://www.wikidata.org/entity/Q223596', 'http://www.wikidata.org/prop/direct/P1431')

(rdflib.term.URIRef('http://www.wikidata.org/entity/Q457180'),
 rdflib.term.Literal('Sheryl Lee Ralph', lang='en'))

In [67]:
def get_key_of_dict(val, my_dict):
    for key, value in my_dict.items():
        if val == value:
            return key
 
    return "key doesn't exist"

In [68]:
key = get_key_of_dict('P345', P_values)
key

'IMDB Id'

In [69]:
# main function to answer image questions
def handle_image_questions(formulated_question_df):
    if formulated_question_df['type']!='images':
        return -1

    # Can change to handle multiple entites but not necessary
    name = formulated_question_df['entity']
    if len(name)== -1:
        return -1
    name = name[0]
    
    final_entity_name = '' 
    best_score = 0
    movie_res = find_closest_match_in_a_List(name, movies_list)
    if movie_res != -1 and movie_res['score'] > best_score:
        final_entity_name = movie_res['res']
        best_score = movie_res['score']
        entity_type = 'film'
    human_res = find_closest_match_in_a_List(name, humans_list)
    if human_res != -1 and human_res['score'] > best_score:
        final_entity_name = human_res['res']
        best_score = human_res['score']
        entity_type = 'human'


    if len(final_entity_name) == 0:
        final_entity_name = name
        entity_type = 'none'
        entity_URI = find_entity_given_label(final_entity_name, entity_type)
    else:
        entity_URI = find_entity_given_label(final_entity_name, Q_values[entity_type])

    # Return Image
    Imdb_ID, _ =  find_something_about_an_entity(entity_URI, WDT['P345'])
    Imdb_URI = IMDB[Imdb_ID] 
    
    return Imdb_URI
    
for q in questions_df:
    if q['type']=='images':
        res = handle_image_questions(q)

        print(res)


https://www.imdb.com/name/nm0000932
https://www.imdb.com/name/nm0000210
https://www.imdb.com/name/nm0000113


In [70]:
a = graph.subject_objects(RDFS.label)


# Embeddings and Recommendations

In [71]:
def load_embeddings(graph):
    global entity_emb, relation_emb, id2ent, id2rel, label2ent, ent2id, ent2label, rel2id
    entity_emb = np.load(os.path.join('..', 'dataset','embeddings', 'entity_embeds.npy'))
    relation_emb = np.load(os.path.join('..', 'dataset','embeddings', 'relation_embeds.npy'))

    # load the dictionaries
    with open(os.path.join('..', 'dataset','embeddings', 'entity_ids.del'), 'r') as ifile:
        ent2id = {rdflib.term.URIRef(ent): int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
        id2ent = {v: k for k, v in ent2id.items()}
    with open(os.path.join('..', 'dataset','embeddings', 'relation_ids.del'), 'r') as ifile:
        rel2id = {rdflib.term.URIRef(rel): int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
        id2rel = {v: k for k, v in rel2id.items()}

    ent2label = {ent: str(lbl) for ent, lbl in graph.subject_objects(RDFS.label)}
    label2ent = {lbl: ent for ent, lbl in ent2label.items()}

def find_similar_entities_for_id(id, n = 4):

    entity = ent2id[WD[id]] 
    
    # compute distance with other entities
    dist = pairwise_distances(entity_emb[entity].reshape(1, -1), entity_emb).reshape(-1)
    best_matches = dist.argsort()

    best_matches = best_matches[1:20]
    best_matches = best_matches[1:n]
    
    res_list = []
    for i,idx in enumerate(best_matches):
        res_list.append(ent2label[id2ent[idx]])
        
    return res_list

In [72]:
load_embeddings(graph)

In [73]:
def find_similar_entities_for_multiple(entity_ids):

    entities = [ent2id[WD[id]] for id in entity_ids]
    
    entity_embs = np.array([entity_emb[entity].reshape(1, -1) for entity in entities ])
#     print((entity_embs.shape))
    sum_emb = entity_embs.sum(axis=0)
    avg_entity_embedding = sum_emb/entity_embs.shape[0]
#     print(entity_embs.shape[0])
#     print(avg_entity_embedding.shape)
#     print(entity_embs[0][0])
#     print(entity_embs[1][0])
#     print(avg_entity_embedding[0])
    dist = pairwise_distances(avg_entity_embedding, entity_emb).reshape(-1)
    
    best_matches = dist.argsort()

    best_matches = best_matches[1:20]
    
    res_list = []
    for i,idx in enumerate(best_matches):
        res_list.append(ent2label[id2ent[idx]])
        
    return res_list

Q164963 - two tower
Q102225 - HP Goblet
Q1199283 - HP Half blood
Q131074 - return of kings
Q471169 - Harry met sally

In [74]:

def handle_recommendation_questions(formulated_question, num_recommendations = 4):
    
    final_res_names = []
    question_entities = formulated_question['entity']
    
    if formulated_question['type'] == 'recommendation_genre':
        genre_name = question_entities[0]
        genres = find_all_film_genres()
        genres_list = [str(x[1]) for x in genres] 
        genre_res  = find_closest_match_in_a_List(genre_name, genres_list)
        genre_q_val = genres[genre_res['res_ind']]
        genre_q_val = genre_q_val[0][len(WD):]
        query_res = find_movies_of_genre(genre_q_val, n=num_recommendations)
        
        final_res_names = [str(res[1]) for res in query_res]
        
        
    else:

        movie_names = [movie for movie in question_entities ]

        for i,movie_name in enumerate(movie_names):

            res = find_closest_match_in_a_List(str(movie_name), movies_list)

            if res != -1:
                movie_names[i] = res['res']

        entity_names = [ find_entity_given_label(movie_name, 'film') for movie_name in movie_names]


        entity_q_list = [subtract_strings(entity, 'http://www.wikidata.org/entity/') for entity in entity_names]

        recommendation_label_list = find_similar_entities_for_multiple(entity_q_list)


        for item in recommendation_label_list:
            if item not in movie_names:
                final_res_names.append(item)
    
    if len(final_res_names) > num_recommendations:
        final_res_names = final_res_names[:num_recommendations]
    
    return final_res_names
    
for q in questions_df:
    if q['type'] in ['recommendation', 'recommendation_genre']:
        res = handle_recommendation_questions(q)
        print(q['query'])
        print(res)

Recommend movies similar to Hamlet and Othello.
['A Room with a View', 'Sense and Sensibility', 'The Tempest', 'Dancing at Lughnasa']
Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?
['Tarzan', 'Treasure Planet', 'Moana', 'Aladdin']
Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.
['Texas Chainsaw 3D', 'Final Destination 3', 'The First Purge', 'The Texas Chainsaw Massacre']
Can you recommend me some horror films?
['The Shining', 'The Thing', "Rosemary's Baby", 'Donnie Darko']


In [75]:

def check_embeddings_for_errors(entity,relation,query_res = -1):
    # "Jean Van Hamme" entity
    if WD[entity] in ent2id.keys():
        head = entity_emb[ent2id[WD[entity]]]
    else:
        return False, False
    # "occupation" relation
    if WDT[relation]in rel2id.keys():
        pred = relation_emb[rel2id[WDT[relation]]]
    else:
        return False, False
    
    # add vectors according to TransE scoring function.
    lhs = head + pred
    # compute distance to *any* entity
    dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
    # find most plausible entities
    most_likely = dist.argsort()
    # compute ranks of entities
    ranks = dist.argsort().argsort()
    if query_res != -1:
        res = pd.DataFrame([(str(lbl), dist[ent2id[WD[ent]]], ranks[ent2id[WD[ent]]]) for ent, lbl in query_res],
            columns=('Occupation', 'Score', 'Rank'))
    else:
        res = -1
    most_likely = pd.DataFrame([
        (id2ent[idx][len(WD):], ent2label[id2ent[idx]], dist[idx], rank+1)
        for rank, idx in enumerate(most_likely[:10])],
        columns=('Entity', 'Label', 'Score', 'Rank'))
    return res, most_likely

In [76]:

res, most_likely = check_embeddings_for_errors('Q181803', 'P57',{('Q471402','Richard Marquand')})
most_likely

Unnamed: 0,Entity,Label,Score,Rank
0,Q311319,Frank Oz,2904.539551,1
1,Q38222,George Lucas,2919.880371,2
2,Q240872,Lawrence Kasdan,2987.016113,3
3,Q1150882,Duwayne Dunham,2995.905762,4
4,Q3399754,Richard Driscoll,2996.198486,5
5,Q471402,Richard Marquand,3043.516602,6
6,Q203960,James Earl Jones,3047.783203,7
7,Q3020806,Debbie Lee Carrington,3059.332031,8
8,Q6848487,Mike Quinn,3061.999756,9
9,Q128379,David Prowse,3068.833496,10


In [77]:
def deal_with_KG_query(formulated_question):
    
    
    
    if formulated_question['type'] != 'search':
        return -1
    
    # Process Entity 
    if len(formulated_question['entity']) > 0:
        entity = formulated_question['entity'][0]
        
        closest_match = {'res': '', 'res_ind': -1, 'score': 0} 
        
        movies_res = find_closest_match_in_a_List(str(entity), movies_list)
        if movies_res != -1:                         
            if closest_match['score'] < movies_res['score']:
                closest_match = movies_res
                entity_type = 'film'

        human_res = find_closest_match_in_a_List(str(entity), humans_list)
        if human_res != -1:                              
            if closest_match['score'] < human_res['score']:
                closest_match = human_res
                entity_type = 'human'
        
        entity = closest_match['res']


    if entity_type == 'human' or entity_type == 'film':
        entity_URI = find_entity_given_label(entity, Q_values[entity_type])
    else:
        entity_URI = find_entity_given_label(entity, 'none')

    # Process Relation 
    relation = get_predicate_from_nlp(formulated_question)

    if relation == -1:
        return -1

    if relation in list(P_values.keys()):
        p_val = P_values[relation]
        relation_URI = rdflib.term.URIRef(WDT[p_val])
    else:
        relation_URI = find_entity_given_label(relation)
        if relation_URI == -1:
            return -1
#     print("entity = " + str(entity_URI))
#     print("relation = " +  str(relation_URI))
    query_res_URI, query_res_label =  find_something_about_an_entity(entity_URI, relation_URI)
#     print(query_res_URI)
    print('query_res_URI = ' + str(query_res_URI))
    if query_res_URI != -1:
        query_res_Q_val = query_res_URI[len(WD):]
#         print(query_res_Q_val)
        emb_res, emb_most_likely = check_embeddings_for_errors(entity_URI[len(WD):], relation_URI[len(WDT):],{(query_res_Q_val,query_res_label)})
    else:
        emb_res, emb_most_likely = check_embeddings_for_errors(entity_URI[len(WD):], relation_URI[len(WDT):],{('Q329737', 'butcher')})
        if not isinstance(emb_res, bool):
            return WD[emb_most_likely.loc[:,"Entity"].values[0]], emb_most_likely.loc[:,"Label"].values[0]
     
    # If entity or relation are not found in embeddings
    if isinstance(emb_res, bool):
        # Default return
        print('embedding not found')
        return query_res_URI, query_res_label
    
    rank = emb_res.loc[:,"Rank"].values[0]
    print('embedding rank = ' + str(rank))
    if rank <= 10:
        return query_res_URI, query_res_label
    else:
        return WD[emb_most_likely.loc[:,"Entity"].values[0]], emb_most_likely.loc[:,"Label"].values[0], 
        
        

    



    
for test_q_df in questions_df:
# test_q_df = questions_df[12]
    if test_q_df['type'] == 'search':
# print(test_q_df['entity'])
        print(test_q_df['query'])
        res = deal_with_KG_query(test_q_df)
        print(res)
        print("-------")

Who is the director of Good Will Hunting?
query_res_URI = http://www.wikidata.org/entity/Q25186
embedding rank = 2
(rdflib.term.URIRef('http://www.wikidata.org/entity/Q25186'), rdflib.term.Literal('Gus Van Sant', lang='en'))
-------
Who directed The Bridge on the River Kwai?
query_res_URI = http://www.wikidata.org/entity/Q55260
embedding rank = 1
(rdflib.term.URIRef('http://www.wikidata.org/entity/Q55260'), rdflib.term.Literal('David Lean', lang='en'))
-------
Who is the director of Star Wars: Episode VI - Return of the Jedi?
query_res_URI = http://www.wikidata.org/entity/Q471402
embedding rank = 5
(rdflib.term.URIRef('http://www.wikidata.org/entity/Q471402'), rdflib.term.Literal('Richard Marquand', lang='en'))
-------
Who is the screenwriter of The Masked Gang: Cyprus?
query_res_URI = -1
(rdflib.term.URIRef('http://www.wikidata.org/entity/Q5058838'), 'Cengiz Küçükayvaz')
-------
What is the MPAA film rating of Weathering with You?
query_res_URI = http://www.wikidata.org/entity/Q186653

In [78]:
check_embeddings_for_errors('Q59692464', 'P1657',{('Q18665349', 'NC-17')})

(  Occupation        Score  Rank
 0      NC-17  6555.905273     6,
       Entity                         Label        Score  Rank
 0  Q18665339                         PG-13  5618.707520     1
 1  Q18665334                            PG  6026.137207     2
 2  Q18665344                             R  6102.339355     3
 3  Q18665330                             G  6182.666504     4
 4  Q23660208  MPAA classification category  6464.782227     5
 5  Q59692464           Weathering with You  6538.968262     6
 6  Q18665349                         NC-17  6555.905273     7
 7  Q29836837     Dombey Street Productions  6569.221680     8
 8  Q15242622         Rectangle Productions  6598.762695     9
 9   Q2498180         Orthodox Encyclopedia  6599.468750    10)

In [79]:
global crowd_df
crowd_df = pd.read_csv((os.path.join('..', 'dataset','crowd_data', 'crowd_data.tsv')),sep='\t')
crowd_df['LifetimeApprovalRate'] = crowd_df['LifetimeApprovalRate'].str.rstrip("%").astype(int)
crowd_df.head()


Unnamed: 0,HITId,HITTypeId,Title,Reward,AssignmentId,WorkerId,AssignmentStatus,WorkTimeInSeconds,LifetimeApprovalRate,Input1ID,Input2ID,Input3ID,AnswerID,AnswerLabel,FixPosition,FixValue
0,1,7QT,Is this triple correct or incorrect?,$0.50,1,2133ICYWE97,Submitted,60,99,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
1,1,7QT,Is this triple correct or incorrect?,$0.50,2,2133U7HKDLO,Submitted,40,40,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,yes,yes
2,1,7QT,Is this triple correct or incorrect?,$0.50,3,928UJANWZ12,Submitted,50,98,wd:Q11621,wdt:P2142,792910554,2.0,INCORRECT,,
3,1,7QT,Is this triple correct or incorrect?,$0.50,4,1726JMZQW,Submitted,80,70,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
4,1,7QT,Is this triple correct or incorrect?,$0.50,5,2134U7HKDMM,Submitted,2,70,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,


In [80]:
num_incorrect = crowd_df['AnswerLabel'].value_counts()['INCORRECT']
num_incorrect

146

In [81]:
def get_rid_of_malicious_workers():
    for index, row in crowd_df.iterrows(): 
        if ((row['LifetimeApprovalRate'] < 50 ) & (row['WorkTimeInSeconds'] < 5) ) :
            crowd_df.drop(index, inplace=True)

get_rid_of_malicious_workers()
print(crowd_df.shape)


(266, 16)


In [82]:
crowd_df['LifetimeApprovalRate'].min()

40

In [83]:
global distinct_question_pairs
q="""SELECT DISTINCT Input1ID, Input2ID, Input3ID FROM crowd_df;"""
pysqldf = lambda q: sqldf(q, globals())
distinct_question_pairs = pysqldf(q)
distinct_question_pairs

Unnamed: 0,Input1ID,Input2ID,Input3ID
0,wd:Q11621,wdt:P2142,792910554
1,wd:Q603545,wdt:P2142,4300000
2,wd:Q16911843,wdt:P577,2014-01-18
3,wd:Q132863,wdt:P2142,969023261
4,wd:Q1628022,wdt:P577,1951-01-01
...,...,...,...
56,wd:Q223596,wdt:P1431,wd:Q457180
57,wd:Q943992,wdt:P161,wd:Q160432
58,wd:Q1893555,wdt:P272,wd:Q48784114
59,wd:Q21060270,wdt:P27,wd:Q916


wd:Q1339195	ddis:indirectSubclassOf	wd:Q27096213

In [84]:
find_something_about_an_entity(WD['Q223596'], WDT['P1431'])

(rdflib.term.URIRef('http://www.wikidata.org/entity/Q457180'),
 rdflib.term.Literal('Sheryl Lee Ralph', lang='en'))

In [85]:
entity = 'wd:Q841233'
relation = 'wdt:P2142'
# df.loc[(df['col1'] == value) & (df['col2'] < value)]
# if crowd_df.loc[(crowd_df['Input1ID'] == entity) & (crowd_df['Input2ID'] == relation)] :
#     print("aha")
crowd_df.loc[(crowd_df['Input1ID'] == entity) & (crowd_df['Input2ID'] == relation)]

Unnamed: 0,HITId,HITTypeId,Title,Reward,AssignmentId,WorkerId,AssignmentStatus,WorkTimeInSeconds,LifetimeApprovalRate,Input1ID,Input2ID,Input3ID,AnswerID,AnswerLabel,FixPosition,FixValue
150,31,8QT,Is this triple correct or incorrect?,$0.50,151,AALKMII98,Submitted,240,98,wd:Q841233,wdt:P2142,10696220,1.0,CORRECT,,
151,31,8QT,Is this triple correct or incorrect?,$0.50,152,HHCKW1111,Submitted,200,80,wd:Q841233,wdt:P2142,10696220,1.0,CORRECT,,
152,31,8QT,Is this triple correct or incorrect?,$0.50,153,GGUI83657S,Submitted,120,85,wd:Q841233,wdt:P2142,10696220,2.0,INCORRECT,Object,10696210.0
153,31,8QT,Is this triple correct or incorrect?,$0.50,154,ZZHL098SA43,Submitted,2,69,wd:Q841233,wdt:P2142,10696220,1.0,CORRECT,,


In [86]:
def calculate_agreeability_in_crowd():
    score_list = []
    for index, row in distinct_question_pairs.iterrows():
        entity = row['Input1ID']
        relation = row['Input2ID']
        cor_count = 0
        inc_count = 0
        entiy_rel_rows = crowd_df.loc[(crowd_df['Input1ID'] == entity) & (crowd_df['Input2ID'] == relation)]
        for index, rows in entiy_rel_rows.iterrows():
            if rows['AnswerID'] == 2.0:
                inc_count += 1
            elif rows['AnswerID'] == 1.0:
                cor_count += 1
        score = cor_count/(cor_count + inc_count)
        score_list.append(score)
        distinct_question_pairs['score'] = score
#         print("Score for " + str(entity) + " and " + str(relation) + " = " + str(score))

    distinct_question_pairs['score'] = score_list

calculate_agreeability_in_crowd() 
distinct_question_pairs

Unnamed: 0,Input1ID,Input2ID,Input3ID,score
0,wd:Q11621,wdt:P2142,792910554,0.80
1,wd:Q603545,wdt:P2142,4300000,0.80
2,wd:Q16911843,wdt:P577,2014-01-18,0.60
3,wd:Q132863,wdt:P2142,969023261,0.40
4,wd:Q1628022,wdt:P577,1951-01-01,0.80
...,...,...,...,...
56,wd:Q223596,wdt:P1431,wd:Q457180,0.50
57,wd:Q943992,wdt:P161,wd:Q160432,0.25
58,wd:Q1893555,wdt:P272,wd:Q48784114,0.75
59,wd:Q21060270,wdt:P27,wd:Q916,0.50


In [87]:
# find_something_about_an_entity(entity_URI, relation_URI)
def process_crowd_data():
    for index, row in distinct_question_pairs.iterrows():
        # Proces all crowd data where the workers label the 
        if row['AnswerID'] == 2.0:
            entity = row['Input1ID']
            relation = row['Input2ID']
            crowd_res = row['Input3ID']
