# My project


In [1]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import json
import networkx as nx
import pandas as pd
import rdflib
from rdflib import Literal

from collections import defaultdict, Counter
import locale
_ = locale.setlocale(locale.LC_ALL, '')
from _plotly_future_ import v4_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import os
import numpy as np
import random
import re
import operator

#NER
from transformers import pipeline, set_seed
from huggingface_hub import notebook_login
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer
from transformers import create_optimizer
from transformers import TFAutoModelForTokenClassification
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback

from transformers import pipeline

import editdistance
import difflib
from difflib import SequenceMatcher


# Embeddings
from sklearn.metrics import pairwise_distances

# Crowd Data
from pandasql import sqldf

## 1. Dataset


### 1.1 Load the graph data

In [2]:
global graph, WD, WDT, SCHEMA, DDIS, IMDB
def init_graph_data():
    global graph, WD, WDT, SCHEMA, DDIS, IMDB
    graph = rdflib.Graph()
    graph.parse('./dataset/14_graph.nt', format='turtle')
    # prefixes used in the graph
    WD = Namespace('http://www.wikidata.org/entity/')
    WDT = Namespace('http://www.wikidata.org/prop/direct/')
    SCHEMA = Namespace('http://schema.org/')
    DDIS = Namespace('http://ddis.ch/atai/')

    IMDB = Namespace('https://www.imdb.com/name/')
init_graph_data()

In [3]:
global graph
print(len(graph))


2056777


### 1.2 Graph Statistics

In [4]:
# pre-saved P values for Wikidata movies graph
global P_values, Q_values
P_values = {
    'director':'P57',
    'cast':'P161',
    'producer':'P162',
    'genre':'P136',
    'character':'P674',
    'screenwriter':'P58',   
    'filming location':'P915',
    'IMDB Id':'P345',
    'image':'P18',
    'publication date': 'P577',
    'MPA film rating' : 'P1657',
    'logo image' : 'P154',
    'country of origin' : 'P495',
    'cast member': 'P161',
    'film editor' : 'P1040',
    'production designer': 'P2554',
    'costume designer' : 'P2515',
    'composer' : 'P86',
    'producer' : 'P162',
    'distributed by' : 'P750',
    'production company': 'P272',
    'box office' : 'P2142',
    'review score' : 'P444',
    'nominated for' : 'P1411',
    
    'relative' : 'P1038',
    'nominated for': 'P1411',
    'sex or gender': 'P21',
    'country of citizenship' : 'P27',
    'name in native language':'P1559',
    'birth name' : 'P1477',
    'date of birth':'P569',
    'place of birth':'P19',
    'father':'P22',
    'mother':'P25',
    'sibling':'P3373',
    'spouse':'P26',
    'child':'P40',
    'occupation':'P106',
    'languages spoken':'P1412',
    'award received' : 'P166',
    'country of citizenship' : 'P27',
    'sibling':'P3373',
    'educated at' : 'P69',
    
    
}

Q_values = {
    'fictional human':'Q15632617',
    'film':'Q11424',
    'human':'Q5',
#     'Wikidata property for items about films':'Q22965162',
#     'Wikidata property related to creative works' : 'Q18618644',
    'Wikidata property related to movies and television shows' : 'Q107395292',
    'Wikidata property for items about people' : 'Q18608871',
   
    
}

## SPARQL query functions


P57 - director of film  <br>
P31 - instance of <br> 
subclass of (P279) <br

Q11424 - film <br>
animated feature film (Q29168811) <br>
anime film (Q20650540) <br>


In [5]:
def find_entity_given_label(entity_label, entity_type="none"):
    global graph
    entity_label = "\"" + str(entity_label) + "\"@en"
    
    if entity_type == 'film':
        entity_type = 'Q11424'
    elif entity_type == 'human':
        entity_type = 'Q5'
    
  
    if entity_type == "none":
        query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
        PREFIX wd: <http://www.wikidata.org/entity/> 
        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
        PREFIX schema: <http://schema.org/> 

        SELECT ?entity WHERE {{
            ?entity rdfs:label {}
        }} """.format(entity_label)
    else:
        query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
        PREFIX wd: <http://www.wikidata.org/entity/> 
        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
        PREFIX schema: <http://schema.org/> 

        SELECT ?entity WHERE {{
            ?entity rdfs:label {} .
            ?entity wdt:P31/wdt:P279* wd:{} .
        }} """.format(entity_label, entity_type)
        
    
#     print(query_content)
    res =  list(graph.query(query_content))
    if len(res)>0:
        return res[0][0]
    else:
        return -1

# Test
    
print(find_entity_given_label("MPAA film rating") )


print(find_entity_given_label("Forrest Gump",'film') )
print(find_entity_given_label("Weathering with You",'Q11424') )
print(find_entity_given_label("director"))



http://www.wikidata.org/entity/P1657
http://www.wikidata.org/entity/Q134773
http://www.wikidata.org/entity/Q59692464
http://www.wikidata.org/prop/direct/P57


In [6]:
def query_something_about_movie(p_val, label):
    global graph
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer WHERE {{
     ?movie rdfs:label "{}"@en .
     ?movie wdt:P31/wdt:P279* wd:Q11424 .
     ?movie wdt:{} ?answer
    }} """.format(label, p_val)
    
    print(query_content)
    return list(graph.query(query_content))
      
a = query_something_about_movie(P_values['director'], 'Forrest Gump' )    
  
for i in a:
    print(i)
    

PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer WHERE {
     ?movie rdfs:label "Forrest Gump"@en .
     ?movie wdt:P31/wdt:P279* wd:Q11424 .
     ?movie wdt:P57 ?answer
    } 
(rdflib.term.URIRef('http://www.wikidata.org/entity/Q187364'),)


In [7]:
def find_all_film_genres():
    global graph
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer ?label WHERE {{
     ?answer rdfs:label ?label .
     ?answer wdt:P31 wd:Q201658 .
    }} """
    
#     print(query_content)
    return list(graph.query(query_content))
      
a = find_all_film_genres()    

print(len(a))
# name_list = [str(x[1]) for x in a]
# name_list


213


In [8]:
def find_movies_of_genre_by_rating(genre_qval, n=3):
    global graph
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer ?label ?rating 
    WHERE {{
     ?answer rdfs:label ?label .
     ?answer wdt:P136 wd:{} .
     ?answer ddis:rating ?rating .
    }} 
    
    """.format(genre_qval)
    
    search_res = list(graph.query(query_content))
    
    if len(search_res) == 0:
        return -1
    if len(search_res) > 0:  
        search_res.sort(key=lambda x: float(x[2]), reverse=True)
        return search_res[:n]
find_movies_of_genre_by_rating('Q200092')

[(rdflib.term.URIRef('http://www.wikidata.org/entity/Q186341'),
  rdflib.term.Literal('The Shining', lang='en'),
  rdflib.term.Literal('8.4', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal'))),
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q210756'),
  rdflib.term.Literal('The Thing', lang='en'),
  rdflib.term.Literal('8.1', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal'))),
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q607179'),
  rdflib.term.Literal("Rosemary's Baby", lang='en'),
  rdflib.term.Literal('8.0', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))]

In [9]:
def get_rating_of_movie(label):
    global graph
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?rating 
    WHERE {{
     ?answer rdfs:label "{}"@en .
     ?answer wdt:P31/wdt:P279* wd:Q11424 .
     ?answer ddis:rating ?rating .
    }} 
    
    """.format(label)
    
    search_res = list(graph.query(query_content))

    return search_res

In [10]:
get_rating_of_movie("Bulletproof")

[(rdflib.term.Literal('5.8', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')),),
 (rdflib.term.Literal('5.0', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')),)]

In [11]:
def get_label_of_Qval(q_val):
    global graph
    query_content =  """PREFIX ddis: <http://ddis.ch/atai/> 
                        PREFIX wd: <http://www.wikidata.org/entity/> 
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                        PREFIX schema: <http://schema.org/> 

                        SELECT ?label WHERE {{
                         <{}> rdfs:label ?label .
                         
                        }} """.format(q_val)
    
    print(query_content)
    return list(graph.query(query_content))

a = get_label_of_Qval('http://www.wikidata.org/entity/Q187364')
  
for i in a:
    print(i)
    
    

PREFIX ddis: <http://ddis.ch/atai/> 
                        PREFIX wd: <http://www.wikidata.org/entity/> 
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                        PREFIX schema: <http://schema.org/> 

                        SELECT ?label WHERE {
                         <http://www.wikidata.org/entity/Q187364> rdfs:label ?label .
                         
                        } 
(rdflib.term.Literal('Robert Zemeckis', lang='en'),)


In [12]:
def find_something_about_an_entity(entity_URI, relation_URI):
    global graph
   
    
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?res ?label WHERE {{
        <{}> <{}> ?res .
        OPTIONAL{{?res rdfs:label ?label .}}
        
    }} """.format(entity_URI, relation_URI)

    res =  list(graph.query(query_content))

    if len(res) > 0:
        return res[0][0], res[0][1] 
    else:
        return -1, -1
a, b = find_something_about_an_entity('http://www.wikidata.org/entity/Q134773','http://www.wikidata.org/prop/direct/P57') 
print(a)
print(b)
a, b = find_something_about_an_entity(WD['Q134773'],WDT['P57']) 
print(a)
print(b)
a, b = find_something_about_an_entity('http://www.wikidata.org/entity/Q1033016', WDT['P345'])
print(a)
print(b)

http://www.wikidata.org/entity/Q187364
Robert Zemeckis
http://www.wikidata.org/entity/Q187364
Robert Zemeckis
nm0000932
None


In [13]:
def write_list_to_file(list_name, file_name):
    with open(file_name, 'w', encoding="utf-8") as filehandle:
        for listitem in list_name:
            filehandle.write(f'{listitem}\n')
        
def read_list_from_file(file_name):
    res_list = []
    with open(file_name, 'r', encoding="utf-8") as filehandle:
        for line in filehandle:
            curr_place = line[:-1]
            res_list.append(curr_place)
    return res_list

def save_file_with_all_movies(write_file_pathname):
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?label WHERE {{
        ?movie rdfs:label ?label .
        ?movie wdt:P31/wdt:P279* wd:Q11424 .
        
    }}
    """
    

    res = list(graph.query(query_content))
    res_list = []
    for i in res:
        res_list.append(str(i[0]))
    write_list_to_file(res_list, write_file_pathname)
    
save_file_with_all_movies("save_files/all_movies_list.txt")

movies_list = []
movies_list = read_list_from_file("save_files/all_movies_list.txt")
#24384
print(len(movies_list))
print(movies_list[:5])

27816
['Jan Dara', 'Moondram Pirai', "Buffalo Bill and the Indians, or Sitting Bull's History Lesson", 'What We Wanted', 'Wanted: Dead or Alive']


In [14]:
def save_file_with_all_humans(write_file_pathname):
    global graph
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?label WHERE {{
        ?person rdfs:label ?label .
        ?person wdt:P31/wdt:P279* wd:Q5 .
        
    }}
    """
    

    res = list(graph.query(query_content))
    res_list = []
    for i in res:
        res_list.append(str(i[0]))
    write_list_to_file(res_list, write_file_pathname)
    
save_file_with_all_humans("save_files/all_humans_list.txt")
humans_list = []
humans_list = read_list_from_file("save_files/all_humans_list.txt")
#100157
print(len(humans_list))
print(humans_list[:5])


100157
['Viktor Krištof', 'Yuji Nomi', 'Béatrice Thiriet', 'Oleg Kapanets', 'Ram Lee']


In [15]:
# Q107395292 - Our KG does not have these entities, but the actual Wikidata does, all_properties_list.json is result of this same query from wikidata
def save_file_with_all_movies_and_tv_shows_properties(write_file_pathname):
    global graph
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
        PREFIX wd: <http://www.wikidata.org/entity/> 
        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
        PREFIX schema: <http://schema.org/> 

        SELECT ?label ?entity WHERE {{
            ?entity rdfs:label ?label .
            ?entity wdt:P31/wdt:P279* wd:Q107395292 .
        
        filter (lang(?label) = "en")

        }}
        LIMIT 20
        """
        


    res = list(graph.query(query_content))
    print(res)
    res_list = []
    for i in res:
        res_list.append(str(i[0]))
    write_list_to_file(res_list, write_file_pathname)
        


In [16]:
def string_similarity_score(a, b):
    return SequenceMatcher(None, a, b).ratio()

def subtract_strings(input_str, substring):
    output_string = ""
    str_list = input_str.split(substring)
    for element in str_list:
        output_string += element
    return output_string

def find_closest_match_in_a_List(word, target_list):
    res = difflib.get_close_matches(word.lower(), [item.lower() for item in target_list], n=1, cutoff = 0.6)
    res_ind = -1
 
    if len(res)!=0:
        for i in range(len(target_list)):
            if (target_list[i].lower()) == res[0]:
                res_ind = i
                res = target_list[i]
        if (string_similarity_score(word, res)) < 0.7:
            return -1
    else:
        return -1
    
#     print(res)
#     print(res_ind)
    return {'res':res, 'res_ind':res_ind, 'score' :string_similarity_score(word, res) }
print(find_closest_match_in_a_List('Spiderman far from home', movies_list))
print(find_closest_match_in_a_List('Beatrice Thiriet', humans_list))
print(find_closest_match_in_a_List('Hamlet and Othello', movies_list))



# print(subtract_strings(questions_df[1]['query'],questions_df[1]['ner'][0]['word']))


{'res': 'Spider-Man: Far from Home', 'res_ind': 24369, 'score': 0.8333333333333334}
{'res': 'Béatrice Thiriet', 'res_ind': 2, 'score': 0.9375}
-1


In [17]:

def read_p_values_list():
    # Opening JSON file
    global P_values
    f = open('save_files/all_movie_properties_list.json')

    # returns JSON object as 
    # a dictionary
    all_movie_properties_list = json.load(f)


    for i,item in enumerate(all_movie_properties_list):
        all_movie_properties_list[i]['entity'] = subtract_strings(all_movie_properties_list[i]['entity'],'http://www.wikidata.org/entity/' )
        P_values[all_movie_properties_list[i]['label']] = all_movie_properties_list[i]['entity']
read_p_values_list()
print(len(P_values))


420


In [18]:
# Test
relation_list = list(P_values.keys())
final_relation_res = find_closest_match_in_a_List('MPA film rating', relation_list)
p_val = P_values[final_relation_res['res']]
rdflib.term.URIRef(WDT[p_val])

rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P1657')

In [19]:
# Test
find_entity_given_label('Forrest Gump', Q_values['film'])

rdflib.term.URIRef('http://www.wikidata.org/entity/Q134773')

# Processing question


In [20]:
test_questions = ["Who is the director of Good Will Hunting?", "Who directed The Bridge on the River Kwai?", 
                    "Who is the director of Star Wars: Episode VI - Return of the Jedi?", "Who is the screenwriter of The Masked Gang: Cyprus?",
                    "What is the MPAA film rating of Weathering with You?", "What is the genre of Good Neighbors?", "Show me a picture of Halle Berry.",
                    "What does Julia Roberts look like?", "Let me know what Sandra Bullock looks like.", "Recommend movies similar to Hamlet and Othello.", "Can you recommend me movies similar to Hamlet and Othello",
                    "Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?",
                    "Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.",
                    "Can you tell me the publication date of Tom Meets Zizou?", "Who is the executive producer of X-Men: First Class?",
                    "Who is the director of Batman 1989?", "What is the box office of The Princess and the Frog?",
                   "What is the birthplace of Christopher Nolan?", "Can you recommend me some horror films?", "Who is the director of Spider-Man: Far from Home?"]

In [21]:
def make_questions_df(sample_q):
    questions_df = [{"query": s, "type" : "", "entity":[]}for s in sample_q]
    
    return questions_df
test_questions_df = make_questions_df(test_questions)


## Pattern Matching

## Name Entity Recognition


In [22]:

def init_nlp_pipelines():
    global ner_pipeline, pos_tagging_pipe
    tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
    model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")

    ner_pipeline = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    tokenizer_POS = AutoTokenizer.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
    model_POS = model = AutoModelForTokenClassification.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
    pos_tagging_pipe = pipeline("token-classification", model=model_POS, aggregation_strategy="simple", tokenizer = tokenizer_POS)
init_nlp_pipelines()

In [23]:
# Test
ner_pipeline(test_questions[10])

[{'entity_group': 'MISC',
  'score': 0.9420073,
  'word': 'Hamlet and Othello',
  'start': 38,
  'end': 57}]

In [24]:
# Find the type of question by keyword matching
# Can use calssifier for this
def find_type(formulated_question_df):
    keywords_images = [ 'image', 'picture', 'look', 'looks' ]
    keywords_recommendation = ['similar', 'recommend', 'recommendations']
    res_type = ''
    query_list = []
    for i in formulated_question_df['pos']:
        query_list.append(i['word']) 
    if any(word in query_list for word in keywords_images):
        res_type = "images"
    elif any(word in query_list for word in keywords_recommendation):
        res_type = "recommendation"
    else :
        res_type = "search"
            
    return res_type

# POS tagging for all of the questions
def run_NLP_on_question(questions_df):
    global ner_pipeline, pos_tagging_pipe
    for i in range (len(questions_df)):
        questions_df[i]["pos"] = pos_tagging_pipe(questions_df[i]["query"])
        questions_df[i]["ner"] = ner_pipeline(questions_df[i]["query"])
        for j,ner_res in enumerate(questions_df[i]["ner"]):
            if ner_res['score'] < 0.55:
#                 print('deleted ' + str (ner_res['word']))
                del questions_df[i]["ner"][j]
    
        # Add the type for all questions
        questions_df[i]['type'] = find_type(questions_df[i])
    
    return questions_df

# Test

test_questions_df = run_NLP_on_question(test_questions_df)                
test_questions_df[10]

{'query': 'Can you recommend me movies similar to Hamlet and Othello',
 'type': 'recommendation',
 'entity': [],
 'pos': [{'entity_group': 'AUX',
   'score': 0.99889153,
   'word': 'can',
   'start': 0,
   'end': 3},
  {'entity_group': 'PRON',
   'score': 0.9995004,
   'word': 'you',
   'start': 4,
   'end': 7},
  {'entity_group': 'VERB',
   'score': 0.9994892,
   'word': 'recommend',
   'start': 8,
   'end': 17},
  {'entity_group': 'PRON',
   'score': 0.9995322,
   'word': 'me',
   'start': 18,
   'end': 20},
  {'entity_group': 'NOUN',
   'score': 0.9981205,
   'word': 'movies',
   'start': 21,
   'end': 27},
  {'entity_group': 'ADJ',
   'score': 0.99660134,
   'word': 'similar',
   'start': 28,
   'end': 35},
  {'entity_group': 'ADP',
   'score': 0.9993135,
   'word': 'to',
   'start': 36,
   'end': 38},
  {'entity_group': 'PROPN',
   'score': 0.9973455,
   'word': 'hamlet',
   'start': 39,
   'end': 45},
  {'entity_group': 'CCONJ',
   'score': 0.9984086,
   'word': 'and',
   'start'

In [25]:
# find the entity for a question of type images
def get_entities_from_nlp_results(formulated_question_df):
    global ner_pipeline, pos_tagging_pipe, movies_list, humans_list, P_values
    res_entites = []
    
    # Look for Entity in NER results

    for i ,ner_res in enumerate(formulated_question_df['ner']):
        if ' and ' in ner_res['word'] and formulated_question_df['type'] in ['recommendation']:
            x = ner_res['word'].split(' and ')
            
            
            score = 0
            split_res = []
            for item in x:
                a = find_closest_match_in_a_List(item, movies_list)
                split_res.append(a)
                score = score + a['score']
            score = score/len(split_res)
            non_split_res = find_closest_match_in_a_List(ner_res['word'], movies_list)  
            
            if non_split_res == -1 or score > non_split_res['score']:
                # Add the split NER results to question
                start = ner_res['start'] 
                for i, itemx in enumerate(x):
                    
                    start = start + len(itemx) + len(' and ')
                    formulated_question_df['ner'].append({'entity_group': 'MISC',
                      'score': 0.99,
                      'word': itemx,
                      'start': start,
                      'end': start + len(itemx)})
                continue
        is_entity = False
        closest_match = {'res': '', 'res_ind': -1, 'score': 0}
        if ner_res['entity_group'] == 'MISC' or ner_res['entity_group'] == 'LOC' or ner_res['entity_group'] == 'ORG'or ner_res['entity_group'] == 'PER':
            
            # Find the best match in movies list
            
            movies_res = find_closest_match_in_a_List(str(ner_res['word']), movies_list)
            if movies_res != -1:                         
                if closest_match['score'] < movies_res['score']:
                    closest_match = movies_res
                    is_entity = True
            
            # Find the best match in humans list
            
            human_res = find_closest_match_in_a_List(str(ner_res['word']), humans_list)
            if human_res != -1:                              
                if closest_match['score'] < human_res['score']:
                    closest_match = human_res
                    is_entity = True
            
            # Find the best match in predicate list
            
            pred_res = find_closest_match_in_a_List(str(ner_res['word']), list(P_values.keys()))
            if pred_res != -1:                              
                if closest_match['score'] < pred_res['score']:
                    closest_match = pred_res
                    is_entity = False
            
                
        if is_entity and closest_match['score'] > 0.5:
            res_entites.append(ner_res['word'])
            
#             del formulated_question_df['ner'][i]
            is_entity = False            
    
    # Worst case scenario, no entities found
    if len(res_entites) == 0:
        
        potential_words = [ner_res['word'] for ner_res in formulated_question_df['ner'] ]
        for item in formulated_question_df['pos']:
            if item['entity_group'] in ['NOUN', 'PROPN']:
                potential_words.append(item['word'])
        
        # Type Search - means we are looking for a movie name 
        if formulated_question_df['type'] == 'search':
            closest_match = {'res': '', 'res_ind': -1, 'score': 0}
            
            for pot_word in potential_words:
                # Look for the best match of all potential words in the movie list
                movies_res = find_closest_match_in_a_List(str(pot_word), movies_list)
                if movies_res != -1:                         
                    if closest_match['score'] < movies_res['score']:
                        closest_match = movies_res
                        closest_match['res'] = pot_word
            
            if closest_match['score'] > 0:
                res_entites.append(closest_match['res'])
        
        # Type Image - means we are looking for a person
        elif formulated_question_df['type'] == 'image':
            closest_match = {'res': '', 'res_ind': -1, 'score': 0}
            
            for pot_word in potential_words:
                # Look for the best match of all potential words in the movie list
                movies_res = find_closest_match_in_a_List(str(pot_word), humans_list)
                if movies_res != -1:                         
                    if closest_match['score'] < movies_res['score']:
                        closest_match = movies_res
                        closest_match['res'] = pot_word
            
            if closest_match['score'] > 0:
                res_entites.append(closest_match['res'])
        
        elif formulated_question_df['type'] in ['recommendation', 'recommendation_genre']:
            genres = find_all_film_genres()
            genres_list = [str(x[1]) for x in genres]
            
            closest_match = {'res': '', 'res_ind': -1, 'score': 0}
            
            for pot_word in potential_words:
                # Look for the best match of all potential words in the genre list
                pot_word = pot_word.replace("movies", "")
                pot_word = pot_word.replace("Movies", "")
                pot_word = pot_word.replace("movie", "")
                pot_word = pot_word.replace("Movie", "")
                genre_res = find_closest_match_in_a_List(str(pot_word), genres_list)
                if genre_res != -1:                         
                    if closest_match['score'] < genre_res['score']:
                        closest_match = genre_res
                        
            
            
            if closest_match['score'] > 0:
                res_entites.append(closest_match['res'])
                formulated_question_df['type'] = 'recommendation_genre'
               
                
        
    
    # Handle entities split into 2
    
    if len(res_entites) > 1 and formulated_question_df['type'] != 'recommendation':
        concat = ''
        for item in res_entites:
            concat = concat + str(item)
        res_entites.append(concat)

        final_res = ''
        best_score = 0
        
#         print(res_entites)
        
        for i,res in enumerate(res_entites):
            temp = find_closest_match_in_a_List(res, movies_list)
            if temp != -1:
               # if scores are the similar, take the longer one
                if temp['score'] - best_score <= 0.08:
                    if len(temp['res']) > len(final_res):
                        final_res = res
                        
                        best_score = temp['score']  
                # if you find a better match for an entity, take that
                elif temp['score'] > best_score:
                    final_res = res
                    
                    best_score = temp['score'] 
                    
    
        
        formulated_question_df['entity'] = [final_res]
        
    else:
        
        formulated_question_df['entity'] = res_entites

    
    
    return formulated_question_df



for test_q_df in test_questions_df:
    test_q_df = get_entities_from_nlp_results(test_q_df)
    print(test_q_df['entity'])


['Good Will Hunting']
['The Bridge on the River Kwai']
['Star Wars: Episode VI - Return of the Jedi']
['The Masked Gang: Cyprus']
['Weathering with You?']
['Good Neighbors']
['Halle Berry']
['Julia Roberts']
['Sandra Bullock']
['Hamlet', 'Othello']
['Hamlet', 'Othello']
['The Lion King', 'Pocahontas', 'The Beauty and the Beast']
['Nightmare on Elm Street', 'Friday the 13th', 'Halloween']
['Tom Meets Zizou']
['X-Men: First Class']
['Batman']
['The Princess and the Frog']
['Christopher Nolan']
['horror film']
['Spider-Man: Far from Home']


In [26]:
def get_predicate_from_nlp(formulated_question):
    global P_values, pos_tagging_pipe
    string_res = str(formulated_question['query'])
    
    # process on all of the words that are not entity
    
    for entity_res in formulated_question['entity']:
        string_res = subtract_strings(string_res, str(entity_res))
        
        
#     print(string_res)
    pos_res = pos_tagging_pipe(string_res)
#     print(pos_res)
    # process on all of the words that are not the following
    
    other_pos_tags = ['PUNCT', 'ADP', 'DET', 'AUX', 'PRON']
    res_list = []
    for item in pos_res:
        if (item['entity_group']) not in other_pos_tags:
            res_list.append( string_res[item['start']:item['end']])
    concat = ''
    if len(res_list) > 1:
        for item in res_list:
            concat = concat + ' ' + str(item)
        res_list.append(concat)
#     print(res_list) 
    final_res = ''
    best_score = 0
    for i,res in enumerate(res_list):
        temp = find_closest_match_in_a_List(res, list(P_values.keys()))
        if temp != -1 :
            if temp['score'] - best_score <= 0.08:
                if len(temp['res']) > len(final_res):
                    final_res = temp['res']
                    best_score = temp['score']
            
            elif temp['score'] > best_score:
                final_res = temp['res']
                best_score = temp['score'] 
    
        
    return final_res
        
# Test

for i in range (len(test_questions_df)):
    if test_questions_df[i]['type'] == 'search':
        res = get_predicate_from_nlp(test_questions_df[i])
        print(res)


director
director
director
screenwriter
MPA film rating
genre
publication date
executive producer
director
box office
birth name
director


In [27]:
def get_key_of_dict(val, my_dict):
    for key, value in my_dict.items():
        if val == value:
            return key
 
    return -1

In [28]:
# Test
key = get_key_of_dict('P345', P_values)
key

'IMDB Id'

In [29]:
def init_images_json():
    global images_json
    f = open(os.path.join('..','dataset', 'movienet','images.json'))
    images_json = json.load(f)
init_images_json()

In [30]:
# main function to answer image questions

def handle_image_questions(formulated_question_df):
    
    global Q_values, movies_list, humans_list, images_json
    if formulated_question_df['type']!='images':
        return -1, -1

    # Can change to handle multiple entites but not necessary
    name = formulated_question_df['entity']
    if len(name)== 0:
        return -1, -1
    name = name[0]
    
    final_entity_name = '' 
    best_score = 0
    movie_res = find_closest_match_in_a_List(name, movies_list)
    if movie_res != -1 and movie_res['score'] > best_score:
        final_entity_name = movie_res['res']
        best_score = movie_res['score']
        entity_type = 'film'
    human_res = find_closest_match_in_a_List(name, humans_list)
    if human_res != -1 and human_res['score'] > best_score:
        final_entity_name = human_res['res']
        best_score = human_res['score']
        entity_type = 'human'


    if len(final_entity_name) == 0:
        final_entity_name = name
        entity_type = 'none'
        entity_URI = find_entity_given_label(final_entity_name, entity_type)
    else:
        entity_URI = find_entity_given_label(final_entity_name, Q_values[entity_type])

    # Return Image
    Imdb_ID, _ =  find_something_about_an_entity(entity_URI, WDT['P345'])
    Imdb_URI = IMDB[Imdb_ID] 
    
    if entity_type == 'film':
        img_key = 'movie'
    else:
        img_key = 'cast'
        
    image_to_return = '-1'
    
#     for data in images_json:
#         if str(Imdb_ID) in data[img_key]:
#             image_to_return = data['img']
    
    image_to_return = 'image:' + image_to_return
    return Imdb_URI, final_entity_name



for q in test_questions_df:
    if q['type']=='images':
        res = handle_image_questions(q)

        print(res)
        


(rdflib.term.URIRef('https://www.imdb.com/name/nm0000932'), 'Halle Berry')
(rdflib.term.URIRef('https://www.imdb.com/name/nm0000210'), 'Julia Roberts')
(rdflib.term.URIRef('https://www.imdb.com/name/nm0000113'), 'Sandra Bullock')


# Embeddings and Recommendations

In [31]:
def load_embeddings(graph):
    global entity_emb, relation_emb, id2ent, id2rel, label2ent, ent2id, ent2label, rel2id
    entity_emb = np.load(os.path.join('..', 'dataset','embeddings', 'entity_embeds.npy'))
    relation_emb = np.load(os.path.join('..', 'dataset','embeddings', 'relation_embeds.npy'))

    # load the dictionaries
    with open(os.path.join('..', 'dataset','embeddings', 'entity_ids.del'), 'r') as ifile:
        ent2id = {rdflib.term.URIRef(ent): int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
        id2ent = {v: k for k, v in ent2id.items()}
    with open(os.path.join('..', 'dataset','embeddings', 'relation_ids.del'), 'r') as ifile:
        rel2id = {rdflib.term.URIRef(rel): int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
        id2rel = {v: k for k, v in rel2id.items()}

    ent2label = {ent: str(lbl) for ent, lbl in graph.subject_objects(RDFS.label)}
    label2ent = {lbl: ent for ent, lbl in ent2label.items()}

def find_similar_entities_for_id(id, n = 4):
    global entity_emb, relation_emb, id2ent, id2rel, label2ent, ent2id, ent2label, rel2id
    entity = ent2id[WD[id]] 
    
    # compute distance with other entities
    dist = pairwise_distances(entity_emb[entity].reshape(1, -1), entity_emb).reshape(-1)
    best_matches = dist.argsort()

    best_matches = best_matches[1:20]
    best_matches = best_matches[1:n]
    
    res_list = []
    for i,idx in enumerate(best_matches):
        res_list.append(ent2label[id2ent[idx]])
        
    return res_list

In [32]:
global graph
load_embeddings(graph)

In [33]:
def find_similar_entities_for_multiple(entity_ids):
    global entity_emb, relation_emb, id2ent, id2rel, label2ent, ent2id, ent2label, rel2id
    
    entities = [ent2id[WD[id]] for id in entity_ids]
    
    entity_embs = np.array([entity_emb[entity].reshape(1, -1) for entity in entities ])
#     print((entity_embs.shape))
    sum_emb = entity_embs.sum(axis=0)
    # Calculate average of all of the entities
    avg_entity_embedding = sum_emb/entity_embs.shape[0]

 # Find the closest match to the average
    dist = pairwise_distances(avg_entity_embedding, entity_emb).reshape(-1)
    
    best_matches = dist.argsort()
    if len(best_matches)> 20:
        best_matches = best_matches[1:20]
    
    res_list = []
    for i,idx in enumerate(best_matches):
        res_list.append(ent2label[id2ent[idx]])
        
    return res_list

Q164963 - two tower
Q102225 - HP Goblet
Q1199283 - HP Half blood
Q131074 - return of kings
Q471169 - Harry met sally

In [34]:

def handle_recommendation_questions(formulated_question, num_recommendations = 4):
    
    final_res_names = []
    question_entities = formulated_question['entity']
    if len(question_entities) == 0:
        return -1
    if formulated_question['type'] == 'recommendation_genre':
        genre_name = question_entities[0]
        genres = find_all_film_genres()
        genres_list = [str(x[1]) for x in genres] 
        genre_res  = find_closest_match_in_a_List(genre_name, genres_list)
        genre_q_val = genres[genre_res['res_ind']]
        genre_q_val = genre_q_val[0][len(WD):]
        query_res = find_movies_of_genre_by_rating(genre_q_val, n=num_recommendations)
        if query_res == -1:
            return -1
        final_res_names = [str(res[1]) for res in query_res]
        
        
    else:

        movie_names = [movie for movie in question_entities ]

        for i,movie_name in enumerate(movie_names):

            res = find_closest_match_in_a_List(str(movie_name), movies_list)

            if res != -1:
                movie_names[i] = res['res']

        entity_names = [ find_entity_given_label(movie_name, 'film') for movie_name in movie_names]


        entity_q_list = [subtract_strings(entity, 'http://www.wikidata.org/entity/') for entity in entity_names]

        recommendation_label_list = find_similar_entities_for_multiple(entity_q_list)

        for item in recommendation_label_list:
            if item not in movie_names:
                final_res_names.append(item)
    
    if len(final_res_names) > num_recommendations:
        final_res_names = final_res_names[:num_recommendations]
    
    return final_res_names
    
for q in test_questions_df:
    if q['type'] in ['recommendation', 'recommendation_genre']:
        res = handle_recommendation_questions(q)
        print(q['query'])
        print(res)

Recommend movies similar to Hamlet and Othello.
['A Room with a View', 'Sense and Sensibility', 'The Tempest', 'Dancing at Lughnasa']
Can you recommend me movies similar to Hamlet and Othello
['A Room with a View', 'Sense and Sensibility', 'The Tempest', 'Dancing at Lughnasa']
Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?
['Tarzan', 'Treasure Planet', 'Moana', 'Aladdin']
Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.
['Texas Chainsaw 3D', 'Final Destination 3', 'The First Purge', 'The Texas Chainsaw Massacre']
Can you recommend me some horror films?
['The Shining', 'The Thing', "Rosemary's Baby", 'Donnie Darko']


In [35]:

def check_embeddings_for_errors(entity,relation,query_res = -1):
    # "Jean Van Hamme" entity
    if WD[entity] in ent2id.keys():
        head = entity_emb[ent2id[WD[entity]]]
    else:
        return False, False
    # "occupation" relation
    if WDT[relation]in rel2id.keys():
        pred = relation_emb[rel2id[WDT[relation]]]
    else:
        return False, False
    
    # add vectors according to TransE scoring function.
    lhs = head + pred
    # compute distance to *any* entity
    dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
    # find most plausible entities
    most_likely = dist.argsort()
    # compute ranks of entities
    ranks = dist.argsort().argsort()
    if query_res != -1:
        res = pd.DataFrame([(str(lbl), dist[ent2id[WD[ent]]], ranks[ent2id[WD[ent]]]) for ent, lbl in query_res],
            columns=('Occupation', 'Score', 'Rank'))
    else:
        res = -1
    most_likely = pd.DataFrame([
        (id2ent[idx][len(WD):], ent2label[id2ent[idx]], dist[idx], rank+1)
        for rank, idx in enumerate(most_likely[:10])],
        columns=('Entity', 'Label', 'Score', 'Rank'))
    return res, most_likely

In [36]:
# Test
res, most_likely = check_embeddings_for_errors('Q181803', 'P57',{('Q471402','Richard Marquand')})
most_likely

Unnamed: 0,Entity,Label,Score,Rank
0,Q311319,Frank Oz,2904.539551,1
1,Q38222,George Lucas,2919.880371,2
2,Q240872,Lawrence Kasdan,2987.016113,3
3,Q1150882,Duwayne Dunham,2995.905762,4
4,Q3399754,Richard Driscoll,2996.198486,5
5,Q471402,Richard Marquand,3043.516602,6
6,Q203960,James Earl Jones,3047.783203,7
7,Q3020806,Debbie Lee Carrington,3059.332031,8
8,Q6848487,Mike Quinn,3061.999756,9
9,Q128379,David Prowse,3068.833496,10


## Crowd Source data

In [37]:
global crowd_df
crowd_df = pd.read_csv((os.path.join('..', 'dataset','crowd_data', 'crowd_data.tsv')),sep='\t')
crowd_df['LifetimeApprovalRate'] = crowd_df['LifetimeApprovalRate'].str.rstrip("%").astype(int)
crowd_df.head()


Unnamed: 0,HITId,HITTypeId,Title,Reward,AssignmentId,WorkerId,AssignmentStatus,WorkTimeInSeconds,LifetimeApprovalRate,Input1ID,Input2ID,Input3ID,AnswerID,AnswerLabel,FixPosition,FixValue
0,1,7QT,Is this triple correct or incorrect?,$0.50,1,2133ICYWE97,Submitted,60,99,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
1,1,7QT,Is this triple correct or incorrect?,$0.50,2,2133U7HKDLO,Submitted,40,40,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,yes,yes
2,1,7QT,Is this triple correct or incorrect?,$0.50,3,928UJANWZ12,Submitted,50,98,wd:Q11621,wdt:P2142,792910554,2.0,INCORRECT,,
3,1,7QT,Is this triple correct or incorrect?,$0.50,4,1726JMZQW,Submitted,80,70,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
4,1,7QT,Is this triple correct or incorrect?,$0.50,5,2134U7HKDMM,Submitted,2,70,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,


In [38]:
print(crowd_df.shape)
def get_rid_of_malicious_workers():
    for index, row in crowd_df.iterrows(): 
        if ((row['LifetimeApprovalRate'] < 50 ) or (row['WorkTimeInSeconds'] < 5) ) :
            crowd_df.drop(index, inplace=True)

get_rid_of_malicious_workers()
print(crowd_df.shape)


(305, 16)
(203, 16)


In [39]:
global distinct_question_pairs
q="""SELECT DISTINCT Input1ID, Input2ID, Input3ID, HITTypeId FROM crowd_df;"""
pysqldf = lambda q: sqldf(q, globals())
distinct_question_pairs = pysqldf(q)
len(distinct_question_pairs)

60

wd:Q1339195	ddis:indirectSubclassOf	wd:Q27096213

In [40]:
def get_val_pos_for_distinct_questions():
    resolution_pos_val = []
    global crowd_df, distinct_questions_pair
    for index, row in distinct_question_pairs.iterrows():
        entity = row['Input1ID']
        relation = row['Input2ID']
        count = 0 
        
        for ind_main, row_main in crowd_df.iterrows():
            if (row_main['Input1ID'] == row['Input1ID']) and (row_main['Input2ID'] == row['Input2ID']) and (row_main['AnswerID'] == 2.0):
                
                if row_main['FixPosition'] in ['Object', 'Subject', 'Predicate'] and row_main['FixValue'] is not np.nan:
                    count = count + 1
                    resolution_pos_val.append((index, row_main['FixPosition'], row_main['FixValue']))
    unique_correction = list(set(resolution_pos_val))

    unique_correction.sort(key = lambda x: x[0])
    
    for i, item in enumerate(unique_correction):
        count = resolution_pos_val.count(item)
        unique_correction[i] = (item, count)

    return unique_correction
print(get_val_pos_for_distinct_questions()) 


[((2, 'Object', '2014-02-18'), 1), ((5, 'Object', '2019-02-24'), 1), ((7, 'Object', '698491348'), 1), ((10, 'Object', 'wd:Q72'), 1), ((12, 'Object', 'wd:Q94074'), 1), ((16, 'Object', 'wd:Q95073'), 1), ((20, 'Object', '2010-01-01'), 2), ((21, 'Object', '2011-01-01'), 3), ((22, 'Object', '176997168'), 2), ((23, 'Object', '1992-01-01'), 1), ((24, 'Object', '2011-01-01'), 3), ((25, 'Object', '863756051'), 2), ((27, 'Object', 'Q5423258'), 3), ((29, 'Object', '10696210'), 1), ((30, 'Subject', 'Q1168152'), 2), ((31, 'Object', '2015-08-27'), 1), ((32, 'Subject', 'Q908556'), 3), ((33, 'Subject', 'Q1471671'), 2), ((34, 'Subject', 'Q1722254'), 2), ((35, 'Subject', 'Q15052538'), 1), ((36, 'Subject', 'Q409022'), 2), ((37, 'Subject', 'Q27703272'), 1), ((38, 'Predicate', 'P106'), 2), ((39, 'Predicate', 'P161'), 2), ((40, 'Predicate', 'P344'), 2), ((41, 'Predicate', 'P58'), 3), ((42, 'Object', 'Q17350908'), 3), ((43, 'Object', 'Q3194791'), 2), ((44, 'Object', 'Q28732985'), 2), ((45, 'Predicate', 'P58'

In [41]:
def calculate_agreeability_in_crowd():
    global distinct_question_pairs
    score_list = []
    cor_count_list = []
    inc_count_list = []
    batches = distinct_question_pairs['HITTypeId'].unique()
    for index, row in distinct_question_pairs.iterrows():
        entity = row['Input1ID']
        relation = row['Input2ID']
        cor_count = 0
        inc_count = 0
        entiy_rel_rows = crowd_df.loc[(crowd_df['Input1ID'] == entity) & (crowd_df['Input2ID'] == relation)]
        for index, rows in entiy_rel_rows.iterrows():
            if rows['AnswerID'] == 2.0:
                inc_count += 1
            elif rows['AnswerID'] == 1.0:
                cor_count += 1
        score = cor_count/(cor_count + inc_count)
        score_list.append(score)
        cor_count_list.append(cor_count)
        inc_count_list.append(inc_count)
        
        
#         print("Score for " + str(entity) + " and " + str(relation) + " = " + str(score))

    distinct_question_pairs['Score'] = score_list
    distinct_question_pairs['NumCorrect'] = cor_count_list
    distinct_question_pairs['NumIncorrect'] = inc_count_list
    kappa_list = []
    for batch in batches:
        p_e = 0
        p_o = 0
        num_q = 0
        sum_corr = 0
        sum_inc = 0
        num_raters = 0
        for index, row in distinct_question_pairs.iterrows():
             
            if str(row['HITTypeId']) == str(batch):
                num_raters = row['NumCorrect'] + row['NumIncorrect']
                num_q = num_q +1
                sum_corr = sum_corr + row['NumCorrect']
                sum_inc = sum_corr + row['NumIncorrect']
                p_o = p_o + np.power(row['NumCorrect'], 2) + np.power(row['NumIncorrect'], 2)
                
        total = sum_corr + sum_inc
        sum_corr = sum_corr/total
        sum_inc = sum_inc/total
        p_e = np.power(sum_corr, 2) + np.power(sum_inc, 2)
        p_o = (p_o - (num_q*num_raters))/(num_q*num_raters*(num_raters-1))
        kappa = (p_o - p_e)/(1-p_e)
        kappa_list = kappa_list + [kappa]*num_q
    distinct_question_pairs['Kappa'] = kappa_list
calculate_agreeability_in_crowd() 
distinct_question_pairs

Unnamed: 0,Input1ID,Input2ID,Input3ID,HITTypeId,Score,NumCorrect,NumIncorrect,Kappa
0,wd:Q11621,wdt:P2142,792910554,7QT,0.666667,2,1,0.649697
1,wd:Q603545,wdt:P2142,4300000,7QT,1.0,3,0,0.649697
2,wd:Q16911843,wdt:P577,2014-01-18,7QT,0.666667,2,1,0.649697
3,wd:Q132863,wdt:P2142,969023261,7QT,0.0,0,3,0.649697
4,wd:Q1628022,wdt:P577,1951-01-01,7QT,1.0,3,0,0.649697
5,wd:Q48313910,wdt:P577,2018-02-24,7QT,0.333333,1,2,0.649697
6,wd:Q171300,wdt:P2142,267000000,7QT,0.666667,2,1,0.649697
7,wd:Q217010,wdt:P2142,698491347,7QT,0.0,0,3,0.649697
8,wd:Q4993462,wdt:P577,2008-08-29,7QT,0.0,0,3,0.649697
9,wd:Q7033842,wdt:P577,1996-06-01,7QT,0.333333,1,2,0.649697


In [42]:
distinct_question_pairs['HITTypeId'].unique()

array(['7QT', '8QT', '9QT'], dtype=object)

In [43]:
# find_something_about_an_entity(entity_URI, relation_URI)
def find_graph_answers_to_crowd_data():
    global distinct_question_pairs
    query_res_list = []
    for index, row in distinct_question_pairs.iterrows():
        # Proces all crowd data where the workers label the 
        
        entity = row['Input1ID']
        relation = row['Input2ID']
        crowd_res = row['Input3ID']
        query_res = find_something_about_an_entity(WD[entity[3:]], WDT[relation[4:]])
        if query_res[0] == -1 or query_res[0] == None:
            query_res_list.append('-1')
        elif relation in ['wdt:P577', 'wdt:P2142']:
            query_res_list.append(str(query_res[0]))
        else:
            query_res_list.append(str(query_res[0][len(WD):]))
    
        
    distinct_question_pairs['QueryRes'] = query_res_list        
        
find_graph_answers_to_crowd_data()
distinct_question_pairs

Unnamed: 0,Input1ID,Input2ID,Input3ID,HITTypeId,Score,NumCorrect,NumIncorrect,Kappa,QueryRes
0,wd:Q11621,wdt:P2142,792910554,7QT,0.666667,2,1,0.649697,-1
1,wd:Q603545,wdt:P2142,4300000,7QT,1.0,3,0,0.649697,-1
2,wd:Q16911843,wdt:P577,2014-01-18,7QT,0.666667,2,1,0.649697,-1
3,wd:Q132863,wdt:P2142,969023261,7QT,0.0,0,3,0.649697,-1
4,wd:Q1628022,wdt:P577,1951-01-01,7QT,1.0,3,0,0.649697,-1
5,wd:Q48313910,wdt:P577,2018-02-24,7QT,0.333333,1,2,0.649697,-1
6,wd:Q171300,wdt:P2142,267000000,7QT,0.666667,2,1,0.649697,-1
7,wd:Q217010,wdt:P2142,698491347,7QT,0.0,0,3,0.649697,-1
8,wd:Q4993462,wdt:P577,2008-08-29,7QT,0.0,0,3,0.649697,-1
9,wd:Q7033842,wdt:P577,1996-06-01,7QT,0.333333,1,2,0.649697,-1


[((2, 'Object', '2014-02-18'), 1), ((5, 'Object', '2019-02-24'), 1), ((7, 'Object', '698491348'), 1), ((10, 'Object', 'wd:Q72'), 1), ((12, 'Object', 'wd:Q94074'), 1), ((16, 'Object', 'wd:Q95073'), 1), ((20, 'Object', '2010-01-01'), 2), ((21, 'Object', '2011-01-01'), 3), ((22, 'Object', '176997168'), 2), ((23, 'Object', '1992-01-01'), 1), ((24, 'Object', '2011-01-01'), 3), ((25, 'Object', '863756051'), 2), ((27, 'Object', 'Q5423258'), 3), ((29, 'Object', '10696210'), 1), ((30, 'Subject', 'Q1168152'), 2), ((31, 'Object', '2015-08-27'), 1), ((32, 'Subject', 'Q908556'), 3), ((33, 'Subject', 'Q1471671'), 2), ((34, 'Subject', 'Q1722254'), 2), ((35, 'Subject', 'Q15052538'), 1), ((36, 'Subject', 'Q409022'), 2), ((37, 'Subject', 'Q27703272'), 1), ((38, 'Predicate', 'P106'), 2), ((39, 'Predicate', 'P161'), 2), ((40, 'Predicate', 'P344'), 2), ((41, 'Predicate', 'P58'), 3), ((42, 'Object', 'Q17350908'), 3), ((43, 'Object', 'Q3194791'), 2), ((44, 'Object', 'Q28732985'), 2), ((45, 'Predicate', 'P58'), 3), ((46, 'Subject', 'Q268905'), 1), ((47, 'Subject', 'Q16795448'), 1), ((52, 'Predicate', 'P19'), 1), ((53, 'Predicate', 'P17'), 1), ((54, 'Predicate', 'P19'), 1), ((55, 'Object', 'Q181900'), 1), ((56, 'Object', 'Q7360827'), 1), ((57, 'Object', 'Q7488442'), 1), ((58, 'Object', 'Q884'), 1), ((59, 'Object', 'Q1860'), 1)]

In [44]:
# Add tuples to main graph depending on if they are literal or not
def add_tuple_to_graph(ip_tuple):
    global graph
    
    sub = ip_tuple[0][3:]
    pred = ip_tuple[1][4:]
    
    if 'wd:' in str(ip_tuple[2]):
        obj = ip_tuple[2][3:]
        graph.add((WD[sub], WDT[pred], WD[obj]))
        return (WD[sub], WDT[pred], WD[obj])
    else:
        obj = str(ip_tuple[2])
        graph.add((WD[sub], WDT[pred], Literal(obj))) 
        return (WD[sub], WDT[pred], Literal(obj))
    
def remove_tuple_from_graph(ip_tuple):
    global graph
    sub = None
    pred = None
    obj = None
    if ip_tuple[0] != None:
        sub = WD[ip_tuple[0][3:]]
    if ip_tuple[1] != None:
        pred = WDT[ip_tuple[1][4:]]
    
    if ip_tuple[2] != None:
        if 'wd:' in str(ip_tuple[2]):
            obj = ip_tuple[2][3:]

        else:
            obj = str(ip_tuple[2])
    graph.remove((sub,pred,obj))
    



In [45]:
entity = 'wd:Q1410031'
relation = 'wdt:P577'
print(find_something_about_an_entity(WD[entity[3:]], WDT[relation[4:]]))


(rdflib.term.Literal('2010-10-01', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#date')), None)


In [46]:
global final_crowd_edits
def fix_graph_with_crowd():
    global graph, distinct_question_pairs, final_crowd_edits
    count = 0
    pos_val_tuples = get_val_pos_for_distinct_questions()
    final_crowd_edits = []
    for index, row in distinct_question_pairs.iterrows():
        # Update graph to add SPO that are missing 
        if str(row['QueryRes']) in ['-1', '', 'None']:
            # Add missing where majority workers said the Task SPO was correct
            if row['Score'] > 0.6 :
                # Add to graph
                print('Add to graph : '+ '(' + str(row['Input1ID']) + ','+ str(row['Input2ID']) + ','+ str(row['Input3ID']) + ')')
                
                add_tuple_to_graph((str(row['Input1ID']),str(row['Input2ID']),str(row['Input3ID']) ))
                final_crowd_edits.append((str(row['Input1ID']),str(row['Input2ID']), row['Kappa'], row['NumCorrect'], row['NumIncorrect']))
            
            
            # Add missing values where majority said the Task SPO was incorrect
        for item in pos_val_tuples:
            if item[0][0] == index and row['Score'] < 0.3 and item[1] >= 2:
        
                # Process Object tuples
                if str(item[0][1]) == 'Object':
                    graph_val =  row['QueryRes']
                    # If the graph value is missing, just add it
                    if str(graph_val) in ['-1', 'None', '']:
                        print('Add missing Object to graph - '+str(item) + ' graph = ' + str(graph_val) + ' ' + str(item[0][2]))
                        add_tuple_to_graph( ( str(row['Input1ID']),str(row['Input2ID']),str(item[0][2]) ) )
                        final_crowd_edits.append((str(row['Input1ID']),str(row['Input2ID']) , row['Kappa'], row['NumCorrect'], row['NumIncorrect']))
                        
                    # If Graph has a different value, update it
                    elif str(graph_val) != str(item[0][2]):
                        print('Update graph to - '+str(item)+ ' org val = '+str(graph_val))
                        remove_tuple_from_graph( (str(row['Input1ID']) , str(row['Input2ID']),  None) )
                        add_tuple_to_graph( ( str(row['Input1ID']),str(row['Input2ID']),str(item[0][2]) ) )
                        final_crowd_edits.append((str(row['Input1ID']),str(row['Input2ID']), row['Kappa'], row['NumCorrect'], row['NumIncorrect']))
                        
                elif str(item[0][1]) == 'Predicate':
                    graph_query = find_something_about_an_entity(WD[row['Input1ID']], WDT[item[0][2]])
                    # If suggested tuple is missing in Graph, add it
                    if graph_query[0] == -1:
                        print('Add missing Predicate to graph ' + str(item))
                        remove_tuple_from_graph( ( str(row['Input1ID']), None, str(row['Input3ID']) ) )
                        add_tuple_to_graph((str(row['Input1ID']),str(item[0][2]), str(row['Input3ID']) ))
                        final_crowd_edits.append((str(row['Input1ID']),str(item[0][2]), row['Kappa'], row['NumCorrect'], row['NumIncorrect']))
                        
                        
                elif str(item[0][1]) == 'Subject':
                    graph_query = find_something_about_an_entity(WD[str(item[0][2])], WDT[row['Input2ID']])
                    if graph_query[0] == -1:
                        print('Add missing Subject to graph ' + str(item))
                        add_tuple_to_graph((str(item[0][2]), str(row['Input2ID']), str(row['Input3ID']) ))
                        final_crowd_edits.append( (str(item[0][2]), str(row['Input2ID']) , row['Kappa'],row['NumCorrect'], row['NumIncorrect'])) 
                                                 
                    elif str(graph_query[0][len(WD):]) != row['Input3ID'] :
                        print('Update Graph Subject '+ str(item))
                        remove_tuple_from_graph( (None , str(row['Input2ID']),  str(row['Input3ID']) ) )
                        add_tuple_to_graph((str(item[0][2]), str(row['Input2ID']), str(row['Input3ID']) ))
                        final_crowd_edits.append((str(item[0][2]), str(row['Input2ID']) , row['Kappa'] , row['NumCorrect'], row['NumIncorrect']))
                        
                
                
                # Find correct answer in 
    
global graph
print(len(graph))        
fix_graph_with_crowd()
print(len(graph)) 
# org len 2056777
# edit len 2056763

2056777
Add to graph : (wd:Q11621,wdt:P2142,792910554)
Add to graph : (wd:Q603545,wdt:P2142,4300000)
Add to graph : (wd:Q16911843,wdt:P577,2014-01-18)
Add to graph : (wd:Q1628022,wdt:P577,1951-01-01)
Add to graph : (wd:Q171300,wdt:P2142,267000000)
Add to graph : (wd:Q1339195,ddis:indirectSubclassOf,wd:Q27096213)
Add to graph : (wd:Q104649845,ddis:indirectSubclassOf,wd:Q43229)
Add to graph : (wd:Q61928601,ddis:indirectSubclassOf,wd:Q95074)
Add to graph : (wd:Q8275050,ddis:indirectSubclassOf,wd:Q47461344)
Add to graph : (wd:Q17710986,ddis:indirectSubclassOf,wd:Q7725310)
Update graph to - ((21, 'Object', '2011-01-01'), 3) org val = 2001-01-01
Update graph to - ((24, 'Object', '2011-01-01'), 3) org val = 2010-10-01
Update graph to - ((27, 'Object', 'Q5423258'), 3) org val = Q1149489
Add missing Subject to graph ((32, 'Subject', 'Q908556'), 3)
Add to graph : (wd:Q814781,wdt:.P344,wd:Q40087803)
Add missing Predicate to graph ((40, 'Predicate', 'P344'), 2)
Add missing Predicate to graph ((41,

In [47]:
final_crowd_edits

[('wd:Q11621', 'wdt:P2142', 0.6496969696969697, 2, 1),
 ('wd:Q603545', 'wdt:P2142', 0.6496969696969697, 3, 0),
 ('wd:Q16911843', 'wdt:P577', 0.6496969696969697, 2, 1),
 ('wd:Q1628022', 'wdt:P577', 0.6496969696969697, 3, 0),
 ('wd:Q171300', 'wdt:P2142', 0.6496969696969697, 2, 1),
 ('wd:Q1339195', 'ddis:indirectSubclassOf', 0.6496969696969697, 2, 1),
 ('wd:Q104649845', 'ddis:indirectSubclassOf', 0.6496969696969697, 6, 0),
 ('wd:Q61928601', 'ddis:indirectSubclassOf', 0.6496969696969697, 2, 1),
 ('wd:Q8275050', 'ddis:indirectSubclassOf', 0.6496969696969697, 3, 0),
 ('wd:Q17710986', 'ddis:indirectSubclassOf', 0.6496969696969697, 2, 1),
 ('wd:Q598752', 'wdt:P577', 0.06528395061728387, 0, 3),
 ('wd:Q1410031', 'wdt:P577', 0.06528395061728387, 0, 3),
 ('wd:Q639070', 'wdt:P161', 0.06528395061728387, 0, 3),
 ('Q908556', 'wdt:P161', 0.06528395061728387, 0, 3),
 ('wd:Q814781', 'wdt:.P344', 0.06528395061728387, 2, 1),
 ('wd:Q1780602', 'P344', 0.06590413943355117, 1, 3),
 ('wd:Q610633', 'P58', 0.0659

In [48]:
print(find_something_about_an_entity(WD['Q598752'], WDT['P577']))
print(find_something_about_an_entity(WD['Q11621'], WDT['P2142']) )
print(find_something_about_an_entity(WD['Q1410031'], WDT['P577']) )
print(find_something_about_an_entity(WD['Q1628022'], WDT['P577']) )
print(find_something_about_an_entity(WD['Q16911843'], WDT['P577']) )
print(find_something_about_an_entity(WD['Q31202708'], WDT['P58']) )

(rdflib.term.Literal('2011-01-01'), None)
(rdflib.term.Literal('792910554'), None)
(rdflib.term.Literal('2011-01-01'), None)
(rdflib.term.Literal('1951-01-01'), None)
(rdflib.term.Literal('2014-01-18'), None)
(rdflib.term.URIRef('http://www.wikidata.org/entity/Q15808338'), rdflib.term.Literal('Eric Warren Singer', lang='en'))


In [49]:
URI, label = (rdflib.term.Literal('1951-01-01'), None)
URI, label = (rdflib.term.URIRef('http://www.wikidata.org/entity/Q15808338'), rdflib.term.Literal('Eric Warren Singer', lang='en'))
if type(URI) == Literal:
    print('ye')

In [50]:
def deal_with_KG_query(formulated_question):
    
    global final_crowd_edits, movies_list, humans_list, Q_values, P_values
    crowdres = (-1, -1, -1)
    if formulated_question['type'] != 'search':
        return -1, -1, -1, -1, -1, -1, -1,crowdres
    entity_type = ''
    entity = ''
    # Process Entity 
        
    if len(formulated_question['entity']) > 0:
        entity = formulated_question['entity'][0]
        
        closest_match = {'res': '', 'res_ind': -1, 'score': 0} 
        
        movies_res = find_closest_match_in_a_List(str(entity), movies_list)
        if movies_res != -1:                         
            if closest_match['score'] < movies_res['score']:
                closest_match = movies_res
                entity_type = 'film'

        human_res = find_closest_match_in_a_List(str(entity), humans_list)
        if human_res != -1:                              
            if closest_match['score'] < human_res['score']:
                closest_match = human_res
                entity_type = 'human'
        
        entity = closest_match['res']


    if entity_type == 'human' or entity_type == 'film':
        entity_URI = find_entity_given_label(entity, Q_values[entity_type])
    else:
        entity_URI = find_entity_given_label(entity, 'none')

    # Process Relation 
    relation = get_predicate_from_nlp(formulated_question)

    if relation == -1:
        return -1, -1, -1, -1, -1, -1, -1, crowdres

    if relation in list(P_values.keys()):
        p_val = P_values[relation]
        relation_URI = rdflib.term.URIRef(WDT[p_val])
    else:
        relation_URI = find_entity_given_label(relation)
        if relation_URI == -1:
            return -1, -1, -1, -1, -1, -1, -1, crowdres

    query_res_URI, query_res_label =  find_something_about_an_entity(entity_URI, relation_URI)
    if type(query_res_URI)== Literal:
        query_res_label = query_res_URI
    
    
    
    # Check Crowd Data for the question
    if entity_URI == -1 or relation_URI == -1:
        return -1, -1, -1, -1, -1, -1, -1, crowdres

    for item in final_crowd_edits:
        # If the question is in final crowd edits, 
        if str(item[0][3:]) == str(entity_URI[len(WD):]) and str(item[1][4:]) == str(relation_URI[len(WDT):]):
            crowdres = (item[2], item[3], item[4])
            
            
    
    # Check Embeddings for the question
    if query_res_URI != -1:
        query_res_Q_val = query_res_URI[len(WD):]
        emb_res, emb_most_likely = check_embeddings_for_errors(entity_URI[len(WD):], relation_URI[len(WDT):],{(query_res_Q_val,query_res_label)})
    else:
        emb_res, emb_most_likely = check_embeddings_for_errors(entity_URI[len(WD):], relation_URI[len(WDT):],{('Q329737', 'butcher')})
        
        # If Embeddings exist for the query
        if not isinstance(emb_res, bool):
            return -1, -1, WD[emb_most_likely.loc[:,"Entity"].values[0]], emb_most_likely.loc[:,"Label"].values[0], 1, entity, relation, crowdres
    
    
            
    
    # If entity or relation are not found in embeddings
    if isinstance(emb_res, bool):
        # Default return
        return query_res_URI, query_res_label, -1, -1, -1, entity, relation, crowdres
    
    rank = emb_res.loc[:,"Rank"].values[0]

    return  query_res_URI, query_res_label, WD[emb_most_likely.loc[:,"Entity"].values[0]], emb_most_likely.loc[:,"Label"].values[0], rank, entity, relation, crowdres

        

    



    
for test_q_df in test_questions_df:
# test_q_df = questions_df[12]
    if test_q_df['type'] == 'search':
# print(test_q_df['entity'])
        print(test_q_df['query'])
        res = deal_with_KG_query(test_q_df)
        if res[4] > 5:
            print('the result acc to KG is ' + str(res[1]) + ' .But it has a rank of ' + str(res[4]) + ' according to embeddings. Acc to emb the res is '+ str(res[3]))
        elif res[0] == -1:
            print('The result was not found in the KG but the approximate answer according to mebeddings is '+ str(res[3]))
            
        else:
            print('the result is '+ str(res[1]))
          


Who is the director of Good Will Hunting?
the result is Gus Van Sant
Who directed The Bridge on the River Kwai?
the result is David Lean
Who is the director of Star Wars: Episode VI - Return of the Jedi?
the result is Richard Marquand
Who is the screenwriter of The Masked Gang: Cyprus?
The result was not found in the KG but the approximate answer according to mebeddings is Cengiz Küçükayvaz
What is the MPAA film rating of Weathering with You?
the result acc to KG is NC-17 .But it has a rank of 6 according to embeddings. Acc to emb the res is PG-13
What is the genre of Good Neighbors?
the result acc to KG is art film .But it has a rank of 33 according to embeddings. Acc to emb the res is drama
Can you tell me the publication date of Tom Meets Zizou?
the result is 2011-01-01
Who is the executive producer of X-Men: First Class?
the result acc to KG is Sheryl Lee Ralph .But it has a rank of 27 according to embeddings. Acc to emb the res is Ashley Miller
Who is the director of Batman 1989?


## Chat Bot Answers

In [57]:
def get_chat_answer(message):
    question = make_questions_df(message)

    question = run_NLP_on_question(question)
    question = question[0]
    question = get_entities_from_nlp_results(question)
    result = []
#     print(question['type'])
    print(question['entity'])
    if question['type'] == 'images':
        result = handle_image_questions(question)
        if result[0] !=-1:
            final_message = 'You can find what ' + str(result[1]) + ' looks like and more at : ' + str(result[0]) + ' '
        else:
            final_message = 'Sorry! I could not find what you are looking for. Could you please rephrase your question?'
        
    elif question['type'] in ['recommendation', 'recommendation_genre']:
        result = handle_recommendation_questions(question, num_recommendations = 4)
        if result == -1:
            final_message = 'Sorry! I could not find what you are looking for. Could you please rephrase your question?'
        elif len(result) >= 3:
            n = len(result)
            final_message = 'I would recommend you : '
            for x in result[:-2]:
                final_message  = final_message + str(x) + ', '
            final_message = final_message + str(result[len(result)-2])
            final_message = final_message + ' and ' + str(result[len(result)-1])
        elif len(result) < 3 and len(result)>0:
            final_message = 'I would recommend you : ' + str(result[0]) + ' and ' + str(result[1])
        elif len(result) ==1 :
            final_message = 'I would recommend you : ' + str(result[0])
        else:
            final_message = 'Sorry! I could not find what you are looking for. Could you please rephrase your question?'
            
    if question['type'] == 'search':
        result = deal_with_KG_query(question)
        
        if result[7][0]!=-1:
            final_message = 'The ' + str(result[6]) + ' of '+ str(result[5]) +' is '+ str(result[1]) + ' accodring to the crowd sourced data with an inter-rater Fleiss Kappa rating of ' + str(result[7][0])[:5] + '. The answer distribution for this specific task was ' + str(result[7][1]) + ' support votes and ' + str(result[7][2]) + ' reject votes.'
        elif result[4] > 5:
            final_message = 'The ' + str(result[6]) + ' of '+ str(result[5]) +' is '+ str(result[1]) + ' according to my knowledge. But it could be wrong since it has a rank of ' + str(result[4]) + ' according to embeddings. According to embeddings, the most probable answer is '+ str(result[3])
        elif result[0] == -1 and result[3] !=-1:
            final_message = 'The ' + str(result[6]) + ' of '+ str(result[5]) + ' was not found in my knowledge but the approximate answer according to mebeddings is '+ str(result[3])

        elif result[0] != -1:
            final_message = 'The ' + str(result[6]) + ' of '+ str(result[5]) +' is '+ str(result[1])
        elif result[6] != -1 and result[5] !=-1:
            final_message = 'Sorry! I could not find ' + str(result[6]) + ' of '+ str(result[5]) +'. Could you please rephrase the question?'
        else:
            final_message = 'Sorry! I could not find what you are looking for. Could you please rephrase your question?'
    return final_message

# Test
ms =  ["Who is the director of Good Will Hunting?", "Who directed The Bridge on the River Kwai?", 
                    "Who is the director of Star Wars: Episode VI - Return of the Jedi?", "Who is the screenwriter of The Masked Gang: Cyprus?",
                    "What is the MPAA film rating of Weathering with You?", "What is the genre of Good Neighbors?", "Show me a picture of Halle Berry.",
                    "What does Julia Roberts look like?", "Let me know what Sandra Bullock looks like.", "Recommend movies similar to Hamlet and Othello.", "Can you recommend me movies similar to Hamlet and Othello",
                    "Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?",
                    "Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.",
                    "Can you tell me the publication date of Tom Meets Zizou?", "Who is the executive producer of X-Men: First Class?",
                    "Who is the director of Batman 1989?", "What is the box office of The Princess and the Frog?",
                   "What is the birthplace of Christopher Nolan?", "Can you recommend me some horror films?", "Who is the director of Spider-Man: Far from Home?"]
# for m in ms:
#     m = [m]
#     res_text = get_chat_answer(m)
#     print(res_text)
m = ['Show']

The director of Good Will Hunting is Gus Van Sant
The director of The Bridge on the River Kwai is David Lean
The director of Star Wars: Episode VI – Return of the Jedi is Richard Marquand
The screenwriter of The Masked Gang: Cyprus was not found in my knowledge but the approximate answer according to mebeddings is Cengiz Küçükayvaz
The MPA film rating of Weathering with You is NC-17 according to my knowledge. But it could be wrong since it has a rank of 6 according to embeddings. According to embeddings, the most probable answer is PG-13
The genre of Good Neighbors is art film according to my knowledge. But it could be wrong since it has a rank of 33 according to embeddings. According to embeddings, the most probable answer is drama
You can find what Halle Berry looks like and more at : https://www.imdb.com/name/nm0000932 
You can find what Julia Roberts looks like and more at : https://www.imdb.com/name/nm0000210 
You can find what Sandra Bullock looks like and more at : https://www.i

## SpeakEasy Bot Code

In [58]:

start_msg = """
        Try asking me a question. Here are some suggestions: \n
        1) Ask me about movies: characters, genre, director, screenwriter, producer etc \n
        2) Ask me about the people in the film industry: their images, occupation, country of citizenship, siblings etc  \n
        3) Recommendations for movies. \n
        """

In [59]:
import time
import atexit
import getpass
import requests  # install the package via "pip install requests"
from collections import defaultdict


# url of the speakeasy server
url = 'https://speakeasy.ifi.uzh.ch'
listen_freq = 3


class DemoBot:
    def __init__(self, username, password):
        self.agent_details = self.login(username, password)
        self.session_token = self.agent_details['sessionToken']
        self.chat_state = defaultdict(lambda: {'messages': defaultdict(dict), 'initiated': False, 'my_alias': None})
        atexit.register(self.logout)

    def listen(self):
        while True:
            # check for all chatrooms
            current_rooms = self.check_rooms(session_token=self.session_token)['rooms']
            for room in current_rooms:
                # ignore finished conversations
                if room['remainingTime'] > 0:
                    room_id = room['uid']
                    if not self.chat_state[room_id]['initiated']:
                        # send a welcome message and get the alias of the agent in the chatroom
                        self.post_message(room_id=room_id, session_token=self.session_token, message=start_msg)
                        self.chat_state[room_id]['initiated'] = True
                        self.chat_state[room_id]['my_alias'] = room['alias']

                    # check for all messages
                    all_messages = self.check_room_state(room_id=room_id, since=0, session_token=self.session_token)['messages']

                    # you can also use ["reactions"] to get the reactions of the messages: STAR, THUMBS_UP, THUMBS_DOWN

                    for message in all_messages:
                        if message['authorAlias'] != self.chat_state[room_id]['my_alias']:

                            # check if the message is new
                            if message['ordinal'] not in self.chat_state[room_id]['messages']:
                                self.chat_state[room_id]['messages'][message['ordinal']] = message
                                print('\t- Chatroom {} - new message #{}: \'{}\' - {}'.format(room_id, message['ordinal'], message['message'], self.get_time()))

                                ##### You should call your agent here and get the response message #####
                                try:
                                    final_message = get_chat_answer([message['message']])
                                    self.post_message(room_id=room_id, session_token=self.session_token, message=final_message)
                                    time.sleep(listen_freq)
                                except:
                                    print('Exception')
                                    final_message = "Sorry, I did not get that. Please rephrase the question."
                                    self.post_message(room_id=room_id, session_token=self.session_token, message=final_message)
    

                                

    def login(self, username: str, password: str):
        agent_details = requests.post(url=url + "/api/login", json={"username": username, "password": password}).json()
        print('- User {} successfully logged in with session \'{}\'!'.format(agent_details['userDetails']['username'], agent_details['sessionToken']))
        return agent_details

    def check_rooms(self, session_token: str):
        return requests.get(url=url + "/api/rooms", params={"session": session_token}).json()

    def check_room_state(self, room_id: str, since: int, session_token: str):
        return requests.get(url=url + "/api/room/{}/{}".format(room_id, since), params={"roomId": room_id, "since": since, "session": session_token}).json()

    def post_message(self, room_id: str, session_token: str, message: str):
        tmp_des = requests.post(url=url + "/api/room/{}".format(room_id),
                                params={"roomId": room_id, "session": session_token}, data=message.encode('utf-8')).json()
        if tmp_des['description'] != 'Message received':
            print('\t\t Error: failed to post message: {}'.format(message))

    def get_time(self):
        return time.strftime("%H:%M:%S, %d-%m-%Y", time.localtime())

    def logout(self):
        if requests.get(url=url + "/api/logout", params={"session": self.session_token}).json()['description'] == 'Logged out':
            print('- Session \'{}\' successfully logged out!'.format(self.session_token))


if __name__ == '__main__':
    # username = 'kirat.virmani_bot'
    with open("./credentials.json", "r") as f:
        credentials = json.load(f)
    username = credentials["username"]
    password = credentials["password"]
    # password = getpass.getpass('Password of the demo bot:')
    demobot = DemoBot(username, password)
    demobot.listen()






- User kirat.virmani_bot successfully logged in with session 'byuYiJbqUef-CmSBxdpdAor5A82Gu7l3'!
	- Chatroom 5662015d-b320-4230-a56e-0d1306a4da39 - new message #1: 'Who is the director of Good Will Hunting?' - 12:34:05, 04-01-2023
	- Chatroom 5662015d-b320-4230-a56e-0d1306a4da39 - new message #3: 'Who directed Rivers on Kwai' - 12:34:16, 04-01-2023
	- Chatroom 5662015d-b320-4230-a56e-0d1306a4da39 - new message #5: 'Who directed the bridge on the river Kwai' - 12:34:44, 04-01-2023
	- Chatroom 5662015d-b320-4230-a56e-0d1306a4da39 - new message #7: 'Who directed The Bridge on the River Kwai' - 12:35:01, 04-01-2023
	- Chatroom 5662015d-b320-4230-a56e-0d1306a4da39 - new message #9: 'Who is the director of Star Wars: Episode VI' - 12:35:16, 04-01-2023
	- Chatroom 5662015d-b320-4230-a56e-0d1306a4da39 - new message #11: 'Who is the director of Star Wars: Episode VI - Return of the Jedi?' - 12:35:25, 04-01-2023
	- Chatroom 5662015d-b320-4230-a56e-0d1306a4da39 - new message #13: 'Who is the scre

KeyboardInterrupt: 

In [None]:
# del demobot