# Dataset Introduction - the Project Knowledge Graph

Ruijie Wang, Pascal Severin Andermatt | 28-09-2022  
Matthias Baumgartner, Luca Rossetto, Cristina Sarasua | Dataset Construction

In [45]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import json
import networkx as nx
import pandas as pd
import rdflib
from collections import defaultdict, Counter
import locale
_ = locale.setlocale(locale.LC_ALL, '')
from _plotly_future_ import v4_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import os
import numpy as np

import re

#NER
from transformers import pipeline, set_seed
from huggingface_hub import notebook_login
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import create_optimizer
from transformers import TFAutoModelForTokenClassification
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback

import editdistance
import difflib

## 1. Dataset


### 1.1 Load the data

In [2]:
# Load the project knowledge graph into an in-memory rdflib graph.
# The file carries an .nt extension but is parsed with the Turtle parser here.
# NOTE(review): this cell reads from './dataset/' while later cells read from
# '../dataset/' — confirm which working directory the notebook assumes.
graph = rdflib.Graph()
graph.parse('./dataset/14_graph.nt', format='turtle')

<Graph identifier=N7bdaf6a1392e44cdb91877cb8d10ba6e (<class 'rdflib.graph.Graph'>)>

### 1.2 Graph Statistics

In [3]:
# prefixes used in the graph
WD = Namespace('http://www.wikidata.org/entity/')  # wd: entity URIs (e.g. wd:Q11424)
WDT = Namespace('http://www.wikidata.org/prop/direct/')  # wdt: direct property URIs (e.g. wdt:P31)
SCHEMA = Namespace('http://schema.org/')  # schema: used below for schema:description
DDIS = Namespace('http://ddis.ch/atai/')  # ddis: used below for ddis:rating and ddis:tag

In [110]:
# Pre-saved Wikidata property ids (P-values) for the movie graph, keyed by a
# human-readable relation name. These are module-level names already, so the
# former `global P_values, Q_values` statement was a no-op and has been removed.
P_values = {
    'director': 'P57',
    'cast': 'P161',
    'producer': 'P162',
    'genre': 'P136',
    'character': 'P674',
    'screenwriter': 'P58',
    'filming location': 'P915',
    'IMDB Id': 'P345',
}

# Pre-saved Wikidata class ids (Q-values), keyed by a human-readable type name.
Q_values = {
    'fictional human': 'Q15632617',
    'film': 'Q11424',
    'human': 'Q5',
}

In [5]:
# --- Node and predicate sets ---------------------------------------------
# Entities are all subjects plus every object that is a URI (literals excluded).
uri_objects = {obj for obj in graph.objects() if isinstance(obj, URIRef)}
entities = set(graph.subjects()) | uri_objects
predicates = set(graph.predicates())
literals = {obj for obj in graph.objects() if isinstance(obj, Literal)}

# P31 = "instance of", P279 = "subclass of".
with_type = set(graph.subjects(WDT.P31, None))
with_super = set(graph.subjects(WDT.P279, None))
types = set(graph.objects(None, WDT.P31))
supers = set(graph.objects(None, WDT.P279))
with_label = set(graph.subjects(RDFS.label, None))

# --- Counts --------------------------------------------------------------
n_ents = len(entities)
n_rels = len(predicates)
n_lits = len(literals)
t_tot = len(graph)
t_ent = sum(1 for _, _, obj in graph if isinstance(obj, URIRef))
t_lit = t_tot - t_ent
untyped = entities - with_type - with_super
n_notype = len(untyped)
# Same as above, but additionally ignoring entities that are themselves used as a type/superclass.
n_notype_flt = len(untyped - types - supers)

# --- Summary table (numbers rendered with the locale-aware 'n' format) ----
graph_stats = [
    ('number of entities', n_ents),
    ('number of literals', n_lits),
    ('number of predicates', n_rels),
    ('number of triples', t_tot),
    ('number of ent-ent triples', t_ent),
    ('number of ent-lit triples', t_lit),
    ('number of entities w/o label', len(entities - with_label)),
    ('number of predicates w/o label', len(predicates - with_label)),
    ('number of entities w/o type', n_notype),
    ('number of instances w/o type', n_notype_flt),
]
pd.DataFrame([(name, format(value, 'n')) for name, value in graph_stats])

Unnamed: 0,0,1
0,number of entities,159154
1,number of literals,411590
2,number of predicates,255
3,number of triples,2056777
4,number of ent-ent triples,1498899
5,number of ent-lit triples,557878
6,number of entities w/o label,1095
7,number of predicates w/o label,2
8,number of entities w/o type,5761
9,number of instances w/o type,344


### 1.3 External Resource Statistics

In [6]:

# Coverage of the external resources (IMDb top-250 list, plots, user comments)
# by the entities of the graph.
# Every file is opened in a `with` block so the handle is closed, and each
# resource is read exactly once (the comments CSV used to be parsed twice).
with open('../dataset/imdb-top-250.t') as top250_file:
    top250 = set(top250_file.read().split('\n')) - {''}

# IMDb identifiers of titles start with 'tt'; other P345 values are skipped.
imdb_title_ids = {str(o) for o in graph.objects(None, WDT.P345) if o.startswith('tt')}

with open('../dataset/plots.csv', encoding='utf8') as plots_file:
    plot_qids = {qid for qid, plot in csv.reader(plots_file) if URIRef(qid) in entities}

# One entry per comment row whose entity is in the graph (duplicates kept on purpose).
with open('../dataset/user-comments.csv') as comments_file:
    comment_qids = [
        qid
        for qid, rating, sentiment, comment in csv.reader(comments_file)
        if URIRef(qid) in entities
    ]

pd.DataFrame([
    ('Top-250 coverage', '{:n}'.format(len(top250 & imdb_title_ids))),
    ('Entities with IMDb ID', '{:n}'.format(len(imdb_title_ids))),
    ('Plots linked to a movie', '{:n}'.format(len(plot_qids))),
    ('Comments linked to a movie', '{:n}'.format(len(comment_qids))),
    ('Movies having at least one comment', '{:n}'.format(len(set(comment_qids)))),
    ])

Unnamed: 0,0,1
0,Top-250 coverage,243
1,Entities with IMDb ID,27882
2,Plots linked to a movie,10366
3,Comments linked to a movie,26491
4,Movies having at least one comment,2454


### 1.4 Literal Statistics

In [7]:
# literal predicates
ent_lit_preds = {p for s,p,o in graph.triples((None, None, None)) if isinstance(o, Literal)}
ent_lit_preds

{rdflib.term.URIRef('http://ddis.ch/atai/rating'),
 rdflib.term.URIRef('http://ddis.ch/atai/tag'),
 rdflib.term.URIRef('http://schema.org/description'),
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label'),
 rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P18'),
 rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P2142'),
 rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P345'),
 rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P577')}

In [8]:
# literal
pd.DataFrame([
    ('# entities', '{:n}'.format(
        len(entities))),
    ('DDIS.rating', '{:n}'.format(
        len(set(graph.subjects(DDIS.rating, None))))),
    ('DDIS.tag', '{:n}'.format(
        len(set(graph.subjects(DDIS.tag, None))))),
    ('SCHEMA.description', '{:n}'.format(
        len({s for s in graph.subjects(SCHEMA.description, None) if s.startswith(WD)}))),
    ('RDFS.label', '{:n}'.format(
        len({s for s in graph.subjects(RDFS.label, None) if s.startswith(WD)}))),
    ('WDT.P18 (wikicommons image)', '{:n}'.format(
        len(set(graph.subjects(WDT.P18, None))))),
    ('WDT.P2142 (box office)', '{:n}'.format(
        len(set(graph.subjects(WDT.P2142, None))))),
    ('WDT.P345 (IMDb ID)', '{:n}'.format(
        len(set(graph.subjects(WDT.P345, None))))),
    ('WDT.P577 (publication date)', '{:n}'.format(
        len(set(graph.subjects(WDT.P577, None))))),
    ])

Unnamed: 0,0,1
0,# entities,159154
1,DDIS.rating,2451
2,DDIS.tag,10366
3,SCHEMA.description,149753
4,RDFS.label,157806
5,WDT.P18 (wikicommons image),52828
6,WDT.P2142 (box office),1881
7,WDT.P345 (IMDb ID),123596
8,WDT.P577 (publication date),28642


### 1.5 Graph Connectivity

In [9]:
# Mirror the entity-to-entity triples into a networkx multigraph
# (one edge per triple, the predicate stored as the `pred` edge attribute).
ng = nx.MultiDiGraph()
for subj, pred, obj in graph:
    if isinstance(obj, URIRef):
        ng.add_edge(subj, obj, pred=pred)

# Connectivity is evaluated ignoring edge direction.
comp = list(nx.connected_components(ng.to_undirected()))

print(f"number of connected graphs: {len(comp)}")

number of connected graphs: 1


### 1.6 Node Degree Distribution

In [10]:
# Histogram of node degrees; nodes with degree >= 100 are left out of the plot.
low_degrees = [deg for _, deg in ng.degree() if deg < 100]
degree_layout = go.Layout(
    title='Node degree distribution',
    xaxis=dict(title='Node degree'),
    yaxis=dict(title='Count'),
)
iplot(dict(data=[go.Histogram(x=low_degrees)], layout=degree_layout))

In [11]:
# Summary statistics over the degree of every node (no degree cut-off, unlike the histogram).
pd.DataFrame([deg for _, deg in ng.degree()]).describe()

Unnamed: 0,0
count,158900.0
mean,18.865941
std,370.474405
min,1.0
25%,5.0
50%,8.0
75%,16.0
max,100432.0


### 1.7 Relation Distribution

In [12]:
# pdeg[p] = number of triples that use predicate p.
pdeg = defaultdict(int)
for _, pred, _ in graph:
    pdeg[pred] += 1

# Only predicates used fewer than 1000 times are shown in the histogram.
rare_predicate_counts = [cnt for cnt in pdeg.values() if cnt < 1000]
predicate_layout = go.Layout(
    title='Predicate distribution',
    xaxis=dict(title='Number of times the predicate is used'),
    yaxis=dict(title='Count'),
)
iplot(dict(data=[go.Histogram(x=rare_predicate_counts)], layout=predicate_layout))

In [13]:
# Summary statistics over the usage count of every predicate (no cut-off, unlike the histogram).
pd.DataFrame(list(pdeg.values())).describe()

Unnamed: 0,0
count,255.0
mean,8065.792157
std,28238.112826
min,29.0
25%,94.5
50%,537.0
75%,2673.0
max,288856.0


### 1.8 Entity types

In [14]:
# Wikidata classes used as named categories for the entity-type pie chart;
# any other P31 value is bucketed as 'other' by the cell that consumes `roots`.
_ROOT_CATEGORIES = (
    ('Q8242', 'literature'),
    ('Q5', 'human'),
    ('Q483394', 'genre'),
    ('Q95074', 'character'),
    ('Q11424', 'film'),
    ('Q15416', 'tv'),
    ('Q618779', 'award'),
    ('Q27096213', 'geographic'),
    ('Q43229', 'organisation'),
    ('Q34770', 'language'),
    ('Q7725310', 'series'),
    ('Q47461344', 'written work'),
)
roots = {WD[qid]: category for qid, category in _ROOT_CATEGORIES}

In [15]:
# Histogram w.r.t what's in the actual published graph
ecats = defaultdict(set)
for s, o in graph.subject_objects(WDT.P31):
    c = roots.get(o, 'other')
    ecats[c].add(s)

chist = {c: len(ents) for c, ents in ecats.items()}
labels, values = zip(*chist.items())
iplot(dict(data=[go.Pie(labels=labels, values=values)]))

## 3. SPARQL query examples

P57 - director of film
P31 - instance of
Q11424 - film

In [79]:
def find_entity_given_label(entity_label, entity_type="none"):
    """Return the URI of an entity carrying the given English rdfs:label.

    Parameters
    ----------
    entity_label : the label text; it is converted with ``str`` and matched
        as an ``@en`` language-tagged literal.
    entity_type : a Wikidata Q-id such as ``'Q11424'``. When given, only
        entities that are ``wdt:P31`` (instance of) that class are considered.
        The string ``"none"`` (default) disables the type restriction.

    Returns
    -------
    The first matching URI, or ``-1`` when nothing matches.

    The label and type are handed to the query engine through ``initBindings``
    rather than spliced into the query text, so labels containing quotes or
    backslashes can no longer break the query.
    """
    bindings = {'label': Literal(str(entity_label), lang='en')}
    type_pattern = ""
    if entity_type != "none":
        bindings['type'] = WD[entity_type]
        type_pattern = "?entity wdt:P31 ?type ."

    query_content = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>

        SELECT ?entity WHERE {
            ?entity rdfs:label ?label .
            %s
        }""" % type_pattern

    res = list(graph.query(query_content, initBindings=bindings))
    return res[0][0] if res else -1

print(find_entity_given_label("MPAA film rating") )


print(find_entity_given_label("Forrest Gump",'Q11424') )
print(find_entity_given_label("director"))



http://www.wikidata.org/entity/P1657
http://www.wikidata.org/entity/Q134773
http://www.wikidata.org/prop/direct/P57


In [17]:
def query_something_about_movie(p_val, label):
    """Query one property of the film(s) carrying the given English label.

    Parameters
    ----------
    p_val : Wikidata property id without prefix, e.g. ``'P57'``.
    label : the film's English rdfs:label.

    Returns
    -------
    list of result rows, one single-element row per ``?answer`` binding.

    The generated query is printed before it is executed.
    """
    # Escape backslashes and double quotes so that a label containing them
    # still forms a valid SPARQL string literal.
    safe_label = str(label).replace('\\', '\\\\').replace('"', '\\"')

    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer WHERE {{
     ?movie rdfs:label "{}"@en .
     ?movie wdt:P31 wd:Q11424 .
     ?movie wdt:{} ?answer
    }} """.format(safe_label, p_val)

    print(query_content)
    return list(graph.query(query_content))

a = query_something_about_movie(P_values['director'], 'Forrest Gump' )

for i in a:
    print(i)
    

PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer WHERE {
     ?movie rdfs:label "Forrest Gump"@en .
     ?movie wdt:P31 wd:Q11424 .
     ?movie wdt:P57 ?answer
    } 
(rdflib.term.URIRef('http://www.wikidata.org/entity/Q187364'),)


In [18]:
def get_label_of_Qval(q_val):
    """Print and run a query for every rdfs:label of the entity with full URI `q_val`.

    Returns the list of result rows (one single-element row per label).
    """
    query_content = f"""PREFIX ddis: <http://ddis.ch/atai/> 
                        PREFIX wd: <http://www.wikidata.org/entity/> 
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                        PREFIX schema: <http://schema.org/> 

                        SELECT ?label WHERE {{
                         <{q_val}> rdfs:label ?label .
                         
                        }} """

    print(query_content)
    rows = graph.query(query_content)
    return list(rows)

a = get_label_of_Qval('http://www.wikidata.org/entity/Q187364')

for row in a:
    print(row)
    
    

PREFIX ddis: <http://ddis.ch/atai/> 
                        PREFIX wd: <http://www.wikidata.org/entity/> 
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                        PREFIX schema: <http://schema.org/> 

                        SELECT ?label WHERE {
                         <http://www.wikidata.org/entity/Q187364> rdfs:label ?label .
                         
                        } 
(rdflib.term.Literal('Robert Zemeckis', lang='en'),)


In [100]:
def find_something_about_an_entity(entity_URI, relation_URI):
    """Return the first object of the triple pattern ``<entity_URI> <relation_URI> ?res``.

    Parameters
    ----------
    entity_URI, relation_URI : full URIs (strings or URIRefs). Either may be
        the ``-1`` "not found" sentinel produced by the label lookup helpers.

    Returns
    -------
    The first matching object, or ``-1`` when either input is ``-1`` or the
    graph holds no such triple. Previously an empty result raised IndexError
    and a ``-1`` input was formatted into the query as ``<-1>``.
    """
    if entity_URI == -1 or relation_URI == -1:
        return -1

    query_content = """PREFIX ddis: <http://ddis.ch/atai/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX schema: <http://schema.org/>

    SELECT ?res WHERE {{
        <{}> <{}> ?res
    }} """.format(entity_URI, relation_URI)

    res = list(graph.query(query_content))
    return res[0][0] if res else -1
a = find_something_about_an_entity('http://www.wikidata.org/entity/Q134773','http://www.wikidata.org/prop/direct/P57') 

# for elements in a[0]:
#     print(elements)
print(a)

http://www.wikidata.org/entity/Q187364


In [20]:
# top user-rated movies
query_content = f'''
    PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?lbl WHERE {{
        SELECT ?movie ?lbl ?rating WHERE {{
            ?movie wdt:P31 wd:Q11424 .
            ?movie ddis:rating ?rating .
            ?movie rdfs:label ?lbl .
        }}
        ORDER BY DESC(?rating) 
        LIMIT 20
    }}
    '''
[str(s) for s, in graph.query(query_content)]

['Forrest Gump',
 'The Lord of the Rings: The Fellowship of the Ring',
 'Heart of a Dog',
 'Kannathil Muthamittal',
 'Once Upon a Time in America',
 'Oldboy',
 'The Great Dictator',
 'Apocalypse Now',
 'The Shining',
 'The Cranes Are Flying',
 'Shwaas',
 'Khosla Ka Ghosla',
 'Inglourious Basterds',
 'Good Will Hunting',
 'Full Metal Jacket',
 'The Ascent',
 'War and Peace',
 '2001: A Space Odyssey',
 'Scarface',
 'Star Wars: Episode VI – Return of the Jedi']

In [None]:
# URIs of every film whose English label is "Forrest Gump".
# Fixes relative to the earlier draft of this cell: properties live in the
# wdt: namespace and classes in wd: (they were swapped), and the prefixes are
# now declared in the query itself.
query_content = '''
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT ?movie WHERE {
     ?movie rdfs:label "Forrest Gump"@en .
     ?movie wdt:P31 wd:Q11424
     }
     '''
[str(s) for s, in graph.query(query_content)]

In [34]:
def write_list_to_file(list_name, file_name):
    """Write every item of `list_name` to `file_name`, one item per line (UTF-8).

    The parent directory of `file_name` is created when it does not exist yet,
    so a fresh checkout without e.g. a ``save_files/`` folder no longer fails.
    An existing file is overwritten.
    """
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_name, 'w', encoding="utf-8") as filehandle:
        filehandle.writelines(f'{listitem}\n' for listitem in list_name)
        
def read_list_from_file(file_name):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Only the line terminator is removed from each line. The previous
    ``line[:-1]`` also removed the last real character of a final line that
    does not end with a newline.
    """
    res_list = []
    with open(file_name, 'r', encoding="utf-8") as filehandle:
        for line in filehandle:
            res_list.append(line[:-1] if line.endswith('\n') else line)
    return res_list

def get_all_movies(write_file_pathname):
    """Dump the rdfs:label of every film (instance of wd:Q11424) to a text file.

    One label per line, written with `write_list_to_file`. Labels are not
    de-duplicated or filtered by language.
    """
    # This string is not passed through str.format, so the braces must be
    # single; the previous doubled braces only worked because SPARQL accepts
    # a nested group pattern.
    query_content = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?label WHERE {
        ?movie rdfs:label ?label .
        ?movie wdt:P31 wd:Q11424 .
    }
    """

    movie_labels = [str(row[0]) for row in graph.query(query_content)]
    write_list_to_file(movie_labels, write_file_pathname)

get_all_movies("save_files/all_movies_list.txt")

movies_list = read_list_from_file("save_files/all_movies_list.txt")
print(len(movies_list))
print(movies_list[:5])

24384
['Jan Dara', 'Moondram Pirai', "Buffalo Bill and the Indians, or Sitting Bull's History Lesson", 'What We Wanted', 'Wanted: Dead or Alive']


In [54]:
def get_all_humans(write_file_pathname):
    """Dump the rdfs:label of every human (instance of wd:Q5) to a text file.

    One label per line, written with `write_list_to_file`. Labels are not
    de-duplicated or filtered by language.
    """
    # This string is not passed through str.format, so the braces must be
    # single; the previous doubled braces only worked because SPARQL accepts
    # a nested group pattern.
    query_content = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?label WHERE {
        ?person rdfs:label ?label .
        ?person wdt:P31 wd:Q5 .
    }
    """

    human_labels = [str(row[0]) for row in graph.query(query_content)]
    write_list_to_file(human_labels, write_file_pathname)

get_all_humans("save_files/all_humans_list.txt")
humans_list = read_list_from_file("save_files/all_humans_list.txt")
print(len(humans_list))
print(humans_list[:5])


100157
['Viktor Krištof', 'Yuji Nomi', 'Béatrice Thiriet', 'Oleg Kapanets', 'Ram Lee']
Synnøve Hørsdal


In [85]:
def find_closest_match_in_a_List(word, target_list):
    """Fuzzy-match `word` against `target_list`, ignoring case.

    Uses ``difflib.get_close_matches`` with a similarity cutoff of 0.6.

    Returns
    -------
    ``{'res': <item as spelled in target_list>, 'res_ind': <its index>}`` for
    the best match (the first occurrence when the item appears several times),
    or ``-1`` when nothing reaches the cutoff or the list is empty.
    """
    lowered = [item.lower() for item in target_list]
    matches = difflib.get_close_matches(word.lower(), lowered, n=1, cutoff=0.6)
    if not matches:
        return -1

    # Look the winner up once. The previous scan rebound `res` to the matched
    # string and kept looping, so later items were compared against its first
    # character and a one-character entry could replace the real match.
    res_ind = lowered.index(matches[0])
    return {'res': target_list[res_ind], 'res_ind': res_ind}
# Demo: a misspelled title, a name without its accent (against both lists), and an empty list.
for query_word, candidates in (
    ('BuffaloBill and the Indians', movies_list),
    ('Beatrice Thiriet', humans_list),
    ('Beatrice Thiriet', movies_list),
    ('Beatrice Thiriet', []),
):
    print(find_closest_match_in_a_List(query_word, candidates))

{'res': "Buffalo Bill and the Indians, or Sitting Bull's History Lesson", 'res_ind': 2}
{'res': 'Béatrice Thiriet', 'res_ind': 2}
{'res': 'Triple Threat', 'res_ind': 10650}
-1


In [116]:
def deal_with_KG_query(entity, relation):
    """Answer "what is the <relation> of <entity>?" against the knowledge graph.

    Both arguments are free text and may be misspelled.

    Entity resolution: `entity` is fuzzy-matched against `movies_list` and
    `humans_list`; the closer of the two candidates wins and fixes the entity
    type (a person wins when both lists produce the same label). When neither
    list has a match, the raw text is looked up without a type restriction.

    Relation resolution: `relation` is fuzzy-matched against the keys of
    `P_values`. A hit is turned into its wdt: URI directly from the stored
    P-id, so it does not depend on the key being spelled exactly like an
    rdfs:label in the graph. Otherwise the raw text is looked up as a label.

    Returns
    -------
    Whatever `find_something_about_an_entity` returns for the resolved pair,
    or ``-1`` when the entity or the relation cannot be resolved.
    """
    # --- Entity -----------------------------------------------------------
    # Maps each candidate label to its type name. Humans are inserted last so
    # they win when a film and a person share the same label.
    candidate_types = {}
    movie_res = find_closest_match_in_a_List(entity, movies_list)
    if movie_res != -1:
        candidate_types[movie_res['res']] = 'film'
    human_res = find_closest_match_in_a_List(entity, humans_list)
    if human_res != -1:
        candidate_types[human_res['res']] = 'human'

    best_entity = find_closest_match_in_a_List(entity, list(candidate_types))
    if best_entity == -1:
        # Nothing matched: fall back to the raw text with no type filter
        # (Q_values has no 'none' entry, so it must not be indexed here).
        entity_label, entity_type_id = entity, 'none'
    else:
        entity_label = best_entity['res']
        entity_type_id = Q_values[candidate_types[entity_label]]

    entity_URI = find_entity_given_label(entity_label, entity_type_id)

    # --- Relation ---------------------------------------------------------
    best_relation = find_closest_match_in_a_List(relation, list(P_values.keys()))
    if best_relation != -1:
        relation_URI = WDT[P_values[best_relation['res']]]
    else:
        relation_URI = find_entity_given_label(relation)

    # Do not build a query from the -1 "not found" sentinel.
    if entity_URI == -1 or relation_URI == -1:
        return -1

    return find_something_about_an_entity(entity_URI, relation_URI)


res = deal_with_KG_query('Forest Gump','dicrector')
print(res)

res = deal_with_KG_query('Forest Gump','nominated for')
print(res)

http://www.wikidata.org/entity/Q187364
http://www.wikidata.org/entity/Q102427


In [109]:
# Sanity check: resolve the label 'Forrest Gump', restricted to the 'film' class id from Q_values.
find_entity_given_label('Forrest Gump', Q_values['film'])

rdflib.term.URIRef('http://www.wikidata.org/entity/Q134773')

In [16]:
# Labels of the 10 films with the lowest ddis:rating.
bottom_rated_query = '''
    PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?lbl WHERE {
        SELECT ?movie ?lbl ?rating WHERE {
            ?movie wdt:P31 wd:Q11424 .
            ?movie ddis:rating ?rating .
            ?movie rdfs:label ?lbl .
        }
        ORDER BY ASC(?rating) 
        LIMIT 10
    }
    '''
[str(row[0]) for row in graph.query(bottom_rated_query)]

['Vampire Assassin',
 'Vampires vs. Zombies',
 'Aag',
 'Joystick Nation – Generation Hip Hop',
 'Going Overboard',
 "Alex l'ariete",
 'House of the Dead',
 'Killers',
 "Ghosts Can't Do It",
 'Snakes on a Train']

In [44]:
# Some info about "Apocalypse Now": director label plus, when present, rating and publication date.

# Prefix declarations shared with the queries in later cells.
header = '''
    PREFIX ddis: <http://ddis.ch/atai/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX schema: <http://schema.org/>
'''

movie_name = '"Apocalypse Now"@en'
query = f'''{header}
    SELECT * WHERE {{
        ?movie rdfs:label {movie_name} .
        ?movie wdt:P57/rdfs:label ?director .
        OPTIONAL {{ ?movie ddis:rating ?rating }} .
        OPTIONAL {{ ?movie wdt:P577 ?value}}
    }}
    '''
print(query)
tuple_list = list(graph.query(query))


first_tuple = tuple_list[0]

print(f"First tuple: {first_tuple}")
print('------------')

# Show each bound value of the first row together with its rdflib term type.
for elements in first_tuple:
    print(elements, type(elements), sep='\n')


    PREFIX ddis: <http://ddis.ch/atai/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX schema: <http://schema.org/>

    SELECT * WHERE {
        ?movie rdfs:label "Apocalypse Now"@en .
        ?movie wdt:P57/rdfs:label ?director .
        OPTIONAL { ?movie ddis:rating ?rating } .
        OPTIONAL { ?movie wdt:P577 ?value}
    }
    
First tuple: (rdflib.term.Literal('1979-05-10', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#date')), rdflib.term.Literal('Francis Ford Coppola', lang='en'), rdflib.term.URIRef('http://www.wikidata.org/entity/Q182692'), rdflib.term.Literal('8.4', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))
------------
1979-05-10
<class 'rdflib.term.Literal'>
Francis Ford Coppola
<class 'rdflib.term.Literal'>
http://www.wikidata.org/entity/Q182692
<class 'rdflib.term.URIRef'>
8.4
<class 'rdflib.term.Literal'>


In [18]:
# Dealing with OPTIONAL patterns: ?rating is unbound for films without a rating.
optional_query = header + '''
    SELECT ?lbl ?rating WHERE {
        ?movie rdfs:label ?lbl .
        ?movie wdt:P57/rdfs:label ?director .
        OPTIONAL { ?movie ddis:rating ?rating } .
        OPTIONAL { ?movie wdt:P577 ?value}
    }
    LIMIT 10
    '''
tuple_list = list(graph.query(optional_query))

# Each row unpacks into (label, rating); a falsy rating is reported as missing.
for movie_label, rating in tuple_list:
    message = (
        f"{movie_label} has a rating of {rating} ⭐️"
        if rating
        else f"{movie_label} has no rating 😔"
    )
    print(message)

Jan Dara has no rating 😔
Queens of Langkasuka has no rating 😔
Three has no rating 😔
Moondram Pirai has no rating 😔
Buffalo Bill and the Indians, or Sitting Bull's History Lesson has no rating 😔
Dr. T & the Women has no rating 😔
McCabe & Mrs. Miller has a rating of 7.7 ⭐️
Nashville has no rating 😔
Fool for Love has a rating of 6.1 ⭐️
The Gingerbread Man has a rating of 5.7 ⭐️


In [19]:
# Labels of everything that has Terry Gilliam as its director (wdt:P57).
gilliam_query = '''
    PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?lbl WHERE {
        ?director rdfs:label "Terry Gilliam"@en .
        ?movie wdt:P57 ?director .
        ?movie rdfs:label ?lbl
    }
    '''
[str(row[0]) for row in graph.query(gilliam_query)]

['Time Bandits',
 'Tideland',
 "Monty Python's The Meaning of Life",
 'Brazil',
 'The Wholly Family',
 'The Fisher King',
 'The Imaginarium of Doctor Parnassus',
 'The Zero Theorem',
 'The Adventures of Baron Munchausen',
 'Jabberwocky',
 '12 Monkeys',
 'The Man Who Killed Don Quixote',
 'The Brothers Grimm',
 'Fear and Loathing in Las Vegas',
 'Monty Python and the Holy Grail']

In [20]:
# Labels of neo-noir titles (genre, wdt:P136) with Ryan Gosling in the cast (wdt:P161).
neo_noir_query = '''
    PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?lbl WHERE {
        ?genre rdfs:label "neo-noir"@en .
        ?actor rdfs:label "Ryan Gosling"@en .
        ?movie wdt:P136 ?genre .
        ?movie wdt:P161 ?actor .
        ?movie rdfs:label ?lbl .
    }
    '''
[str(row[0]) for row in graph.query(neo_noir_query)]

['Only God Forgives', 'Drive', 'Blade Runner 2049']

In [21]:
# The 10 films with the most cast members (wdt:P161), as (label, cast size) pairs.
largest_cast_query = '''
    PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?lbl ?nc WHERE {
        SELECT ?movie ?lbl (count(?cast) as ?nc) WHERE {
            ?movie wdt:P31 wd:Q11424 .
            ?movie rdfs:label ?lbl .
            ?movie wdt:P161 ?cast .
        }
        GROUP BY ?movie
        ORDER BY DESC(?nc)
        LIMIT 10
    }
    '''
[(str(title), int(cast_size)) for title, cast_size in graph.query(largest_cast_query)]

[('Mamma Mia! Here We Go Again', 224),
 ('Ali', 121),
 ('Forrest Gump', 118),
 ('Terror in the Aisles', 110),
 ('Iron Man 3', 108),
 ('The Longest Day', 104),
 ('Avengers: Endgame', 100),
 ('Captain America: Civil War', 98),
 ('Around the World in 80 Days', 93),
 ('Captain America: The First Avenger', 93)]

In [22]:
# Labels of the cast members (wdt:P161) of everything labelled "Moon".
moon_cast_query = '''
    PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?lbl WHERE {
        ?movie rdfs:label "Moon"@en .
        ?movie wdt:P161 ?cast .
        ?cast rdfs:label ?lbl .
    }
    '''
[str(row[0]) for row in graph.query(moon_cast_query)]

['Matt Berry',
 'Kaya Scodelario',
 'Kevin Spacey',
 'Sam Rockwell',
 'Benedict Wong',
 'Dominique McElligott',
 'Robin Chalk']

In [23]:
# Winners of Cannes best movie (Palme d'Or) published after 2011-01-01,
# newest first, as (publication date, label) pairs.
palme_query_verbose = header + '''
    SELECT ?lbl ?pubdate WHERE {
        ?award rdfs:label "Palme d'Or"@en .
        ?movie wdt:P166 ?award .
        ?movie rdfs:label ?lbl .
        ?movie wdt:P577 ?pubdate .
        FILTER (?pubdate > "2011-01-01"^^xsd:date)
    }
    ORDER BY DESC(?pubdate)
    '''
a = [(str(pubdate), str(title)) for title, pubdate in graph.query(palme_query_verbose)]

# The same query in the compact form: ";" repeats the subject of the previous pattern.
palme_query_compact = header + '''
    SELECT ?lbl ?pubdate WHERE {
      ?award rdfs:label "Palme d'Or"@en.
      ?movie wdt:P166 ?award; rdfs:label ?lbl; wdt:P577 ?pubdate.
      FILTER(?pubdate > "2011-01-01"^^xsd:date)
    }
    ORDER BY DESC (?pubdate)
    '''
b = [(str(pubdate), str(title)) for title, pubdate in graph.query(palme_query_compact)]

assert (a == b)
a

[('2021-07-14', 'Titane'),
 ('2019-05-21', 'Parasite'),
 ('2018-05-13', 'Shoplifters'),
 ('2017-05-20', 'The Square'),
 ('2016-10-21', 'I, Daniel Blake'),
 ('2015-10-22', 'Dheepan'),
 ('2014-05-16', 'Winter Sleep'),
 ('2013-05-23', 'Blue Is the Warmest Colour'),
 ('2012-01-01', 'Amour'),
 ('2011-05-16', 'The Tree of Life')]

# Processing question


In [32]:
# Example user questions covering factual look-ups, image requests and recommendations.
sample_questions = [
    "Who is the director of Good Will Hunting?",
    "Who directed The Bridge on the River Kwai?",
    "Who is the director of Star Wars: Episode VI - Return of the Jedi?",
    "Who is the screenwriter of The Masked Gang: Cyprus?",
    "What is the MPAA film rating of Weathering with You?",
    "What is the genre of Good Neighbors?",
    "Show me a picture of Halle Berry.",
    "What does Julia Roberts look like?",
    "Let me know what Sandra Bullock looks like.",
    "Recommend movies similar to Hamlet and Othello.",
    "Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?",
    "Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.",
    "Can you tell me the publication date of Tom Meets Zizou?",
    "Who is the executive producer of X-Men: First Class?",
    "Who is the Director of Batman 1989?",
    "What is the box office of The Princess and the Frog?",
    "What is the birthplace of Christopher Nolan?",
]

In [37]:
# One record per question; 'type' starts empty and is filled in by a later cell.
# Despite its name, this is a plain list of dicts, not a DataFrame.
questions_df = [dict(query=question, type="") for question in sample_questions]
questions_df

[{'query': 'Who is the director of Good Will Hunting?', 'type': ''},
 {'query': 'Who directed The Bridge on the River Kwai?', 'type': ''},
 {'query': 'Who is the director of Star Wars: Episode VI - Return of the Jedi?',
  'type': ''},
 {'query': 'Who is the screenwriter of The Masked Gang: Cyprus?', 'type': ''},
 {'query': 'What is the MPAA film rating of Weathering with You?', 'type': ''},
 {'query': 'What is the genre of Good Neighbors?', 'type': ''},
 {'query': 'Show me a picture of Halle Berry.', 'type': ''},
 {'query': 'What does Julia Roberts look like?', 'type': ''},
 {'query': 'Let me know what Sandra Bullock looks like.', 'type': ''},
 {'query': 'Recommend movies similar to Hamlet and Othello.', 'type': ''},
 {'query': 'Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?',
  'type': ''},
 {'query': 'Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.',
  'type': ''},
 {'query': 'Can you tell me th

## Pattern Matching

## Named Entity Recognition


In [34]:
# Using BERT for NER


In [35]:
# Tokenizer and token-classification model are loaded from the same checkpoint.
POS_CHECKPOINT = "vblagoje/bert-english-uncased-finetuned-pos"
tokenizer_POS = AutoTokenizer.from_pretrained(POS_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(POS_CHECKPOINT)
# `model_POS` and `model` are two names for the same object.
model_POS = model

In [40]:
# Token-classification pipeline built from the POS model and tokenizer loaded above.
# (`pipeline` is already imported in the notebook's import cell.)
token_classifier = pipeline(
    "token-classification",
    model=model_POS,
    tokenizer=tokenizer_POS,
    aggregation_strategy="simple",
)

# POS tagging for all of the questions: the pipeline output is stored under 'pos'.
for question in questions_df:
    question["pos"] = token_classifier(question["query"])
questions_df[0]

{'query': 'Who is the director of Good Will Hunting?',
 'type': '',
 'pos': [{'entity_group': 'PRON',
   'score': 0.99944025,
   'word': 'who',
   'start': 0,
   'end': 3},
  {'entity_group': 'AUX',
   'score': 0.9970728,
   'word': 'is',
   'start': 4,
   'end': 6},
  {'entity_group': 'DET',
   'score': 0.99954873,
   'word': 'the',
   'start': 7,
   'end': 10},
  {'entity_group': 'NOUN',
   'score': 0.986273,
   'word': 'director',
   'start': 11,
   'end': 19},
  {'entity_group': 'ADP',
   'score': 0.9994654,
   'word': 'of',
   'start': 20,
   'end': 22},
  {'entity_group': 'PROPN',
   'score': 0.6715452,
   'word': 'good',
   'start': 23,
   'end': 27},
  {'entity_group': 'NOUN',
   'score': 0.6608364,
   'word': 'will',
   'start': 28,
   'end': 32},
  {'entity_group': 'PROPN',
   'score': 0.77655363,
   'word': 'hunting',
   'start': 33,
   'end': 40},
  {'entity_group': 'PUNCT',
   'score': 0.9996462,
   'word': '?',
   'start': 40,
   'end': 41}]}

In [53]:
# Find the type of question by keyword matching
def find_type(formulated_question_df):
    """Classify a tagged question as 'images', 'recommendation' or 'search'.

    The decision is an exact keyword match against the 'word' field of the
    entries in ``formulated_question_df['pos']``. Image keywords are checked
    first, then recommendation keywords; anything else is 'search'.
    """
    tokens = [entry['word'] for entry in formulated_question_df['pos']]

    image_keywords = ('image', 'picture', 'look', 'looks')
    if any(keyword in tokens for keyword in image_keywords):
        return "images"

    recommendation_keywords = ('similar', 'recommend', 'recommendations')
    if any(keyword in tokens for keyword in recommendation_keywords):
        return "recommendation"

    return "search"

# Add the type for all questions
for question in questions_df:
    # Each record is updated in place with its keyword-based type.
    question['type'] = find_type(question)

In [90]:
# Inspect the second question record, which now carries its POS tags and assigned type.
questions_df[1]

{'query': 'Who directed The Bridge on the River Kwai?',
 'type': 'search',
 'pos': [{'entity_group': 'PRON',
   'score': 0.9994,
   'word': 'who',
   'start': 0,
   'end': 3},
  {'entity_group': 'VERB',
   'score': 0.99945265,
   'word': 'directed',
   'start': 4,
   'end': 12},
  {'entity_group': 'DET',
   'score': 0.99950266,
   'word': 'the',
   'start': 13,
   'end': 16},
  {'entity_group': 'NOUN',
   'score': 0.9979869,
   'word': 'bridge',
   'start': 17,
   'end': 23},
  {'entity_group': 'ADP',
   'score': 0.9993905,
   'word': 'on',
   'start': 24,
   'end': 26},
  {'entity_group': 'DET',
   'score': 0.99918383,
   'word': 'the',
   'start': 27,
   'end': 30},
  {'entity_group': 'PROPN',
   'score': 0.9451487,
   'word': 'river kwai',
   'start': 31,
   'end': 41},
  {'entity_group': 'PUNCT',
   'score': 0.9996606,
   'word': '?',
   'start': 41,
   'end': 42}]}

In [86]:
# find the entity for a question of type images
def get_entity_for_images(formulated_question_df):
    """Return the text of the first group tagged 'PROPN' in the question.

    Args:
        formulated_question_df: question record whose 'pos' entry is a list of
            dicts with 'entity_group' and 'word' keys.

    Returns:
        The 'word' of the first 'PROPN' group, or -1 when the question has none.
    """
    proper_nouns = (
        tagged['word']
        for tagged in formulated_question_df['pos']
        if tagged['entity_group'] == 'PROPN'
    )
    # next() stops at the first hit; -1 is the "nothing found" sentinel.
    return next(proper_nouns, -1)
    

In [89]:
# main function to answer image questions
def handle_image_questions(formulated_question_df):
    """Handle a question of type 'images'.

    Returns -1 immediately for any other question type. For image questions the
    entity found by get_entity_for_images is printed and nothing is returned;
    the actual image lookup is not implemented yet.
    """
    # Guard: anything that is not an image question is rejected up front.
    if formulated_question_df['type'] != 'images':
        return -1

    name = get_entity_for_images(formulated_question_df)
    print(name)

    # query to get image of name
        
# Run the image handler over every question; it prints the extracted entity
# for questions typed 'images' and returns -1 for all others.
for q in questions_df:
    handle_image_questions(q)

halle berry
julia roberts
sandra bullock


In [None]:
def get_entity_relation_for_search(formulated_question_df):
    """Extract the entity and relation from a 'search' question (not implemented yet).

    Args:
        formulated_question_df: question record, as passed to the other
            question handlers in this notebook.

    Raises:
        NotImplementedError: always, until the extraction logic is written.
    """
    # A def with no body is a syntax error; fail explicitly at call time instead.
    raise NotImplementedError("get_entity_relation_for_search is not implemented yet")

In [10]:
# Data collator for token classification, built around `tokenizer`
# (`tokenizer` is not created in this cell; it must already exist in the kernel).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# "seqeval" metric loaded through `evaluate`; compute_metrics below calls seqeval.compute.
seqeval = evaluate.load("seqeval")
# labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    """Turn a (predictions, labels) pair into overall seqeval scores.

    The predictions are reduced with argmax over axis 2 to one class id per
    position. Positions whose label id is -100 are dropped (presumably the
    marker for positions that must not be scored; confirm against the
    tokenization step). The remaining ids are mapped through the module-level
    `label_list` and scored with the module-level `seqeval` metric.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy", taken from the
        "overall_*" entries of the seqeval result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        metric: results["overall_" + metric]
        for metric in ("precision", "recall", "f1", "accuracy")
    }

In [11]:
# Tag names in class-id order: the position in this list is the integer id.
id2label = dict(enumerate([
    "O",
    "B-corporation", "I-corporation",
    "B-creative-work", "I-creative-work",
    "B-group", "I-group",
    "B-location", "I-location",
    "B-person", "I-person",
    "B-product", "I-product",
]))
# Reverse lookup, derived from id2label so the two mappings cannot drift apart.
label2id = {label: class_id for class_id, label in id2label.items()}

In [12]:
# Load a token-classification model from the "vblagoje/bert-english-uncased-finetuned-pos" checkpoint.
# NOTE(review): the saved output under this cell talks about `distilbert-base-uncased` /
# DistilBertForTokenClassification, so it appears to come from a run with a different
# checkpoint; re-run the cell to refresh it.
# NOTE(review): `id2label` / `label2id` from the previous cell are not passed here; confirm
# that the checkpoint's own label set is what the Trainer below is supposed to fine-tune.
model = AutoModelForTokenClassification.from_pretrained(    "vblagoje/bert-english-uncased-finetuned-pos")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [20]:
# Fine-tuning configuration; checkpoints are written under "my_awesome_wnut_model".
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and save once per epoch so the best checkpoint can be reloaded at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# `tokenized_wnut` and `tokenizer` are not created in this cell; they must already
# exist in the kernel. The "test" split is used as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Starts training immediately when the cell is executed.
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3394
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 426
  Number of trainable parameters = 66372877


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.26237,0.536776,0.317887,0.399302,0.943226
2,No log,0.272362,0.562033,0.348471,0.430206,0.945449


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1287
  Batch size = 16
Saving model checkpoint to my_awesome_wnut_model\checkpoint-213
Configuration saved in my_awesome_wnut_model\checkpoint-213\config.json
Model weights saved in my_awesome_wnut_model\checkpoint-213\pytorch_model.bin
tokenizer config file saved in my_awesome_wnut_model\checkpoint-213\tokenizer_config.json
Special tokens file saved in my_awesome_wnut_model\checkpoint-213\special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClass

TrainOutput(global_step=426, training_loss=0.10359365615486539, metrics={'train_runtime': 18.7858, 'train_samples_per_second': 361.336, 'train_steps_per_second': 22.677, 'total_flos': 92090981263080.0, 'train_loss': 0.10359365615486539, 'epoch': 2.0})

In [None]:
# Run an "ner" pipeline on one sample sentence and display the raw predictions.
# NOTE(review): the model id is "stevhliu/my_awesome_wnut_model", while the Trainer above
# writes to the local "my_awesome_wnut_model" directory; confirm which model is intended.
text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model")
classifier(text)

In [47]:
# Run `ner_pipeline` on `sample_question` with aggregation_strategy="simple" and print
# each returned entity as "<word>: <entity_group> (<score>)".
# NOTE(review): `ner_pipeline` and `sample_question` are not created in this cell; they
# must already exist in the kernel.
entities = ner_pipeline(sample_question, aggregation_strategy="simple")
for entity in entities:
    print(f"{entity['word']}: {entity['entity_group']} ({entity['score']:.2f})")

Batman: ORG (0.92)


## Getting Relation

In [32]:

# a naive way for matching entities and relations
# NOTE(review): `entity` and `question` must already exist in the kernel; the saved
# output of this cell shows a NameError because `entity` was not defined.

question_pattern = "who is the (.*) of ENTITY"

print("question pattern: {}\n".format(question_pattern))

# preprocess the question: the entity is literal text, so escape it before using it as
# a regex (names containing '.', '(', '+', ... would otherwise be misread as patterns)
question = re.sub(re.escape(entity), "ENTITY", question.rstrip("?"))

# match the relation using a pattern; re.match returns None when the question does not
# fit, so fail with a clear message instead of an AttributeError on `.group`
relation_match = re.match(question_pattern, question)
if relation_match is None:
    raise ValueError(
        "question {!r} does not match pattern {!r}".format(question, question_pattern)
    )
relation = relation_match.group(1)

print("recognized relation: {}\n".format(relation))

question pattern: who is the (.*) of ENTITY



NameError: name 'entity' is not defined

## Node Matching


In [None]:
# Lookup tables used for matching: URI -> human-readable name.
nodes = {}
predicates = {}

for graph_node in g.all_nodes():
    # Only URI nodes are indexed; everything else is skipped.
    if not isinstance(graph_node, URIRef):
        continue
    node_uri = graph_node.toPython()
    node_label = g.value(graph_node, n.label)
    # Prefer the label stored in the graph; otherwise fall back to the URI
    # with the "http://example.org/" prefix removed.
    nodes[node_uri] = (
        node_label.toPython() if node_label else re.sub("http://example.org/", "", node_uri)
    )

for _, predicate, _ in g:
    predicate_uri = predicate.toPython()
    predicates[predicate_uri] = re.sub("http://example.org/", "", predicate_uri)

print("labeled nodes: {}\n".format(nodes))
print("predicates: {}\n".format(predicates))

In [None]:

# Entity matching: keep the node whose name is closest (smallest edit distance) to `entity`.
# A strict '<' comparison means the first candidate wins on ties.
print("--- entity matching for \"{}\"\n".format(entity))
match_node = ""
smallest_distance = 9999
for node_uri, node_name in nodes.items():
    distance = editdistance.eval(node_name, entity)
    print("edit distance between {} and {}: {}".format(node_name, entity, distance))
    if distance < smallest_distance:
        smallest_distance, match_node = distance, node_uri

# Relation matching: same procedure over the predicate names, against `relation`.
print("\n--- relation matching for \"{}\"\n".format(relation))
match_pred = ""
smallest_distance = 9999
for predicate_uri, predicate_name in predicates.items():
    distance = editdistance.eval(predicate_name, relation)
    print("edit distance between {} and {}: {}".format(predicate_name, relation, distance))
    if distance < smallest_distance:
        smallest_distance, match_pred = distance, predicate_uri

print("\n--- the matching node of \"{}\" is {}\n".format(entity, match_node))
print("--- the matching predicates of \"{}\" is {}\n".format(relation, match_pred))

# Query

## Query Generation

In [None]:
# Build the SPARQL query: select every ?x that has <match_pred> pointing at <match_node>,
# together with ?y, the value of ?x's `n.label` predicate.
# The doubled braces are literal '{' / '}' in the str.format template.
query_template = "SELECT DISTINCT ?x ?y WHERE {{ ?x <{}> <{}>. ?x <{}> ?y. }}".format(match_pred, match_node, n.label)

print("--- sparql query: {}".format(query_template))

qres = g.query(query_template)

print("\n--- querying results: ")
# NOTE(review): `answer` keeps only the ?y of the last row and is never assigned when
# the query returns no rows; confirm that later cells can cope with that.
for row in qres:
    print(row.x, row.y)
    answer = row.y

## Answer Sentence Generation

In [5]:
# Load the image metadata from ../dataset/movienet/images.json into a pandas object.
# NOTE(review): this path goes through '..', while the graph at the top of the notebook
# is loaded from './dataset/...'; confirm which working directory is expected.
images_json = pd.read_json(os.path.join('..','dataset/movienet/images.json')) 

In [6]:
# Preview the first rows of the loaded image metadata.
images_json.head()

Unnamed: 0,w,movie,img,h,type,cast
0,1666,[tt4882376],0315/rm601699072.jpg,1000,behind_the_scenes,"[nm8801745, nm0001401]"
1,999,[tt2318625],2538/rm814292736.jpg,562,still_frame,[nm2072214]
2,1500,[tt4003966],0354/rm2068192512.jpg,1000,still_frame,[nm0268626]
3,1333,[],3777/rm27402752.jpg,1000,publicity,[nm6655379]
4,704,[],3459/rm537652736.jpg,1000,event,"[nm1577190, nm7097953]"
