# Dataset Introduction - the Project Knowledge Graph

Ruijie Wang, Pascal Severin Andermatt | 28-09-2022  
Matthias Baumgartner, Luca Rossetto, Cristina Sarasua | Dataset Construction

In [6]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import json
import networkx as nx
import pandas as pd
import rdflib
from collections import defaultdict, Counter
import locale
_ = locale.setlocale(locale.LC_ALL, '')
from _plotly_future_ import v4_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import os
import numpy as np

import re

#NER
from transformers import pipeline, set_seed
from huggingface_hub import notebook_login
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import create_optimizer
from transformers import TFAutoModelForTokenClassification
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback

import editdistance
import difflib

## 1. Dataset


### 1.1 Load the data

In [7]:
# Load the knowledge graph into an in-memory rdflib graph.
# NOTE(review): the file has an .nt extension but is parsed with format='turtle' —
# presumably fine because N-Triples is a subset of Turtle; confirm.
# NOTE(review): this cell reads from './dataset' while other cells read from
# '../dataset' — confirm the intended working directory.
graph = rdflib.Graph()
graph.parse('./dataset/14_graph.nt', format='turtle')

<Graph identifier=N8b5f0edc53f64084b28e0ea676b2217e (<class 'rdflib.graph.Graph'>)>

### 1.2 Graph Statistics

In [8]:
# prefixes used in the graph
WD = Namespace('http://www.wikidata.org/entity/')  # entity IRIs, e.g. WD['Q5']
WDT = Namespace('http://www.wikidata.org/prop/direct/')  # direct-property IRIs, e.g. WDT['P18']
SCHEMA = Namespace('http://schema.org/')
DDIS = Namespace('http://ddis.ch/atai/')

In [37]:
# Pre-saved Wikidata ids for the movie graph.
# (A `global` statement is a no-op at module/cell level, so none is needed here.)

# Relation name -> Wikidata property id. The keys also serve as the vocabulary
# that free-text relations are fuzzy-matched against.
P_values = {
    'director': 'P57',
    'cast': 'P161',
    'producer': 'P162',
    'genre': 'P136',
    'character': 'P674',
    'screenwriter': 'P58',
    'filming location': 'P915',
    'IMDB Id': 'P345',
    'image': 'P18',
}

# Entity-type name -> Wikidata class id, used for `wdt:P31` (instance of) filters.
Q_values = {
    'fictional human': 'Q15632617',
    'film': 'Q11424',
    'human': 'Q5',
}

### 1.3 External Resource Statistics

In [10]:

# Non-empty lines of the IMDb top-250 file, as a set.
# The context manager closes the file handle, which a bare open(...).read() leaves open.
with open('../dataset/imdb-top-250.t') as top250_file:
    top250 = set(top250_file.read().split('\n')) - {''}


### 1.4 Literal Statistics

In [11]:
# Wikidata class -> short category name, keyed by the full WD entity IRI.
roots = {
    WD[class_id]: category
    for class_id, category in {
        'Q8242': 'literature',
        'Q5': 'human',
        'Q483394': 'genre',
        'Q95074': 'character',
        'Q11424': 'film',
        'Q15416': 'tv',
        'Q618779': 'award',
        'Q27096213': 'geographic',
        'Q43229': 'organisation',
        'Q34770': 'language',
        'Q7725310': 'series',
        'Q47461344': 'written work',
    }.items()
}

## 3. SPARQL query examples

- `P57` - director of film
- `P31` - instance of
- `Q11424` - film

In [12]:
def find_entity_given_label(entity_label, entity_type="none"):
    """Return the first graph node whose English ``rdfs:label`` equals ``entity_label``.

    Args:
        entity_label: Label text to look up. It is converted with ``str()`` and
            matched as an ``@en`` language-tagged literal.
        entity_type: Wikidata class id (e.g. ``'Q11424'``) the node must be an
            instance of via ``wdt:P31``. The default ``"none"`` disables the
            type restriction.

    Returns:
        The first binding of ``?entity`` in the query result, or ``-1`` when the
        query yields no rows.
    """
    # Let rdflib serialise the literal so that quotes, backslashes or newlines
    # inside the label are escaped instead of breaking (or altering) the query.
    label_literal = Literal(str(entity_label), lang="en").n3()

    # Optional "instance of" restriction; left empty for an untyped lookup.
    type_pattern = ""
    if entity_type != "none":
        type_pattern = "?entity wdt:P31 wd:{} .".format(entity_type)

    query_content = """PREFIX ddis: <http://ddis.ch/atai/>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        PREFIX schema: <http://schema.org/>

        SELECT ?entity WHERE {{
            ?entity rdfs:label {} .
            {}
        }} """.format(label_literal, type_pattern)

    res = list(graph.query(query_content))
    if len(res) > 0:
        return res[0][0]
    return -1

# Smoke tests: an untyped lookup of a property label, a lookup restricted to
# films (Q11424), and an untyped lookup of the "director" label.
print(find_entity_given_label("MPAA film rating") )


print(find_entity_given_label("Forrest Gump",'Q11424') )
print(find_entity_given_label("director"))



http://www.wikidata.org/entity/P1657
http://www.wikidata.org/entity/Q134773
http://www.wikidata.org/prop/direct/P57


In [13]:
def query_something_about_movie(p_val, label):
    """Query one property of the film(s) carrying a given English label.

    Args:
        p_val: Wikidata property id without prefix (e.g. ``'P57'``); it is used
            as ``wdt:<p_val>`` in the query.
        label: English label of the film.

    Returns:
        A list with one result row per ``?answer`` binding (empty if none).

    Side effects:
        Prints the generated SPARQL query.
    """
    # Serialise the label through rdflib so embedded quotes/backslashes are
    # escaped; for ordinary labels this yields the same "..."@en text as before.
    label_literal = Literal(str(label), lang="en").n3()

    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer WHERE {{
     ?movie rdfs:label {} .
     ?movie wdt:P31 wd:Q11424 .
     ?movie wdt:{} ?answer
    }} """.format(label_literal, p_val)

    print(query_content)
    return list(graph.query(query_content))
      
# Example: look up the director property (P57) of the film labelled
# 'Forrest Gump' and print every result row.
a = query_something_about_movie(P_values['director'], 'Forrest Gump' )    
  
for i in a:
    print(i)
    

PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?answer WHERE {
     ?movie rdfs:label "Forrest Gump"@en .
     ?movie wdt:P31 wd:Q11424 .
     ?movie wdt:P57 ?answer
    } 
(rdflib.term.URIRef('http://www.wikidata.org/entity/Q187364'),)


In [14]:
def get_label_of_Qval(q_val):
    """Fetch every ``rdfs:label`` attached to a node.

    Args:
        q_val: Full IRI of the node (it is wrapped in ``<...>`` in the query),
            e.g. ``'http://www.wikidata.org/entity/Q187364'``.

    Returns:
        A list of result rows, one per ``?label`` binding.

    Side effects:
        Prints the generated SPARQL query.
    """
    query_template =  """PREFIX ddis: <http://ddis.ch/atai/> 
                        PREFIX wd: <http://www.wikidata.org/entity/> 
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                        PREFIX schema: <http://schema.org/> 

                        SELECT ?label WHERE {{
                         <{}> rdfs:label ?label .
                         
                        }} """
    query_content = query_template.format(q_val)

    print(query_content)
    label_rows = graph.query(query_content)
    return list(label_rows)

# Example: print the label rows of entity Q187364.
a = get_label_of_Qval('http://www.wikidata.org/entity/Q187364')
  
for i in a:
    print(i)
    
    

PREFIX ddis: <http://ddis.ch/atai/> 
                        PREFIX wd: <http://www.wikidata.org/entity/> 
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                        PREFIX schema: <http://schema.org/> 

                        SELECT ?label WHERE {
                         <http://www.wikidata.org/entity/Q187364> rdfs:label ?label .
                         
                        } 
(rdflib.term.Literal('Robert Zemeckis', lang='en'),)


In [15]:
def find_something_about_an_entity(entity_URI, relation_URI):
    """Return the first object of the triple ``<entity_URI> <relation_URI> ?res``.

    Args:
        entity_URI: Full IRI of the subject node.
        relation_URI: Full IRI of the predicate.

    Returns:
        The first ``?res`` binding, or ``-1`` when the graph holds no such triple.
    """
    query_content = """PREFIX ddis: <http://ddis.ch/atai/> 
    PREFIX wd: <http://www.wikidata.org/entity/> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    PREFIX schema: <http://schema.org/> 
    
    SELECT ?res WHERE {{
        <{}> <{}> ?res
        
    }} """.format(entity_URI, relation_URI)

    res = list(graph.query(query_content))
    # An unknown entity/relation pair yields no rows. Report that with the same
    # -1 sentinel find_entity_given_label uses rather than raising IndexError.
    if not res:
        return -1
    return res[0][0]
# Example: first value of property P57 for entity Q134773.
a = find_something_about_an_entity('http://www.wikidata.org/entity/Q134773','http://www.wikidata.org/prop/direct/P57') 

# for elements in a[0]:
#     print(elements)
print(a)

http://www.wikidata.org/entity/Q187364


In [20]:
def write_list_to_file(list_name, file_name):
    """Write each item of ``list_name`` to ``file_name`` as its own UTF-8 line.

    Items are rendered with f-string formatting and each is followed by a
    newline; an existing file is overwritten.
    """
    with open(file_name, 'w', encoding="utf-8") as out_file:
        out_file.writelines(f'{entry}\n' for entry in list_name)
        
def read_list_from_file(file_name):
    """Read a UTF-8 text file into a list with one entry per line.

    Args:
        file_name: Path of the file to read.

    Returns:
        The lines of the file in order, without their line terminator.
    """
    res_list = []
    with open(file_name, 'r', encoding="utf-8") as filehandle:
        for line in filehandle:
            # Only strip an actual newline: the last line of a file may lack one,
            # and blindly slicing off a character would truncate that entry.
            curr_place = line[:-1] if line.endswith('\n') else line
            res_list.append(curr_place)
    return res_list

def get_all_movies(write_file_pathname):
    """Export the label of every film in the graph to a text file.

    A film is any node that is an instance (``wdt:P31``) of ``wd:Q11424``.
    One label is written per line through ``write_list_to_file``.

    Args:
        write_file_pathname: Path of the file to (over)write.
    """
    # This string is not passed through str.format, so braces are written singly.
    query_content = """PREFIX ddis: <http://ddis.ch/atai/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX schema: <http://schema.org/>

    SELECT ?label WHERE {
        ?movie rdfs:label ?label .
        ?movie wdt:P31 wd:Q11424 .
    }
    """

    res_list = [str(row[0]) for row in graph.query(query_content)]
    write_list_to_file(res_list, write_file_pathname)
    
# Regenerate the film-label file, then load it back as the in-memory list used
# for fuzzy matching.
# NOTE(review): the export query runs on every execution of this cell, and the
# `movies_list = []` assignment is immediately overwritten by the next line.
get_all_movies("save_files/all_movies_list.txt")

movies_list = []
movies_list = read_list_from_file("save_files/all_movies_list.txt")
print(len(movies_list))
print(movies_list[:5])

24384
['Jan Dara', 'Moondram Pirai', "Buffalo Bill and the Indians, or Sitting Bull's History Lesson", 'What We Wanted', 'Wanted: Dead or Alive']


In [21]:
def get_all_humans(write_file_pathname):
    """Export the label of every human in the graph to a text file.

    A human is any node that is an instance (``wdt:P31``) of ``wd:Q5``.
    One label is written per line through ``write_list_to_file``.

    Args:
        write_file_pathname: Path of the file to (over)write.
    """
    # This string is not passed through str.format, so braces are written singly.
    query_content = """PREFIX ddis: <http://ddis.ch/atai/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX schema: <http://schema.org/>

    SELECT ?label WHERE {
        ?person rdfs:label ?label .
        ?person wdt:P31 wd:Q5 .
    }
    """

    res_list = [str(row[0]) for row in graph.query(query_content)]
    write_list_to_file(res_list, write_file_pathname)
    
# Regenerate the human-label file, then load it back as the in-memory list used
# for fuzzy matching.
# NOTE(review): the export query runs on every execution of this cell, and the
# `humans_list = []` assignment is immediately overwritten by the next line.
get_all_humans("save_files/all_humans_list.txt")
humans_list = []
humans_list = read_list_from_file("save_files/all_humans_list.txt")
print(len(humans_list))
print(humans_list[:5])


100157
['Viktor Krištof', 'Yuji Nomi', 'Béatrice Thiriet', 'Oleg Kapanets', 'Ram Lee']


In [22]:
def find_closest_match_in_a_List(word, target_list):
    """Find the entry of ``target_list`` most similar to ``word``, ignoring case.

    Similarity is ``difflib.get_close_matches`` with a cutoff of 0.6, evaluated
    on lower-cased strings.

    Args:
        word: Text to look for.
        target_list: Candidate strings.

    Returns:
        ``{'res': <entry in its original casing>, 'res_ind': <its index>}`` for
        the best match (the first index if the entry occurs several times), or
        ``-1`` when no candidate reaches the cutoff or the list is empty.
    """
    # Lower-case the candidates once and reuse the list for the index lookup.
    lowered = [item.lower() for item in target_list]
    matches = difflib.get_close_matches(word.lower(), lowered, n=1, cutoff=0.6)
    if not matches:
        return -1

    # Resolve the match to its position directly. Scanning on after the hit
    # while rebinding the result variable could replace it with an unrelated entry.
    res_ind = lowered.index(matches[0])
    return {'res': target_list[res_ind], 'res_ind': res_ind}
# Examples: a near-miss film title, a name without its accent against both
# lists, and the empty-list case (which returns -1).
print(find_closest_match_in_a_List('BuffaloBill and the Indians', movies_list))
print(find_closest_match_in_a_List('Beatrice Thiriet', humans_list))
print(find_closest_match_in_a_List('Beatrice Thiriet', movies_list))
print(find_closest_match_in_a_List('Beatrice Thiriet', []))

{'res': "Buffalo Bill and the Indians, or Sitting Bull's History Lesson", 'res_ind': 2}
{'res': 'Béatrice Thiriet', 'res_ind': 2}
{'res': 'Triple Threat', 'res_ind': 10650}
-1


In [23]:
def deal_with_KG_query(entity, relation):
    """Answer "<relation> of <entity>" from the graph, tolerating misspellings.

    The entity text is fuzzy-matched against ``movies_list`` and
    ``humans_list``. The better of the two candidates decides both the label
    and the type filter used to resolve the entity IRI. The relation text is
    fuzzy-matched against the keys of ``P_values`` and then resolved by label.

    Args:
        entity: Free-text entity name (film title or person name).
        relation: Free-text relation name.

    Returns:
        The first value of the relation for the entity, or ``-1`` when the
        entity or the relation cannot be resolved.
    """
    # --- Process entity ---
    movie_res = find_closest_match_in_a_List(entity, movies_list)
    human_res = find_closest_match_in_a_List(entity, humans_list)
    match_list = [cand['res'] for cand in (movie_res, human_res) if cand != -1]

    final_entity_res = find_closest_match_in_a_List(entity, match_list)

    entity_type = 'none'
    if final_entity_res == -1:
        # Nothing similar in either list: fall back to the text as given.
        final_entity_res = {'res': entity, 'res_ind': -1}
    # Either candidate may be the -1 sentinel, so check before subscripting.
    elif human_res != -1 and final_entity_res['res'] == human_res['res']:
        entity_type = 'human'
    elif movie_res != -1 and final_entity_res['res'] == movie_res['res']:
        entity_type = 'film'

    # Q_values has no entry for 'none'; in that case do an untyped label lookup.
    entity_URI = find_entity_given_label(
        final_entity_res['res'], Q_values.get(entity_type, 'none')
    )

    # --- Process relation ---
    relation_list = list(P_values.keys())
    final_relation_res = find_closest_match_in_a_List(relation, relation_list)
    if final_relation_res == -1:
        final_relation_res = {'res': relation, 'res_ind': -1}
    relation_URI = find_entity_given_label(final_relation_res['res'], "none")

    # Do not build a query around the -1 sentinel.
    if entity_URI == -1 or relation_URI == -1:
        return -1

    return find_something_about_an_entity(entity_URI, relation_URI)
    
    
    
 
# Examples with a misspelled entity and relation: one relation that is close
# to a P_values key, and one that is resolved by its graph label only.
res = deal_with_KG_query('Forest Gump','dicrector')
print(res)

res = deal_with_KG_query('Forest Gump','nominated for')
print(res)

http://www.wikidata.org/entity/Q187364
http://www.wikidata.org/entity/Q102427


In [24]:
# Typed lookup: resolve the label to a node that is an instance of film.
find_entity_given_label('Forrest Gump', Q_values['film'])

rdflib.term.URIRef('http://www.wikidata.org/entity/Q134773')

In [38]:
# Full IRI of property P18 (the 'image' entry of P_values).
print(WDT['P18'])

http://www.wikidata.org/prop/direct/P18


# Processing question


In [25]:
# Sample user questions used to exercise the question-processing steps below.
sample_questions = [
    "Who is the director of Good Will Hunting?",
    "Who directed The Bridge on the River Kwai?",
    "Who is the director of Star Wars: Episode VI - Return of the Jedi?",
    "Who is the screenwriter of The Masked Gang: Cyprus?",
    "What is the MPAA film rating of Weathering with You?",
    "What is the genre of Good Neighbors?",
    "Show me a picture of Halle Berry.",
    "What does Julia Roberts look like?",
    "Let me know what Sandra Bullock looks like.",
    "Recommend movies similar to Hamlet and Othello.",
    "Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?",
    "Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.",
    "Can you tell me the publication date of Tom Meets Zizou?",
    "Who is the executive producer of X-Men: First Class?",
    "Who is the Director of Batman 1989?",
    "What is the box office of The Princess and the Frog?",
    "What is the birthplace of Christopher Nolan?",
]

In [26]:
# One record per sample question; 'type' starts empty and is filled in later
# by find_type.
# NOTE: despite its name, questions_df is a plain list of dicts, not a DataFrame.
questions_df = [{"query": s, "type" : ""}for s in sample_questions]
questions_df

[{'query': 'Who is the director of Good Will Hunting?', 'type': ''},
 {'query': 'Who directed The Bridge on the River Kwai?', 'type': ''},
 {'query': 'Who is the director of Star Wars: Episode VI - Return of the Jedi?',
  'type': ''},
 {'query': 'Who is the screenwriter of The Masked Gang: Cyprus?', 'type': ''},
 {'query': 'What is the MPAA film rating of Weathering with You?', 'type': ''},
 {'query': 'What is the genre of Good Neighbors?', 'type': ''},
 {'query': 'Show me a picture of Halle Berry.', 'type': ''},
 {'query': 'What does Julia Roberts look like?', 'type': ''},
 {'query': 'Let me know what Sandra Bullock looks like.', 'type': ''},
 {'query': 'Recommend movies similar to Hamlet and Othello.', 'type': ''},
 {'query': 'Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?',
  'type': ''},
 {'query': 'Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.',
  'type': ''},
 {'query': 'Can you tell me th

## Pattern Matching

## Named Entity Recognition


In [27]:
# Tokenizer and token-classification model used for part-of-speech tagging.
# NOTE(review): the chained assignment also binds the global name `model`, which
# a later cell rebinds — confirm the extra alias is intended.
tokenizer_POS = AutoTokenizer.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
model_POS = model = AutoModelForTokenClassification.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")

In [28]:
from transformers import pipeline

# Token-classification pipeline built from model_POS / tokenizer_POS.
# aggregation_strategy="simple" returns grouped spans ('entity_group', 'word', ...).
token_classifier = pipeline(
    "token-classification",
    model=model_POS,
    tokenizer=tokenizer_POS,
    aggregation_strategy="simple",
)

# POS-tag every question and store the result under its 'pos' key
for i, question in enumerate(questions_df):
    question["pos"] = token_classifier(question["query"])
questions_df[0]

{'query': 'Who is the director of Good Will Hunting?',
 'type': '',
 'pos': [{'entity_group': 'PRON',
   'score': 0.99944025,
   'word': 'who',
   'start': 0,
   'end': 3},
  {'entity_group': 'AUX',
   'score': 0.9970728,
   'word': 'is',
   'start': 4,
   'end': 6},
  {'entity_group': 'DET',
   'score': 0.99954873,
   'word': 'the',
   'start': 7,
   'end': 10},
  {'entity_group': 'NOUN',
   'score': 0.986273,
   'word': 'director',
   'start': 11,
   'end': 19},
  {'entity_group': 'ADP',
   'score': 0.9994654,
   'word': 'of',
   'start': 20,
   'end': 22},
  {'entity_group': 'PROPN',
   'score': 0.6715452,
   'word': 'good',
   'start': 23,
   'end': 27},
  {'entity_group': 'NOUN',
   'score': 0.6608364,
   'word': 'will',
   'start': 28,
   'end': 32},
  {'entity_group': 'PROPN',
   'score': 0.77655363,
   'word': 'hunting',
   'start': 33,
   'end': 40},
  {'entity_group': 'PUNCT',
   'score': 0.9996462,
   'word': '?',
   'start': 40,
   'end': 41}]}

In [29]:
# Find the type of question by keyword matching
def find_type(formulated_question_df):
    """Classify a POS-tagged question by keyword.

    Args:
        formulated_question_df: Question record whose ``'pos'`` entry is a list
            of dicts, each carrying the tagged text under ``'word'``.

    Returns:
        ``"images"`` if an image keyword occurs, else ``"recommendation"`` if a
        recommendation keyword occurs, else ``"search"``.
    """
    keywords_images = {'image', 'picture', 'look', 'looks'}
    keywords_recommendation = {'similar', 'recommend', 'recommendations'}

    # A tagged group can hold several words (e.g. 'river kwai'), so split each
    # group into single words before comparing against the keyword sets.
    query_words = {
        token
        for pos_res in formulated_question_df['pos']
        for token in pos_res['word'].split()
    }

    if query_words & keywords_images:
        return "images"
    if query_words & keywords_recommendation:
        return "recommendation"
    return "search"

# Store the detected question type on every record
for i, question in enumerate(questions_df):
    question['type'] = find_type(question)

In [30]:
# Inspect one fully annotated record (query, type and POS groups).
questions_df[1]

{'query': 'Who directed The Bridge on the River Kwai?',
 'type': 'search',
 'pos': [{'entity_group': 'PRON',
   'score': 0.9994,
   'word': 'who',
   'start': 0,
   'end': 3},
  {'entity_group': 'VERB',
   'score': 0.99945265,
   'word': 'directed',
   'start': 4,
   'end': 12},
  {'entity_group': 'DET',
   'score': 0.99950266,
   'word': 'the',
   'start': 13,
   'end': 16},
  {'entity_group': 'NOUN',
   'score': 0.9979869,
   'word': 'bridge',
   'start': 17,
   'end': 23},
  {'entity_group': 'ADP',
   'score': 0.9993905,
   'word': 'on',
   'start': 24,
   'end': 26},
  {'entity_group': 'DET',
   'score': 0.99918383,
   'word': 'the',
   'start': 27,
   'end': 30},
  {'entity_group': 'PROPN',
   'score': 0.9451487,
   'word': 'river kwai',
   'start': 31,
   'end': 41},
  {'entity_group': 'PUNCT',
   'score': 0.9996606,
   'word': '?',
   'start': 41,
   'end': 42}]}

In [31]:
# find the entity for a question of type images
def get_entity_for_images(formulated_question_df):
    """Return the text of the first proper-noun group of a tagged question.

    Args:
        formulated_question_df: Question record whose ``'pos'`` entry is a list
            of dicts with ``'entity_group'`` and ``'word'`` keys.

    Returns:
        The ``'word'`` of the first group tagged ``'PROPN'``, or ``-1`` when the
        question contains none.
    """
    return next(
        (
            pos_res['word']
            for pos_res in formulated_question_df['pos']
            if pos_res['entity_group'] == 'PROPN'
        ),
        -1,
    )
    

In [40]:
# main function to answer image questions
def handle_image_questions(formulated_question_df):
    """Answer an image question with the image (P18) of the entity it names.

    The first proper-noun group of the question is fuzzy-matched against
    ``movies_list`` and ``humans_list``. The better candidate is resolved to a
    graph node and its ``WDT['P18']`` value is returned.

    Args:
        formulated_question_df: Question record with ``'type'`` and ``'pos'``.

    Returns:
        The first P18 value of the resolved entity, or ``-1`` when the question
        is not of type ``'images'``, names no proper noun, or the entity cannot
        be resolved.

    Side effects:
        Prints the matched label and the resolved entity IRI.
    """
    if formulated_question_df['type'] != 'images':
        return -1

    name = get_entity_for_images(formulated_question_df)
    if name == -1:
        # No proper noun in the question, so there is nothing to look up.
        return -1

    movie_res = find_closest_match_in_a_List(name, movies_list)
    human_res = find_closest_match_in_a_List(name, humans_list)
    match_list = [cand['res'] for cand in (movie_res, human_res) if cand != -1]

    final_entity_res = find_closest_match_in_a_List(name, match_list)

    entity_type = 'none'
    if final_entity_res == -1:
        # Nothing similar in either list: fall back to the extracted text.
        final_entity_res = {'res': name, 'res_ind': -1}
    # Either candidate may be the -1 sentinel, so check before subscripting.
    elif human_res != -1 and final_entity_res['res'] == human_res['res']:
        entity_type = 'human'
    elif movie_res != -1 and final_entity_res['res'] == movie_res['res']:
        entity_type = 'film'
    print(final_entity_res['res'])

    # Q_values has no entry for 'none'; in that case do an untyped label lookup.
    entity_URI = find_entity_given_label(
        final_entity_res['res'], Q_values.get(entity_type, 'none')
    )
    print(entity_URI)

    if entity_URI == -1:
        return -1

    return find_something_about_an_entity(entity_URI, WDT['P18'])
    
    
# Run the image handler on every sample question; -1 is printed for questions
# that are not of type 'images'.
for q in questions_df:
    res = handle_image_questions(q)
    print(res)

-1
-1
-1
-1
-1
-1
Halle Berry
http://www.wikidata.org/entity/Q1033016
https://commons.wikimedia.org/wiki/File:Halle_Berry_by_Gage_Skidmore_2.jpg
Julia Roberts
http://www.wikidata.org/entity/Q40523
https://commons.wikimedia.org/wiki/File:Julia_Roberts_(43838880775).jpg
Sandra Bullock
http://www.wikidata.org/entity/Q40791
https://commons.wikimedia.org/wiki/File:Sandra_Bullock,_The_Heat,_London,_2013_(crop).jpg
-1
-1
-1
-1
-1
-1
-1
-1


In [None]:
def get_entity_relation_for_search(formulated_question_df):
    """Extract the entity and relation of a search question (not implemented yet).

    Args:
        formulated_question_df: Question record; presumably the same dict shape
            the other handlers take ('query', 'type', 'pos') — TODO confirm.

    Raises:
        NotImplementedError: always, until the extraction logic is written.
    """
    # A def without a body is a SyntaxError, so fail explicitly until implemented.
    raise NotImplementedError("get_entity_relation_for_search is not implemented yet")

In [10]:
# Batch collator for token classification and the seqeval metric.
# NOTE(review): `tokenizer` is not defined in any visible cell (only
# `tokenizer_POS` is) — this cell depends on hidden kernel state; confirm.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
seqeval = evaluate.load("seqeval")
# labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    """Compute overall seqeval scores for a (predictions, labels) pair.

    Predictions are reduced with ``argmax`` over axis 2. Positions whose label
    id is ``-100`` are skipped; the remaining ids are mapped to names through
    ``label_list`` before scoring.

    Returns:
        A dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken
        from seqeval's ``overall_*`` results.
    """
    raw_scores, gold_ids = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label id.
        kept_pairs = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept_pairs])
        true_labels.append([label_list[gold_id] for _, gold_id in kept_pairs])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        metric: results["overall_" + metric]
        for metric in ("precision", "recall", "f1", "accuracy")
    }

In [11]:
# Tag names listed in id order; id2label numbers them from 0 and label2id is
# the exact inverse mapping.
id2label = dict(enumerate([
    "O",
    "B-corporation",
    "I-corporation",
    "B-creative-work",
    "I-creative-work",
    "B-group",
    "I-group",
    "B-location",
    "I-location",
    "B-person",
    "I-person",
    "B-product",
    "I-product",
]))
label2id = {tag: tag_id for tag_id, tag in id2label.items()}

In [12]:
# NOTE(review): this rebinds `model` to the POS checkpoint without passing
# id2label/label2id, while the captured warning for this cell mentions
# distilbert-base-uncased — the saved output looks stale; confirm which
# checkpoint is intended.
model = AutoModelForTokenClassification.from_pretrained(    "vblagoje/bert-english-uncased-finetuned-pos")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [20]:
# Fine-tuning setup: 2 epochs, evaluation and checkpoint after every epoch, and
# the best checkpoint reloaded at the end; nothing is pushed to the Hub.
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# NOTE(review): `tokenized_wnut` and `tokenizer` are not defined in any visible
# cell — this cell depends on hidden kernel state; confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3394
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 426
  Number of trainable parameters = 66372877


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.26237,0.536776,0.317887,0.399302,0.943226
2,No log,0.272362,0.562033,0.348471,0.430206,0.945449


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1287
  Batch size = 16
Saving model checkpoint to my_awesome_wnut_model\checkpoint-213
Configuration saved in my_awesome_wnut_model\checkpoint-213\config.json
Model weights saved in my_awesome_wnut_model\checkpoint-213\pytorch_model.bin
tokenizer config file saved in my_awesome_wnut_model\checkpoint-213\tokenizer_config.json
Special tokens file saved in my_awesome_wnut_model\checkpoint-213\special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClass

TrainOutput(global_step=426, training_loss=0.10359365615486539, metrics={'train_runtime': 18.7858, 'train_samples_per_second': 361.336, 'train_steps_per_second': 22.677, 'total_flos': 92090981263080.0, 'train_loss': 0.10359365615486539, 'epoch': 2.0})

In [None]:
# Quick NER check on a sample sentence.
# NOTE(review): this loads "stevhliu/my_awesome_wnut_model" by name rather than
# the local "my_awesome_wnut_model" output directory used for training —
# confirm which model is meant.
text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model")
classifier(text)

In [47]:
# Print every aggregated entity with its group and score.
# NOTE(review): `ner_pipeline` and `sample_question` are not defined in any
# visible cell — this cell depends on hidden kernel state; confirm.
entities = ner_pipeline(sample_question, aggregation_strategy="simple")
for entity in entities:
    print(f"{entity['word']}: {entity['entity_group']} ({entity['score']:.2f})")

Batman: ORG (0.92)


## Getting Relation

In [32]:

# A naive way of matching entities and relations with a fixed question pattern.
# NOTE(review): `entity` and `question` are not defined in any visible cell (the
# saved output of this cell is a NameError) — define them before running.

question_pattern = "who is the (.*) of ENTITY"

print("question pattern: {}\n".format(question_pattern))

# The entity is literal text, not a regex: escape it so titles containing
# characters such as '(' or '?' are replaced correctly.
question = re.sub(re.escape(entity), "ENTITY", question.rstrip("?"))  # preprocess the question

# Match the relation using the pattern; fail with a clear message when the
# question does not have the expected shape.
relation_match = re.match(question_pattern, question)
if relation_match is None:
    raise ValueError("question does not match the pattern: {!r}".format(question))
relation = relation_match.group(1)

print("recognized relation: {}\n".format(relation))

question pattern: who is the (.*) of ENTITY



NameError: name 'entity' is not defined

## Node Matching


In [None]:
# Build lookup tables: IRI -> display text for URI nodes and for predicates.
# NOTE(review): `g` and `n` are not defined in any visible cell (the graph
# loaded earlier is named `graph`) — this cell depends on hidden kernel state.
nodes = {}
predicates = {}

for node in g.all_nodes():
    if isinstance(node, URIRef):
        # Prefer the node's n.label value; otherwise use the IRI with the
        # http://example.org/ prefix removed.
        if g.value(node, n.label):
            nodes[node.toPython()] = g.value(node, n.label).toPython()
        else:
            nodes[node.toPython()] = re.sub("http://example.org/", "", node.toPython())

# Every predicate is keyed by its IRI and shown without the example.org prefix.
for s, p, o in g:
    predicates[p.toPython()] = re.sub("http://example.org/", "", p.toPython())

print("labeled nodes: {}\n".format(nodes))
print("predicates: {}\n".format(predicates))

In [None]:

# Pick the node and the predicate whose display text has the smallest edit
# distance to the recognised entity and relation (first minimum wins).
tmp = 9999
match_node = ""
print("--- entity matching for \"{}\"\n".format(entity))
for key, value in nodes.items():
    node_distance = editdistance.eval(value, entity)
    print("edit distance between {} and {}: {}".format(value, entity, node_distance))
    if node_distance < tmp:
        tmp, match_node = node_distance, key

tmp = 9999
match_pred = ""
print("\n--- relation matching for \"{}\"\n".format(relation))
for key, value in predicates.items():
    pred_distance = editdistance.eval(value, relation)
    print("edit distance between {} and {}: {}".format(value, relation, pred_distance))
    if pred_distance < tmp:
        tmp, match_pred = pred_distance, key

print("\n--- the matching node of \"{}\" is {}\n".format(entity, match_node))
print("--- the matching predicates of \"{}\" is {}\n".format(relation, match_pred))

# Query

## Query Generation

In [None]:
# Build the query from the matched predicate and node: find every ?x linked to
# the matched node through the matched predicate, together with its n.label value.
# NOTE(review): `g` and `n` are not defined in any visible cell — confirm.
query_template = "SELECT DISTINCT ?x ?y WHERE {{ ?x <{}> <{}>. ?x <{}> ?y. }}".format(match_pred, match_node, n.label)

print("--- sparql query: {}".format(query_template))

qres = g.query(query_template)

print("\n--- querying results: ")
for row in qres:
    print(row.x, row.y)
    # `answer` keeps the label of the last row only.
    answer = row.y

## Answer Sentence Generation

In [5]:
# Load the MovieNet image metadata into a DataFrame (path relative to '../dataset').
images_json = pd.read_json(os.path.join('..','dataset/movienet/images.json')) 

In [6]:
# Preview the first rows of the image metadata.
images_json.head()

Unnamed: 0,w,movie,img,h,type,cast
0,1666,[tt4882376],0315/rm601699072.jpg,1000,behind_the_scenes,"[nm8801745, nm0001401]"
1,999,[tt2318625],2538/rm814292736.jpg,562,still_frame,[nm2072214]
2,1500,[tt4003966],0354/rm2068192512.jpg,1000,still_frame,[nm0268626]
3,1333,[],3777/rm27402752.jpg,1000,publicity,[nm6655379]
4,704,[],3459/rm537652736.jpg,1000,event,"[nm1577190, nm7097953]"
