Dependencies

In [None]:
!pip install nltk &> /dev/null
!pip uninstall numpy --y &> /dev/null
!python -m pip install numpy==1.14.5 &> /dev/null
!pip install spacy==2.1.0 &> /dev/null
!pip install folium==0.2.1 &> /dev/null
!pip install neuralcoref --no-binary neuralcoref &> /dev/null
!python -m spacy download en &> /dev/null

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

Loading file (mount notebook in the directory where the file is)

In [None]:
# Filename
fn='HP-Wikipedia.txt'


def loadText (filename=fn):
    with open(filename) as f:
        text = f.read()
    f.close()
    return text


text = loadText()
print(text)

Coreference resolution

In [None]:
import spacy
import neuralcoref

# Load SpaCy
nlp = spacy.load('en')

# Add neural coref to SpaCy's pipe
neuralcoref.add_to_pipe(nlp)

#Function that executes coreference resolution on a given text
def coref_resolution(text):
    
    doc = nlp(text) 

    # fetches tokens with whitespaces from spacy document
    tok_list = list(token.text_with_ws for token in doc)

    for cluster in doc._.coref_clusters:
      
        # get tokens from representative cluster name
        cluster_main_words = set(cluster.main.text.split(' '))
        for coref in cluster:
            if coref != cluster.main:  
                
                # if coreference element is not the representative element of that cluster
                if coref.text != cluster.main.text and bool(set(coref.text.split(' ')).intersection(cluster_main_words)) == False:
                    
                    # if coreference element text and representative element text are not equal and none of the coreference element words are in representative element. 
                    # This was done to handle nested coreference scenarios
                    tok_list[coref.start] = cluster.main.text + doc[coref.end-1].whitespace_
                    for i in range(coref.start+1, coref.end):
                        tok_list[i] = ""

    return "".join(tok_list)

# Getting coreferenced text
coeref_text = coref_resolution(text)
print(coeref_text)

Wikifier

In [9]:
import urllib
from string import punctuation
import nltk
import json

# Function that fetches entity linking results from wikifier.com API
def wikifier(text, lang="en", threshold=0.8):
    uris_is=[]
    URI="http://group6.com/"
    
    # Prepare the URL for API call
    data = urllib.parse.urlencode([
        ("text", text), ("lang", lang),
        ("userKey", "tgbdmkpmkluegqfbawcwjywieevmza"),
        ("pageRankSqThreshold", "%g" %
         threshold), ("applyPageRankSqThreshold", "true"),
        ("nTopDfValuesToIgnore", "100"), ("nWordsToIgnoreFromList", "100"),
        ("wikiDataClasses", "true"), ("wikiDataClassIds", "false"),
        ("support", "true"), ("ranges", "false"), ("minLinkFrequency", "2"),
        ("includeCosines", "false"), ("maxMentionEntropy", "3")
    ])
    url = "http://www.wikifier.org/annotate-article"

    # Call the Wikifier and read the response
    req = urllib.request.Request(url, data=data.encode("utf8"), method="POST")
    with urllib.request.urlopen(req, timeout=60) as f:
        response = f.read()
        response = json.loads(response.decode("utf8"))

    
    results = list()
    subjects_found = set()

    # Output the results
    print(response["annotations"])

    # Scroll through the results building "isA" triples and associated URIs
    for annotation in response["annotations"]:
        if ('wikiDataClasses' in annotation):

          text_reference = ""
          text_reference_new = ""
          for el in annotation["support"]:

            # Getting text reference by looking at starting character and ending character of each reference found in the text
            # The longest reference (highest number of charachter) is taken as final reference
            ch_start = el['chFrom']
            ch_stop = el['chTo']
            if ch_start < len(coeref_text) and ch_stop < len(coeref_text):
              text_reference_new = coeref_text[ch_start:ch_stop+1]
              if text_reference_new[-2] == '\'':
                text_reference_new = text_reference_new[0:-2]

              if len(text_reference_new) > len(text_reference):
                text_reference = text_reference_new
          
          # If the text reference and the title of the annotation doesn't match, another "is a" triple 
          # (and an associated URI) is added with the right annotation name 
          if text_reference != annotation['title']:
            results.append(str(text_reference + " is a " + annotation['title']))
            uris_is.append(["<"+URI+"subject#"+text_reference.replace(" ","_")+">", "<"+URI+"pred#is", "<"+URI+"obj#"+annotation['title'].replace(" ","_")+">"])
          else:
            text_reference = annotation['title']
          
          # (ex. if we have "Professor Snape" in the text and the annotation is "Severus Snape", it's build the triple "Professor Snape is a Severus Snape")
          
          subjects_found.add(text_reference)
          subjects_found.add(annotation['title'])

          # Saving "isA" triples and URIs
          for label in annotation['wikiDataClasses']:
            results.append(str(text_reference + " is a " + label['enLabel']))
            uris_is.append(["<"+URI+"subject#"+text_reference.replace(" ","_")+">", "<"+URI+"pred#is", "<"+URI+"obj#"+label['enLabel'].replace(" ","_")+">"])

            
    # Returning "is a" triples, URIs and subjects (text reference with matching annotations on Wikipedia)
    return results, uris_is, subjects_found

In [10]:
# Getting "is a" Triples, associated URIs and subjects
results, uris_is, subjects = wikifier(coeref_text)

[{'title': 'Protagonist', 'url': 'http://en.wikipedia.org/wiki/Protagonist', 'lang': 'en', 'pageRank': 0.001148130838569099, 'secLang': 'en', 'secTitle': 'Protagonist', 'secUrl': 'http://en.wikipedia.org/wiki/Protagonist', 'wikiDataItemId': 'Q215972', 'wikiDataClasses': [{'itemId': 'Q636497', 'enLabel': 'character type'}, {'itemId': 'Q4070702', 'enLabel': 'literary archetype'}, {'itemId': 'Q4897819', 'enLabel': 'role'}, {'itemId': 'Q131714', 'enLabel': 'archetype'}, {'itemId': 'Q35120', 'enLabel': 'entity'}, {'itemId': 'Q1207505', 'enLabel': 'quality'}, {'itemId': 'Q21146257', 'enLabel': 'type'}, {'itemId': 'Q937228', 'enLabel': 'property'}, {'itemId': 'Q16889133', 'enLabel': 'class'}, {'itemId': 'Q5127848', 'enLabel': 'class'}, {'itemId': 'Q7184903', 'enLabel': 'abstract object'}, {'itemId': 'Q488383', 'enLabel': 'object'}], 'dbPediaTypes': [], 'dbPediaIri': 'http://dbpedia.org/resource/Protagonist', 'supportLen': 6, 'support': [{'wFrom': 1, 'wTo': 2, 'chFrom': 4, 'chTo': 20, 'pMentio

Triples and URIs obtained from wikifier

In [11]:
for x in results:
  print(x)

for y in uris_is:
  print(y)

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Surrey is a spatio-temporal entity
Cedric Diggory is a List of supporting Harry Potter characters
Cedric Diggory is a Wikimedia list of fictional characters
Cedric Diggory is a Wikimedia list article
Cedric Diggory is a list
Cedric Diggory is a Wikimedia internal item
Cedric Diggory is a MediaWiki main-namespace page
Cedric Diggory is a collection
Cedric Diggory is a information
Cedric Diggory is a MediaWiki page
Cedric Diggory is a object
Cedric Diggory is a group
Cedric Diggory is a abstract object
Cedric Diggory is a web page
Cedric Diggory is a entity
Cedric Diggory is a electronic page
Cedric Diggory is a document
Cedric Diggory is a electronic media
Cedric Diggory is a content
Cedric Diggory is a work
Cedric Diggory is a communication medium
Cedric Diggory is a text
Cedric Diggory is a written work
Cedric Diggory is a information resource
Cedric Diggory is a content
Cedric Diggory is a product
Cedric Diggory is a art

Obtaining the main subjects and subsequent filtering

In [12]:
print("\nSubjects\n")
for subject in subjects:
  print(subject)


# Getting subjects composed by more words
print("\More Words Subjects\n")
subject_more_words = set()
for subject in subjects:
  if ' ' in subject or '-' in subject or '.' in subject or '\'' in subject:
    subject_more_words.add(subject)


for subject in subject_more_words:
  print(subject)


Subjects

Defence Against the Dark Arts
alchemy
List of supporting Harry Potter characters
wizarding world
Order of the Phoenix (fictional organisation)
mythology
Alastor "Mad-Eye" Moody
Philosopher's stone
Ginny Weasley
Surrey
spells
Folklore
Philosopher’s Stone
Wand
World War II
Quidditch
Fantasy
Sirius Black
Harry Potter and the Order of the Phoenix
Harry Potter and the Philosopher's Stone
Harry Potter and the Half-Blood Prince
Wizarding World
Cedric Diggory
Fred Weasley
Philosopher's Stone
The Lord of the Rings
Magic (supernatural)
Rubeus Hagrid
Human
Dumbledore's Army
Harry Potter and the Deathly Hallows
Lord Voldemort
Professor Snape
Alchemy
Harry Potter books
Magic in Harry Potter
Witchcraft
Magical creatures in Harry Potter
Magical objects in Harry Potter
Tom Marvolo Riddle
Barty Crouch, Jr.
Hogwarts staff
Harry Potter and the Chamber of Secrets
Harry Potter and the Goblet of Fire
Fictional universe of Harry Potter
Places in Harry Potter
fantasy
Harry Potter and the Prisoner o

Sentences Tokenization

In [13]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenize sentences
print("\nSplitting in sentences\n")
tokenized_sentences = sent_tokenize(coeref_text)
print(tokenized_sentences)

# Tokenize words by sentences
print("\nSplitting in tokens\n")
i = 0
while i < len(tokenized_sentences):
  tokenized_sentences[i] = word_tokenize(tokenized_sentences[i])
  i += 1
print(tokenized_sentences)


Splitting in sentences

['The central character in the series is Harry Potter, a boy who lives in the fictional town of Little Whinging, Surrey with The central character in the series aunt, uncle, and cousin – the Dursleys – and discovers at the age of eleven that The central character in the series is a wizard, though The central character in the series lives in the ordinary world of non-magical people known as Muggles.', 'The wizarding world exists parallel to the Muggle world, albeit hidden and in secrecy.', 'The central character in the series magical ability is inborn, and children with such abilities are invited to attend exclusive magic schools that teach the necessary skills to succeed in the wizarding world.', 'Harry becomes a student at Hogwarts School of Witchcraft and Wizardry, a wizarding academy in Scotland, and it is here where most of the events in the series take place.', "As Harry develops through Harry adolescence, Harry learns to overcome the problems that face Ha

We bring together the main subjects, previously divided by tokenization

In [14]:
# Function to merge words belonging to the same subject composed of more words, 
# that have been divided by tokenization
def merge_subjects(words):
  i = 0
  new_tokenized = words.copy()
  while(i < len(new_tokenized)):
    subject = ""
    j = 0

    while (subject not in subject_more_words) and (j < 5) and (i+j < len(new_tokenized)):
      if j == 0:
        subject += new_tokenized[i+j]
      else:
        subject += " " + new_tokenized[i+j]
      j += 1
    
    if j != 5 and i+j < len(new_tokenized):
      new_tokenized[i+j-1] = subject
      j -= 1
      while j > 0:
        del new_tokenized[i+j-1]
        j -= 1
    
    i += 1
  
  return new_tokenized

In [15]:
# Merging words for obtaining composed subjects for each sentence
i = 0
while i < len(tokenized_sentences):
  tokenized_sentences[i] = merge_subjects(tokenized_sentences[i])
  i += 1

print(tokenized_sentences)

[['The', 'central character', 'in', 'the', 'series', 'is', 'Harry Potter', ',', 'a', 'boy', 'who', 'lives', 'in', 'the', 'fictional', 'town', 'of', 'Little Whinging', ',', 'Surrey', 'with', 'The', 'central character', 'in', 'the', 'series', 'aunt', ',', 'uncle', ',', 'and', 'cousin', '–', 'the', 'Dursleys', '–', 'and', 'discovers', 'at', 'the', 'age', 'of', 'eleven', 'that', 'The', 'central character', 'in', 'the', 'series', 'is', 'a', 'wizard', ',', 'though', 'The', 'central character', 'in', 'the', 'series', 'lives', 'in', 'the', 'ordinary', 'world', 'of', 'non-magical', 'people', 'known', 'as', 'Muggles', '.'], ['The', 'wizarding world', 'exists', 'parallel', 'to', 'the', 'Muggle', 'world', ',', 'albeit', 'hidden', 'and', 'in', 'secrecy', '.'], ['The', 'central character', 'in', 'the', 'series', 'magical', 'ability', 'is', 'inborn', ',', 'and', 'children', 'with', 'such', 'abilities', 'are', 'invited', 'to', 'attend', 'exclusive', 'magic', 'schools', 'that', 'teach', 'the', 'necessa

Stopwords removal

In [16]:
from nltk.corpus import stopwords
import string

# Function to remove stopwords from sentence
def remove_stopwords(sentences):
  stop_words = set(stopwords.words('english'))
  not_stop_words = ('is', 'having', 'be',  'do', 'own', 'am', 'are', 'were', 'had', 'been', 'have', 'does', 'did', 'has', 'being', 'doing', 'was')
  pronouns = ['i', 'we', 'it', 'you', 'she', 'he', 'they', 'me', 'us', 'her', 'him', 'them'] 

  for word in not_stop_words:
    stop_words.remove(word)

  for word in pronouns:
      if word in stop_words:
          stop_words.remove(word)
  words_without_sw = [word for word in sentences if not word.lower() in stop_words and word.lower() not in string.punctuation and word != "–"]
  return words_without_sw

In [17]:
# Getting sentences without stopwords
sentences_without_sw = list()
i = 0
while i < len(tokenized_sentences):
  sentences_without_sw.append(remove_stopwords(tokenized_sentences[i]))
  i += 1

print(sentences_without_sw)

[['central character', 'series', 'is', 'Harry Potter', 'boy', 'lives', 'fictional', 'town', 'Little Whinging', 'Surrey', 'central character', 'series', 'aunt', 'uncle', 'cousin', 'Dursleys', 'discovers', 'age', 'eleven', 'central character', 'series', 'is', 'wizard', 'though', 'central character', 'series', 'lives', 'ordinary', 'world', 'non-magical', 'people', 'known', 'Muggles'], ['wizarding world', 'exists', 'parallel', 'Muggle', 'world', 'albeit', 'hidden', 'secrecy'], ['central character', 'series', 'magical', 'ability', 'is', 'inborn', 'children', 'abilities', 'are', 'invited', 'attend', 'exclusive', 'magic', 'schools', 'teach', 'necessary', 'skills', 'succeed', 'wizarding world'], ['Harry', 'becomes', 'student', 'Hogwarts', 'School', 'Witchcraft', 'Wizardry', 'wizarding', 'academy', 'Scotland', 'it', 'is', 'events', 'series', 'take', 'place'], ['Harry', 'develops', 'Harry', 'adolescence', 'Harry', 'learns', 'overcome', 'problems', 'face', 'Harry', 'magical', 'social', 'emotional

POS Tagging

In [18]:
from nltk import pos_tag

# Function to do POS tagging on sentence, removing words that are not 
# verbs or nouns
def tagging(words):
  pos_words = pos_tag(words)
  nouns_dict = ['NN', 'NNS', 'NNP', 'NNPS']
  verbs_dict = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
  sent_clean = [(x,y) for (x,y) in pos_words if (y in nouns_dict or y in verbs_dict)]

  return sent_clean

In [19]:
# Getting POS tagged sentences
tagged_sentences = list()
i = 0
while i < len(sentences_without_sw):
  tagged_sentences.append(tagging(sentences_without_sw[i]))
  i += 1

print(tagged_sentences)

[[('central character', 'NN'), ('series', 'NN'), ('is', 'VBZ'), ('Harry Potter', 'NNP'), ('boy', 'NN'), ('lives', 'VBZ'), ('town', 'NN'), ('Little Whinging', 'NNP'), ('Surrey', 'NNP'), ('central character', 'NN'), ('series', 'NN'), ('aunt', 'NN'), ('cousin', 'NN'), ('Dursleys', 'NNP'), ('discovers', 'VBZ'), ('age', 'NN'), ('series', 'NN'), ('is', 'VBZ'), ('series', 'NN'), ('lives', 'VBZ'), ('world', 'NN'), ('people', 'NNS'), ('known', 'VBN'), ('Muggles', 'NNS')], [('wizarding world', 'NN'), ('exists', 'VBZ'), ('Muggle', 'NNP'), ('world', 'NN'), ('albeit', 'NN'), ('hidden', 'NN'), ('secrecy', 'NN')], [('central character', 'NN'), ('series', 'NN'), ('ability', 'NN'), ('is', 'VBZ'), ('children', 'NNS'), ('abilities', 'NNS'), ('are', 'VBP'), ('invited', 'VBN'), ('magic', 'NN'), ('schools', 'NNS'), ('teach', 'VBP'), ('skills', 'NNS'), ('succeed', 'VB'), ('wizarding world', 'NN')], [('Harry', 'NNP'), ('becomes', 'VBZ'), ('student', 'NN'), ('Hogwarts', 'NNP'), ('School', 'NNP'), ('Witchcraft'

Lemmatization

In [20]:
from nltk.stem import WordNetLemmatizer

# Function to lemmatize verbs, converting them in base form
def lemmatization(tagged_sentence):
  nouns_dict = ['NN', 'NNS', 'NNP', 'NNPS']
  verbs_dict = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
  lemmatizer = WordNetLemmatizer()
  lemmatized_words = []
  for (x,y) in tagged_sentence:
    if y in verbs_dict:
      y = 'V'
      x = lemmatizer.lemmatize(x, 'v')
    if x in subjects:
      y = 'NNS'
    lemmatized_words.append((x,y))
  
  return lemmatized_words

In [21]:
# Getting lemmatized sentences
lemmatized_sentences = list()
i = 0
while i < len(tagged_sentences):
  lemmatized_sentences.append(lemmatization(tagged_sentences[i]))
  i += 1

print(lemmatized_sentences)

[[('central character', 'NNS'), ('series', 'NN'), ('be', 'V'), ('Harry Potter', 'NNS'), ('boy', 'NN'), ('live', 'V'), ('town', 'NN'), ('Little Whinging', 'NNS'), ('Surrey', 'NNS'), ('central character', 'NNS'), ('series', 'NN'), ('aunt', 'NN'), ('cousin', 'NN'), ('Dursleys', 'NNP'), ('discover', 'V'), ('age', 'NN'), ('series', 'NN'), ('be', 'V'), ('series', 'NN'), ('live', 'V'), ('world', 'NN'), ('people', 'NNS'), ('know', 'V'), ('Muggles', 'NNS')], [('wizarding world', 'NNS'), ('exist', 'V'), ('Muggle', 'NNS'), ('world', 'NN'), ('albeit', 'NN'), ('hidden', 'NN'), ('secrecy', 'NN')], [('central character', 'NNS'), ('series', 'NN'), ('ability', 'NN'), ('be', 'V'), ('children', 'NNS'), ('abilities', 'NNS'), ('be', 'V'), ('invite', 'V'), ('magic', 'NN'), ('schools', 'NNS'), ('teach', 'V'), ('skills', 'NNS'), ('succeed', 'V'), ('wizarding world', 'NNS')], [('Harry', 'NNP'), ('become', 'V'), ('student', 'NN'), ('Hogwarts', 'NNS'), ('School', 'NNP'), ('Witchcraft', 'NNS'), ('Wizardry', 'NNP'

Taking synset from WordNet

In [22]:
from nltk.corpus import wordnet as wn

# Getting synonyms of each verb in sentences by WordNet (the synonyms are only verbs too) 
verbs_synonyms = {}
for sentence in lemmatized_sentences:
  for (x,y) in sentence:
    if y == 'V':
      verbs_synonyms[x] =  {str(lemma.name()).split('.')[0] for lemma in wn.synsets(x, pos=wn.VERB) if str(lemma.name()).split('.')[0] != x }
print(verbs_synonyms)

{'be': {'embody', 'cost', 'equal', 'exist', 'constitute'}, 'live': {'be', 'survive', 'exist', 'populate', 'know'}, 'discover': {'fall_upon', 'unwrap', 'learn', 'identify', 'detect'}, 'know': {'acknowledge', 'sleep_together'}, 'exist': set(), 'invite': {'tempt', 'receive'}, 'teach': set(), 'succeed': set(), 'become': set(), 'wizarding': set(), 'take': {'choose', 'lead', 'drive', 'assume', 'bring', 'subscribe', 'lease', 'contain', 'necessitate', 'consider', 'carry', 'accept', 'fill', 'contract', 'claim', 'film', 'remove', 'aim', 'consume', 'learn'}, 'develop': {'explicate', 'originate', 'build_up', 'train', 'break', 'evolve', 'grow', 'modernize'}, 'learn': {'memorize', 'determine', 'teach'}, 'overcome': {'get_the_better_of', 'get_the_best', 'overwhelm'}, 'face': {'confront', 'front'}, 'include': {'admit'}, 'friendships': set(), 'schoolwork': set(), 'lie': {'lie_down', 'dwell'}, 'contain': {'check', 'control', 'incorporate', 'hold'}, 'experience': {'have', 'feel', 'know'}, 'view': {'see',

Finding Triples obtained from Lemmatization and POS Tagging

In [23]:
# Function to extrapolate "Subject-Predicate-Object" triples and associated URIs from a sentence
def find_triples(lemmatized_sentence, verbs_synonyms, URI="http://group6.com/"):
  triples = []
  uris = []
  nouns_dict = ['NN', 'NNS', 'NNP', 'NNPS']
  verbs_dict = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
  i=0
  len_words = len(lemmatized_sentence)
  last_previous_token = ""
  
  while(i < len_words):
    triple = ""
    subject_flag = 0
    obj_flag = 0
    pred_flag = 0
    subject_treshold = 2
    pred_treshold = 1
    subject = ""
    pred = ""
    obj = ""

    # Building subject part
    while(i < len_words and lemmatized_sentence[i][1] in nouns_dict):
      subject += lemmatized_sentence[i][0] + " "
      i += 1
      subject_flag += 1
    
    # Removing possible duplicates
    subject = ' '.join([x for i,x in enumerate(subject.split()) if i==0 or x!=subject.split()[i-1]])

    # Adding subject to the triple
    triple += subject + " "

    # Building predicate part
    while(i < len_words and lemmatized_sentence[i][1] == 'V'):
      pred += lemmatized_sentence[i][0] + " "
      i += 1
      pred_flag += 1
    
    # Removing possible duplicates
    pred = ' '.join([x for i,x in enumerate(pred.split()) if i==0 or x!=pred.split()[i-1]])

    # Adding subject to the triple
    triple += pred + " "

    # Getting object
    if(i < len_words and lemmatized_sentence[i][1] in nouns_dict):
        obj = lemmatized_sentence[i][0]
        obj_flag += 1
    
    # Adding object to triple
    triple += obj

    # Checking if every part of the triple have been taken and checking maximum number of words in subject and predicate part
    # If there are more words then the allowed ones the triple is not valid 
    if (subject_flag > 0 and subject_flag <= subject_treshold) and (pred_flag > 0 and pred_flag <= pred_treshold) and obj_flag > 0:

      # If the check is passed the triple is saved
      triples.append(triple)

      # Building and saving URI
      uris.append(["<"+URI+"subject#"+subject.replace(" ","_")+">", "<"+URI+"pred#"+pred.replace(" ","_")+">", "<"+URI+"obj#"+obj.replace(" ","_")+">"])

      # Building new triples by using verb's synonyms obtained by WordNet
      for verb in pred.split(" "):
        if verb in verbs_synonyms:
          for synonym in verbs_synonyms[verb]:

            # Changing predicate part
            syn_pred = pred.replace(verb, synonym)
            
            # Building new triple
            new_triple = subject + " " + syn_pred + " " + obj
            
            # Saving new triple
            triples.append(new_triple)
            
            # Building and saving new URI
            uris.append(["<"+URI+"subject#"+subject.replace(" ","_")+">", "<"+URI+"pred#"+syn_pred.replace(" ","_")+">", "<"+URI+"obj#"+obj.replace(" ","_")+">"])
  
  # Returning sentence lists of "Subject-Predicate-Object" triples and associated URIs
  return triples, uris

In [24]:
triples_total = list()
uris_total = list()
i = 0

# Getting "Subject-Predicate-Object" triples and associated URIs by text sentences
while i < len(lemmatized_sentences):
  triple, uri = find_triples(lemmatized_sentences[i], verbs_synonyms)
  triples_total.append(triple)
  uris_total.append(uri)
  i += 1
triples = [ triple for listOfTriple in triples_total for triple in listOfTriple ]
uris = [ uri for listOfUri in uris_total for uri in listOfUri ]


Triples and URIs manually obtained from text


In [25]:
i = 0
for i in range(0, len(triples)):
  print(triples[i])
  print(uris[i])
  print("\n")

central character series be Harry Potter
['<http://group6.com/subject#central_character_series>', '<http://group6.com/pred#be>', '<http://group6.com/obj#Harry_Potter>']


central character series embody Harry Potter
['<http://group6.com/subject#central_character_series>', '<http://group6.com/pred#embody>', '<http://group6.com/obj#Harry_Potter>']


central character series cost Harry Potter
['<http://group6.com/subject#central_character_series>', '<http://group6.com/pred#cost>', '<http://group6.com/obj#Harry_Potter>']


central character series equal Harry Potter
['<http://group6.com/subject#central_character_series>', '<http://group6.com/pred#equal>', '<http://group6.com/obj#Harry_Potter>']


central character series exist Harry Potter
['<http://group6.com/subject#central_character_series>', '<http://group6.com/pred#exist>', '<http://group6.com/obj#Harry_Potter>']


central character series constitute Harry Potter
['<http://group6.com/subject#central_character_series>', '<http://group6

In [26]:
# Merging "is A" and "Subject-Predicate-Object" Triples and URIs
final_triples=results+triples
final_uris=uris_is+uris


PRINT=len(final_triples) == len(final_uris)

if PRINT:
  for i in range(0, len(triples)):
    print(final_triples[i])
    print(final_uris[i])
    print("\n")

central character is a Protagonist
['<http://group6.com/subject#central_character>', '<http://group6.com/pred#is', '<http://group6.com/obj#Protagonist>']


central character is a character type
['<http://group6.com/subject#central_character>', '<http://group6.com/pred#is', '<http://group6.com/obj#character_type>']


central character is a literary archetype
['<http://group6.com/subject#central_character>', '<http://group6.com/pred#is', '<http://group6.com/obj#literary_archetype>']


central character is a role
['<http://group6.com/subject#central_character>', '<http://group6.com/pred#is', '<http://group6.com/obj#role>']


central character is a archetype
['<http://group6.com/subject#central_character>', '<http://group6.com/pred#is', '<http://group6.com/obj#archetype>']


central character is a entity
['<http://group6.com/subject#central_character>', '<http://group6.com/pred#is', '<http://group6.com/obj#entity>']


central character is a quality
['<http://group6.com/subject#central_char

Saving Triples and URIs


In [28]:
import csv

with open ('uris.csv', 'w') as f:
  write = csv.writer(f)
  write.writerows(final_uris)


textfile = open("triples.txt", "w")
for triple in final_triples:
    textfile.write(triple + "\n")
textfile.close()
