# Information Extraction from text using the NLTK package




In [1]:
# Importing libraries to be used laters
import nltk
import re
from statistics import mode
from pyld import jsonld
import json
import unidecode

In [2]:
# Load the footballers text file 
inputfile='football_players.txt' #Location of the file

# We read in the file and split the the textfile considering spaces
buf = open(inputfile, encoding = 'utf8')
list_of_doc = buf.read().split('\n')

In [3]:
# Extra lines of space in the list_of_doc is removed leaving each players doc with just one line of space
list_of_doc = [data for data in list_of_doc if data !='']

# Task 1 
Write a function that takes each document and performs:
1) sentence segmentation 2) tokenization 3) part-of-speech tagging



In [4]:
# We take in the textlines from the list_of_doc object and tokenize the sentences. Next, I Tokenized the words of the 
# sentences. The pos_tags were attached to each of the words.

def ie_preprocess(document):
    sentences = nltk.sent_tokenize(document) 
    sentences = [nltk.word_tokenize(sent) for sent in sentences] 
    pos_sentences = [nltk.pos_tag(sent) for sent in sentences] 
    return (pos_sentences)



Running the following code to check result for the first document (Ronaldo).

In [5]:
# Testing the code above to return the pos_tags of each of the words in the document sent of the players
first_doc = list_of_doc[0]
pos_sent = ie_preprocess(first_doc)
pos_sent

[[('Cristiano', 'NNP'),
  ('Ronaldo', 'NNP'),
  ('dos', 'NN'),
  ('Santos', 'NNP'),
  ('Aveiro', 'NNP'),
  (',', ','),
  ('ComM', 'NNP'),
  (',', ','),
  ('GOIH', 'NNP'),
  ('(', '('),
  ('born', 'VBN'),
  ('5', 'CD'),
  ('February', 'NNP'),
  ('1985', 'CD'),
  (')', ')'),
  ('is', 'VBZ'),
  ('a', 'DT'),
  ('Portuguese', 'JJ'),
  ('professional', 'JJ'),
  ('footballer', 'NN'),
  ('who', 'WP'),
  ('plays', 'VBZ'),
  ('for', 'IN'),
  ('Spanish', 'JJ'),
  ('club', 'NN'),
  ('Real', 'NNP'),
  ('Madrid', 'NNP'),
  ('and', 'CC'),
  ('the', 'DT'),
  ('Portugal', 'NNP'),
  ('national', 'JJ'),
  ('team', 'NN'),
  ('.', '.')],
 [('He', 'PRP'),
  ('is', 'VBZ'),
  ('a', 'DT'),
  ('forward', 'NN'),
  ('and', 'CC'),
  ('serves', 'NNS'),
  ('as', 'IN'),
  ('captain', 'NN'),
  ('for', 'IN'),
  ('Portugal', 'NNP'),
  ('.', '.')],
 [('In', 'IN'),
  ('2008', 'CD'),
  (',', ','),
  ('he', 'PRP'),
  ('won', 'VBD'),
  ('his', 'PRP$'),
  ('first', 'JJ'),
  ('Ballon', 'NNP'),
  ("d'Or", 'NN'),
  ('and', 'CC')

Expected output
 [...[('He', 'PRP'),
  ('is', 'VBZ'),
  ('a', 'DT'),
  ('forward', 'NN'),
  ('and', 'CC'),
  ('serves', 'NNS'),
  ('as', 'IN'),
  ('captain', 'NN'),
  ('for', 'IN'),
  ('Portugal', 'NNP'),
  ('.', '.')], ...]

# Task 2
Write a function that will take the list of tokens with POS tags for each sentence and returns the named entities (NE). 


In [6]:
# Taking in input of the pos_tags of the document sent. Forming the list of named_entities. The chunk function has binary
# option Put to true since only named entities are extracted. For all the subtrees of the postags chunks, I checked for the
# label NE(named entity) and printing each of leaves of the subtrees

def named_entity_finding(pos_sent):
    tree = nltk.ne_chunk(pos_sent, binary=True)
    named_entities = []
    for subtree in tree.subtrees():
        if subtree.label() == 'NE':
            entity = ""
            for leaf in subtree.leaves():
                entity = entity + leaf[0] + " "
            named_entities.append(entity.strip())
    return named_entities


named_entity_finding(pos_sent[0])

['Cristiano Ronaldo',
 'Santos Aveiro',
 'ComM',
 'GOIH',
 'Portuguese',
 'Spanish',
 'Real Madrid',
 'Portugal']

Expected output ['Cristiano Ronaldo',
 'Santos Aveiro',
 'ComM',
 'GOIH',
 'Portuguese',
 'Portuguese',
 'Spanish',
 'Real Madrid',
 'Portugal']

# Task 3
Now use the named_entity_finding() function to extract all NEs for each document.


In [7]:
# We now use the named_entity function and get its contents into a list, which we then flatten or unlist into a single list

def NE_flat_list_fn(pos_sents): 
    NE=[]
    for pos_sent in pos_sents:
        NE.append(named_entity_finding(pos_sent))
        NE_flat_list = [item for sublist in NE for item in sublist]
        #Single line code here. Call the funtion named_entity_finding(pos_sent) and 
                #append the result to the NE list
    #Single line code here. Flatten the list of lists to the single list NE_flat_list
    return NE_flat_list




In [8]:
NE_flat_list_fn(pos_sent)

['Cristiano Ronaldo',
 'Santos Aveiro',
 'ComM',
 'GOIH',
 'Portuguese',
 'Spanish',
 'Real Madrid',
 'Portugal',
 'Portugal',
 'Ballon',
 'FIFA',
 'FIFA Ballon',
 'Ronaldo',
 'Ronaldo',
 'Portuguese',
 'Portuguese Football Federation',
 'European Golden Shoe',
 'ESPN',
 'Ronaldo',
 'Manchester United',
 'England',
 'United',
 'UEFA Champions League',
 'FIFA Club',
 'Ballon',
 'FIFA',
 'Manchester United',
 'Madrid',
 'Spain',
 'Ronaldo',
 'UEFA Champions League',
 'Ronaldo',
 'La Liga',
 'Ronaldo',
 'UEFA Champions League',
 'Real Madrid',
 'La Liga',
 'Lionel Messi',
 'Ronaldo',
 'Portugal',
 'Portugal',
 'European',
 'FIFA World Cups',
 'Portuguese',
 'Portugal',
 'Portugal',
 'Portugal',
 'Ronaldo',
 'UEFA European',
 'European',
 'Michel Platini',
 'Ronaldo',
 'Portugal',
 'France',
 'Silver Boot']

# Task 4 

Write functions to extract the name of the player, country of origin and date of birth as well as the following relations: team(s) of the player and position(s) of the player.


In [9]:
# There are 2 ways I found to extract name of the player. One is defining the position of born and extracting all charcaters 
# before it since the name of all players in all the documents occurs till the born word.
## Another way is to use chunking function with binary option false and extracting all leaves of the subtrees whose label()
## is person and the post_tag is NNP.

def name_of_the_player(doc):
    name = doc[0:doc.index("(born")]
    return name

def name_player_second_way(doc):
    sentt = nltk.ne_chunk(doc, binary = False)
    named_entities = []
    for subtree in sentt.subtrees():
        if subtree.label() == 'PERSON':
            entity = ""
            for leaf in subtree.leaves():
                print(leaf)
                entity = entity + leaf[0] + " "
            named_entities.append(entity.strip())
    return named_entities


# In country of origin, there are 2 ways to do it that I found. Firstly, using functions find() and index(); I first look 
# for the occurrence of word professional and extract the country of the player from the first occurence of "is a" to the 
# word "footballer". Since in some documnets the word professional exists and if some it doesn't; if the word occurs; I've 
# subtracted its length otherwise not.
def country_of_origin(doc):
    exists_professional = doc.find("professional")
    if(exists_professional > 0):
        country = doc[doc.find("is a") + 5 : doc.index("footballer") - len("  professional")]
    else:
        country = doc[doc.find("is a") + 5 : doc.index("footballer")]
    return country

# In this method, I'm looking for the occurrence of label GPE in the subtree labels after chunking of the pos_tags 
# given to the document. The label of the subtree has to be GPE and the second leaf in each subtree must have postag of NNP
def country_second_way(doc):
    sentt = nltk.ne_chunk(doc, binary = False)
#     print(sentt)
    country = []
    for subtree in sentt.subtrees():
#         print (subtree.leaves())
        if subtree.label() == 'GPE':
            entity = ""
            for leaf in subtree.leaves():
                if leaf[1] == 'NNP':
                    entity = entity + leaf[0] + " "
                    country.append(entity.strip())
    
    return country

# For date of birth of a person, we just need to look for the position of the word born and the index of the ending 
# bracket since all date of birth in this case are in brackets
def date_of_birth(doc):
    date = doc[doc.find("born") + len("born ") : doc.index(")")]
    return date

# In team of the player, again 2 ways of doing this. One is by using regex to find position of national team and 
# extracting the word before it
def team_of_the_player(doc):
    team = re.findall(r'(?i)((?:\w+\s)?{0}(?:\s\w+)?)'.format('national team'), doc)
    return team

# In the second method, I am chunking the documents and finding if the subtree label is organization and the pos tag of the
# leaf of each subtree label is NNP
def team_another_way(doc):
    sentt = nltk.ne_chunk(doc, binary = False)
    team = []
    for subtree in sentt.subtrees():
        if subtree.label() == 'ORGANIZATION' and len(subtree) == 1:
            entity = ""
            for leaf in subtree.leaves():
                if leaf[1] == 'NNP':
                    entity = entity + leaf[0] + " "
                    team.append(entity.strip())
        elif subtree.label() == 'ORGANIZATION' and len(subtree) > 1:
            team.append(subtree[0][0] + " " + subtree[1][0])            
    
    return team


# For position, I defined a regex where I'm comparing each documnets words to the positions a footballer can occupy in 
# context of the documnet given. If any of the terms match, it is returned and if the same term is mentioned more than once, 
# a set is taken to remove duplicates; converted to a list which is converted to string for jsonld output
def position_of_the_player(doc):
    position = re.findall("forward|\w* midfielder|captain|striker|\w* winger|outfield", doc)
    pos1= list(set(position))
    pos1 = ','.join(pos1)
    return pos1


## Testing out the functions above


In [10]:
# Checking name of the player
name_of_the_player(list_of_doc[0])

'Cristiano Ronaldo dos Santos Aveiro, ComM, GOIH '

In [11]:
# Checking name of the player by the second method
name_player_second_way(ie_preprocess(list_of_doc[1])[0])

('Lionel', 'NNP')
('Andrés', 'NNP')


['Lionel', 'Andrés']

In [12]:
# checking for country of player
country_of_origin(list_of_doc[1])

' Argentine'

In [13]:
# Checking for country of the player via the second method
country_second_way(ie_preprocess(list_of_doc[0])[0])

['Portugal']

In [14]:
# Finding team of player
team_of_the_player(list_of_doc[0])

['Portugal national team']

In [15]:
# Team of the player using pos tags and chunking
team_another_way(ie_preprocess(list_of_doc[1])[0])

['FC Barcelona']

In [16]:
# getting player position
position_of_the_player(list_of_doc[6])
# list_of_doc[4]

'captain,outfield,right winger'

Execute the below command to check your fuction


In [17]:
date_of_birth(list_of_doc[2]) 

'5 February 1992'

Expected output '5 February 1992'

# Task 5 

Write a function using the outputs from the previous functions to generate JSON-LD output as follows.

Reference: https://json-ld.org/primer/latest/

{ "@id": "http://my-soccer-ontology.com/footballer/name_of_the_player",

    "name": "",
    "born": "",
    "country": "",
    "position": [
        { "@id": "http://my-soccer-ontology.com/position",
            "type": ""
        }
     ]   
     "team": [
        { "@id": "http://my-soccer-ontology.com/team",
            "name": ""
        }   
     ]
}


In [18]:

# arg1=name
# arg2=country

# arg3=born

# arg4=team

# arg5=position
 

def generate_jsonld(arg1,arg2,arg3,arg4,arg5):
    #json data
    doc = {
        

        "@id": "http://my-soccer-ontology.com/footballer/",
        "http://schema.org/name": unidecode.unidecode(arg1),
        "http://schema.org/country": arg2,    
        "http://schema.org/born": arg3,
        "http://schema.org/team":{"@id":"http://schema.org/team", "@type": arg4},
        "http://schema.org/position":{"@id":"http://schema.org/position", "@type":arg5}
    }

    
    #Interpretation of doc (context)
    context = {

        "name": "http://schema.org/name",
        "country": "http://schema.org/country",
        "born": "http://schema.org/born",
        "team": {"@id": "http://schema.org/team", 
                 "@type": "@id"},
        "position":{"@id":"http://schema.org/position",
                    "@type":"@id"}

        }
    #On the basis of context compacting the document
    compacted = jsonld.compact(doc, context)
    compacted.pop("@context")
    return json.dumps(compacted, indent=2)






for i in range(0,len(list_of_doc)):

    #extracting player information like name,nationality,date of birth,team,position, and awards.
    arg1=name_of_the_player(list_of_doc[i])

    arg2=country_of_origin(list_of_doc[i])

    arg3=date_of_birth(list_of_doc[i])

    arg4=team_of_the_player(list_of_doc[i])

    arg5=position_of_the_player(list_of_doc[i])
    #printing the generated json-ld
    print(generate_jsonld(arg1,arg2,arg3,arg4,arg5))

 

 

{
  "@id": "http://my-soccer-ontology.com/footballer/",
  "country": "Portuguese",
  "born": "5 February 1985",
  "name": "Cristiano Ronaldo dos Santos Aveiro, ComM, GOIH ",
  "team": {
    "@id": "http://schema.org/team",
    "@type": "/Portugal national team"
  },
  "position": {
    "@id": "http://schema.org/position",
    "@type": "/captain,forward"
  }
}
{
  "@id": "http://my-soccer-ontology.com/footballer/",
  "country": " Argentine",
  "born": "24 June 1987",
  "name": "Lionel Andres \"Leo\" Messi ",
  "team": {
    "@id": "http://schema.org/team",
    "@type": "/Argentina national team"
  },
  "position": {
    "@id": "http://schema.org/position",
    "@type": "/captain,forward"
  }
}
{
  "@id": "http://my-soccer-ontology.com/footballer/",
  "country": "Brazilian",
  "born": "5 February 1992",
  "name": "Neymar da Silva Santos Junior ",
  "team": {
    "@id": "http://schema.org/team",
    "@type": [
      "/Brazil national team",
      "/his national team"
    ]
  },
  "positio

# Task 6 
Identify one other relation (besides team and player) and write a function to extract this. Also extend the JSON-LD output accordingly.

### Extract awards from the text
I wanted to extract the awards won by a footballer in his career. I am extracting the sentences which have words like awards to provide context of the sentences. having the awards won by the footballers in their career.

In the regex, I am extracting the sentences with word 'award' and ending with fullstop(.)

In [19]:
import re
def awards(doc):#this function returns a list containing (if at all) awards achieved by the player.
    list1=[]
    list2=[]
    #searches for texts containing awards
    awards_sent = re.findall(r'[^.]* award[^.]*\.', doc)
    #appends to list1 the contents of the sentence containing the words after "won"
    [list1.append(re.findall(r'won (.*?) (award|awards)',awards_sent[i])) for i in range(len(awards_sent))]
    #appends to list2 all non-null list elements
    [list2.append(list1[i][0][0]) for i in range(len(list1)) if(len(list1[i])!=0)]
    #returns the list2
    return list2

In [20]:
awards(list_of_doc[1])

["the Ballon d'Or and FIFA World Player of the Year"]

## Extend the json accordingly:

Now, we extend the json with award instances

In [21]:
def generate_jsonld(arg1,arg2,arg3,arg4,arg5,arg6):
    
    doc = {
        
        #json data
        "@id": "http://my-soccer-ontology.com/footballer/",
        "http://schema.org/name": unidecode.unidecode(arg1),
        "http://schema.org/country": arg2,    
        "http://schema.org/born": arg3,
        "http://schema.org/team":{"@id":"http://schema.org/team", "@type": arg4},
        "http://schema.org/position":{"@id":"http://schema.org/position", "@type":arg5},
        "http://schema.org/award":{"@id" : "http://schema.org/award", "@type" :arg6}
    }

    

    context = {
        #interpretation of json data
        "name": "http://schema.org/name",
        "country": "http://schema.org/country",
        "born": "http://schema.org/born",
        "team": {"@id": "http://schema.org/team", 
                 "@type": "@id"},
        "position":{"@id":"http://schema.org/position",
                    "@type":"@id"},
        "award" : {"@id": "http://schema.org/award",
                   "@type": "@id"}

        }
    #On the basis of context compacting the document
    compacted = jsonld.compact(doc, context)
    compacted.pop("@context")
    return json.dumps(compacted, indent=2)


for i in range(0,len(list_of_doc)):

    #extracting player information like name,nationality,date of birth,team,position, and awards.
    arg1=name_of_the_player(list_of_doc[i])

    arg2=country_of_origin(list_of_doc[i])

    arg3=date_of_birth(list_of_doc[i])

    arg4=team_of_the_player(list_of_doc[i])

    arg5=position_of_the_player(list_of_doc[i])
    
    arg6 = awards(list_of_doc[i])
    #printing the generated json-ld
    print(generate_jsonld(arg1,arg2,arg3,arg4,arg5,arg6))


{
  "@id": "http://my-soccer-ontology.com/footballer/",
  "country": "Portuguese",
  "born": "5 February 1985",
  "name": "Cristiano Ronaldo dos Santos Aveiro, ComM, GOIH ",
  "team": {
    "@id": "http://schema.org/team",
    "@type": "/Portugal national team"
  },
  "position": {
    "@id": "http://schema.org/position",
    "@type": "/captain,forward"
  },
  "award": {
    "@id": "http://schema.org/award",
    "@type": "/his first Ballon d'Or and FIFA World Player of the Year"
  }
}
{
  "@id": "http://my-soccer-ontology.com/footballer/",
  "country": " Argentine",
  "born": "24 June 1987",
  "name": "Lionel Andres \"Leo\" Messi ",
  "team": {
    "@id": "http://schema.org/team",
    "@type": "/Argentina national team"
  },
  "position": {
    "@id": "http://schema.org/position",
    "@type": "/captain,forward"
  },
  "award": {
    "@id": "http://schema.org/award",
    "@type": "/the Ballon d'Or and FIFA World Player of the Year"
  }
}
{
  "@id": "http://my-soccer-ontology.com/footba