### 1. Named Entity Recognition (NER)
Identifies entities in text, such as names, dates, locations, and organizations.

In [1]:
# different spacy and en_core_web_sm versions may lead to different results
# this notebook is made with spaCy version 3.8.3 and en_core_web_sm version 3.8.0
# find your own version in the terminal: python -m spacy info
import spacy 
from spacy import displacy

NER = spacy.load("en_core_web_sm")  # Load SpaCy's small English model
sentence = "This course is lectured by Dr. S. Supraja, and Simon Liu at NTU, Singapore."

doc = NER(sentence)
for entity in doc.ents:
    print(f"{entity.text}[{entity.label_}]")

S. Supraja[PERSON]
Simon Liu[PERSON]
NTU[ORG]
Singapore[GPE]


In [2]:
# use the following command to explore the definition of the given POS tag, dependency label or entity type in spacy
spacy.explain('ORG')

'Companies, agencies, institutions, etc.'

In [3]:
displacy.render(doc, style='ent', jupyter=True)

### 2. Part-of-Speech (POS) Tagging
Assigning grammatical categories (like noun, verb, adjective, etc.) to words in a sentence.

In [4]:
# Process the text for coarse-grained POS tagging
for token in doc:
    print(f"{token.text}[{token.pos_}]")

This[DET]
course[NOUN]
is[AUX]
lectured[VERB]
by[ADP]
Dr.[PROPN]
S.[PROPN]
Supraja[PROPN]
,[PUNCT]
and[CCONJ]
Simon[PROPN]
Liu[PROPN]
at[ADP]
NTU[PROPN]
,[PUNCT]
Singapore[PROPN]
.[PUNCT]


In [5]:
# Process the text for fine-grained POS tagging
for token in doc:
    print(f"{token.text}[{token.tag_}]")

This[DT]
course[NN]
is[VBZ]
lectured[VBN]
by[IN]
Dr.[NNP]
S.[NNP]
Supraja[NNP]
,[,]
and[CC]
Simon[NNP]
Liu[NNP]
at[IN]
NTU[NNP]
,[,]
Singapore[NNP]
.[.]


In [6]:
from nltk import Tree

def tok_format(tok, coarse=False):
    if coarse:
        return "[".join([tok.orth_, tok.pos_]) + "]"
    return "[".join([tok.orth_, tok.tag_]) + "]"

def to_nltk_tree(node, coarse=False):
    if node.n_lefts + node.n_rights > 0:
        return Tree(tok_format(node), [to_nltk_tree(child, coarse=coarse) for child in node.children])
    else:
        return tok_format(node, coarse=coarse)

[to_nltk_tree(sent.root, coarse=True).pretty_print() for sent in doc.sents];
[to_nltk_tree(sent.root).pretty_print() for sent in doc.sents];

                                     lectured[VBN]                                                                                  
    _______________________________________|_________________________________________________________                                
   |       |         |         |           |                       |                              Liu[NNP]                          
   |       |         |         |           |                       |                       __________|________                       
   |       |         |         |           |                     by[IN]                   |                 at[IN]                  
   |       |         |         |           |                       |                      |                   |                      
   |       |         |         |       course[NN]             Supraja[NNP]                |                NTU[NNP]                 
   |       |         |         |           |            __________

### 3. Dependency Parsing
Analyzing the grammatical structure of a sentence to establish relationships between words.

In [7]:
# Process the text for dependency parsing
print('{:<15} | {:<10} | {:<15} | {:<20}'.format('Token','Relation','Head','Children'))
print('-'*70)
for token in doc:
    #Print the token, dependency nature, head and all dependents of the token
    print("{:<15} | {:<10} | {:<15} | {:<20}"
          .format(str(token.text), str(token.dep_), str(token.head.text), str([child for child in token.children])))

Token           | Relation   | Head            | Children            
----------------------------------------------------------------------
This            | det        | course          | []                  
course          | nsubjpass  | lectured        | [This]              
is              | auxpass    | lectured        | []                  
lectured        | ROOT       | lectured        | [course, is, by, ,, and, Liu, .]
by              | agent      | lectured        | [Supraja]           
Dr.             | compound   | Supraja         | []                  
S.              | compound   | Supraja         | []                  
Supraja         | pobj       | by              | [Dr., S.]           
,               | punct      | lectured        | []                  
and             | cc         | lectured        | []                  
Simon           | compound   | Liu             | []                  
Liu             | conj       | lectured        | [Simon, at]         
at     

In [8]:
# use displacy to render the text
displacy.render(doc, style='dep', jupyter=True, options={'distance':120})

### Practice for the week
Perform NER, POS tagging and dependency parser on the following text and observe the results. Refer to https://spacy.io/api for more information.

In [9]:
import pandas as pd
import spacy

raw_text1 = "From 1925 to 1945, Tolkien was the Rawlinson and Bosworth Professor of Anglo-Saxon and a Fellow of Pembroke College, both at the University of Oxford. He then moved within the same university to become the Merton Professor of English Language and Literature and Fellow of Merton College, and held these positions from 1945 until his retirement in 1959. Tolkien was a close friend of C. S. Lewis, a co-member of the informal literary discussion group The Inklings. He was appointed a Commander of the Order of the British Empire by Queen Elizabeth II on 28 March 1972."

raw_text2 = '''
From 1925 to 1945, Tolkien was the Rawlinson and Bosworth Professor of Anglo-Saxon and a Fellow of Pembroke College, both at the University of Oxford. 
He then moved within the same university to become the Merton Professor of English Language and Literature and Fellow of Merton College, and held these positions from 1945 until his retirement in 1959. 
Tolkien was a close friend of C. S. Lewis, a co-member of the informal literary discussion group The Inklings. 
He was appointed a Commander of the Order of the British Empire by Queen Elizabeth II on 28 March 1972.
'''

df = pd.DataFrame([raw_text1, raw_text2], columns=['text'])
print(df)

                                                text
0  From 1925 to 1945, Tolkien was the Rawlinson a...
1  \nFrom 1925 to 1945, Tolkien was the Rawlinson...


In [10]:
# load the small English Model
nlp = spacy.load('en_core_web_sm')

# lists to store tokens and tags
token = []
pos = []

# TODO: continue the codes from here
for sent in nlp.pipe(df['text']):
    pass