In [2]:
from dotenv import load_dotenv
import os

In [41]:
import pandas as pd
import nltk

In [4]:
# Call load_dotenv() first (expected to populate the environment from a .env
# file), then look up the data-directory prefix.
# `path` is None when GLOBAL_PATH is not defined.
load_dotenv()
path = os.environ.get("GLOBAL_PATH")

In [None]:
# Load the corpus CSV from the directory named by GLOBAL_PATH.
# Fail early with a clear message instead of trying to open "Nonebbc-text.csv".
if path is None:
    raise RuntimeError("GLOBAL_PATH is not set; define it in the environment or in the .env file")
# os.path.join builds a valid path whether or not GLOBAL_PATH ends with a
# path separator (plain string concatenation required the trailing separator).
corpus = pd.read_csv(os.path.join(path, 'bbc-text.csv'), encoding='utf-8')
text = corpus['text'].tolist()
# First entry of the 'text' column; the cells below analyse this one string.
# NOTE(review): an entry may be a whole article rather than a single sentence — confirm.
sentence = text[0]


In [19]:
# Tokenise the text and attach a part-of-speech tag to every token.
# Despite its name, `ne_tree` is a flat list of (token, tag) pairs, not a tree.
ne_tree = nltk.pos_tag(nltk.word_tokenize(sentence))
print(ne_tree)

[('John', 'NNP'), ('Smith', 'NNP'), ('works', 'VBZ'), ('at', 'IN'), ('Google', 'NNP'), ('in', 'IN'), ('New', 'NNP'), ('York', 'NNP'), ('.', '.')]


In [18]:
def preprocess(sent):
    """Tokenise a raw string and POS-tag the resulting tokens.

    Args:
        sent: Text to process.

    Returns:
        The result of ``nltk.pos_tag`` on the tokens; the recorded output
        shows a list of ``(token, tag)`` tuples.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

# From here on `sent` is the tagged form of `sentence`; echo it as the cell output.
sent = preprocess(sentence)
sent

[('John', 'NNP'),
 ('Smith', 'NNP'),
 ('works', 'VBZ'),
 ('at', 'IN'),
 ('Google', 'NNP'),
 ('in', 'IN'),
 ('New', 'NNP'),
 ('York', 'NNP'),
 ('.', '.')]

In [20]:
# Chunk grammar: an NP is an optional determiner (DT), any number of
# adjectives (JJ), then one token tagged exactly NN.
# Proper nouns (NNP) are not covered by <NN>, which is why the recorded
# output for the sample text contains no NP chunks.
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
# Apply the grammar to the POS-tagged tokens; tokens that match no rule stay
# directly under the root S node.
cs = cp.parse(sent)
print(cs)

(S
  John/NNP
  Smith/NNP
  works/VBZ
  at/IN
  Google/NNP
  in/IN
  New/NNP
  York/NNP
  ./.)


In [21]:
# Second parser built from the same `pattern` and run on the same `sent`
# as `cp`/`cs`; only the drawing call is new in this cell.
NPChunker = nltk.RegexpParser(pattern)
result = NPChunker.parse(sent)
# NOTE(review): Tree.draw() presumably opens a separate GUI window rather than
# rendering inline — confirm it works in the target environment.
result.draw()

In [22]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

# Flatten the chunk tree into (token, POS tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('John', 'NNP', 'O'),
 ('Smith', 'NNP', 'O'),
 ('works', 'VBZ', 'O'),
 ('at', 'IN', 'O'),
 ('Google', 'NNP', 'O'),
 ('in', 'IN', 'O'),
 ('New', 'NNP', 'O'),
 ('York', 'NNP', 'O'),
 ('.', '.', 'O')]


In [23]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline; `nlp` is used by every spaCy cell below.
nlp = en_core_web_sm.load()

In [24]:
# Run the spaCy pipeline on the text and list each recognised entity
# together with its label.
doc = nlp(sentence)
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('John Smith', 'PERSON'), ('Google', 'ORG'), ('New York', 'GPE')]


In [25]:
# Per-token view: the token, its IOB position flag and its entity type
# (empty string for tokens outside any entity).
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(John, 'B', 'PERSON'),
 (Smith, 'I', 'PERSON'),
 (works, 'O', ''),
 (at, 'O', ''),
 (Google, 'B', 'ORG'),
 (in, 'O', ''),
 (New, 'B', 'GPE'),
 (York, 'I', 'GPE'),
 (., 'O', '')]


In [34]:
# Render the recognised entities inline.
# Reuses the already-parsed `doc` instead of running the whole pipeline on
# `sentence` a second time.
displacy.render(doc, jupyter=True, style='ent')

In [35]:
# Render the dependency parse inline, passing a 'distance' of 120 through
# displacy's options. Reuses the already-parsed `doc` rather than re-running
# the pipeline on `sentence`.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 120})

In [36]:
# (surface form, coarse POS, lemma) for every token that is neither a stop
# word nor punctuation. A single comprehension over the existing `doc`
# replaces the nested comprehension over a freshly re-parsed copy.
[(token.orth_, token.pos_, token.lemma_)
 for token in doc
 if not token.is_stop and token.pos_ != 'PUNCT']

[('John', 'PROPN', 'John'),
 ('Smith', 'PROPN', 'Smith'),
 ('works', 'VERB', 'work'),
 ('Google', 'PROPN', 'Google'),
 ('New', 'PROPN', 'New'),
 ('York', 'PROPN', 'York')]

In [38]:
# Map each entity's text to its label, reusing the already-parsed `doc`.
# Repeated entity texts collapse to a single key (the last label wins).
{ent.text: ent.label_ for ent in doc.ents}

{'John Smith': 'PERSON', 'Google': 'ORG', 'New York': 'GPE'}