## tokenization with nltk and spacy

In [7]:
import pandas as pd
import re
import nltk
import spacy

In [6]:
import re
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.strip().lower()

In [36]:
sample_text = "Barak Obama was born in Hawaii"

In [15]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

sentence_nltk= sent_tokenize(sample_text)
words_nltk=word_tokenize(sample_text)
print(sentence_nltk)
print(words_nltk)

['This handy tool helps you create dummy text for all your layout needs.', 'We are gradually adding new functionality and we welcome your suggestions and feedback.']
['This', 'handy', 'tool', 'helps', 'you', 'create', 'dummy', 'text', 'for', 'all', 'your', 'layout', 'needs', '.', 'We', 'are', 'gradually', 'adding', 'new', 'functionality', 'and', 'we', 'welcome', 'your', 'suggestions', 'and', 'feedback', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(sample_text)


sentences = [sent.text for sent in doc.sents]
tokens = [token.text for token in doc]

print(f"Spacy Sentences: {sentences}")
print(f"Spacy Words: {tokens}")

Spacy Sentences: ['This handy tool helps you create dummy text for all your layout needs.', 'We are gradually adding new functionality and we welcome your suggestions and feedback.']
Spacy Words: ['This', 'handy', 'tool', 'helps', 'you', 'create', 'dummy', 'text', 'for', 'all', 'your', 'layout', 'needs', '.', 'We', 'are', 'gradually', 'adding', 'new', 'functionality', 'and', 'we', 'welcome', 'your', 'suggestions', 'and', 'feedback', '.']


## preparation

In [37]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
filtered_nltk = [word for word in words_nltk if word.lower() not in stopwords and word.isalpha]
print(filtered_nltk)

['handy', 'tool', 'helps', 'create', 'dummy', 'text', 'layout', 'needs', '.', 'gradually', 'adding', 'new', 'functionality', 'welcome', 'suggestions', 'feedback', '.']


In [20]:
filtered_spacy = [token.text for token in doc if token.text.lower() not in stopwords and token.is_alpha]
print(filtered_spacy)

['handy', 'tool', 'helps', 'create', 'dummy', 'text', 'layout', 'needs', 'gradually', 'adding', 'new', 'functionality', 'welcome', 'suggestions', 'feedback']


## 3

In [24]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')
pos_tags_nltk=nltk.pos_tag(filtered_nltk)
print(pos_tags_nltk)

[('handy', 'JJ'), ('tool', 'NN'), ('helps', 'VBZ'), ('create', 'VB'), ('dummy', 'JJ'), ('text', 'NN'), ('layout', 'NN'), ('needs', 'NNS'), ('.', '.'), ('gradually', 'RB'), ('adding', 'VBG'), ('new', 'JJ'), ('functionality', 'NN'), ('welcome', 'JJ'), ('suggestions', 'NNS'), ('feedback', 'NN'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [25]:
for token in doc[:20]:
  print(token.text, token.pos_)

This DET
handy ADJ
tool NOUN
helps VERB
you PRON
create VERB
dummy ADJ
text NOUN
for ADP
all PRON
your PRON
layout ADJ
needs NOUN
. PUNCT
We PRON
are AUX
gradually ADV
adding VERB
new ADJ
functionality NOUN


##4

In [38]:
import nltk
nltk.download('maxent_ne_chunker')
sentence = "Barak Obama was born in Hawaii"
tokens = word_tokenize(sentence)
pos_tags = nltk.pos_tag(tokens)
chunk=nltk.ne_chunk(pos_tags)

for subtree in chunk:
  if hasattr(subtree, 'label'):
    print(subtree, subtree.label())

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


(PERSON Barak/NNP) PERSON
(PERSON Obama/NNP) PERSON
(GPE Hawaii/NNP) GPE


In [40]:
doc = nlp(sentence)

In [41]:
for ent in doc.ents:
  print(ent.text, ent.label_)

Barak Obama PERSON
Hawaii GPE
