In [2]:
### TOKENIZATION

In [3]:
import nltk
from nltk import regexp_tokenize

text = 'That U.S.A. poster-print costs $12.40...'

pattern = '[a-zA-Z0-9_]+'
tokens = regexp_tokenize(text, pattern)
print(len(tokens))
print(tokens)

9
['That', 'U', 'S', 'A', 'poster', 'print', 'costs', '12', '40']


In [9]:
pattern = r'''(?x)           # set flag to allow verbose regexps
        (?:[A-Z]\.)+         # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*       # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
        | \.\.\.             # ellipsis
        | [][.,;"'?():-_`]   # these are separate tokens; includes ], [
        '''

tokens = regexp_tokenize(text, pattern)
print(len(tokens))
print(tokens)

6
['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']


In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/eduribeiro/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
from nltk import word_tokenize

text = 'That U.S.A. poster-print costs $12.40...'
tokens = word_tokenize(text)

print(len(tokens))
print(tokens)

7
['That', 'U.S.A.', 'poster-print', 'costs', '$', '12.40', '...']


In [11]:
from urllib import request

url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')

print(len(raw))
print(raw[:75])

tokens = word_tokenize(raw)

print(len(tokens))
print(tokens[2])

1176967
﻿The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky
257085
Gutenberg


In [15]:
import os

s = input("Enter some text: ")
tokens = word_tokenize(s)

print("You typed", len(tokens), "words: ", tokens)

Enter some text:  U.S. is awful. Let's face it.


You typed 9 words:  ['U.S.', 'is', 'awful', '.', 'Let', "'s", 'face', 'it', '.']


In [None]:
### STEMMING AND LEMMATIZATION

In [16]:
from nltk.stem import PorterStemmer

# Init the Porter Stemmer
porter = PorterStemmer()

sentence = "I am finding my way into AI, and I am really enjoying it. I find NLP to be quite enjoyable."

# Tokenize: Split the text into words
word_list = nltk.word_tokenize(sentence)

print(word_list)
print(len(set(word_list)))

['I', 'am', 'finding', 'my', 'way', 'into', 'AI', ',', 'and', 'I', 'am', 'really', 'enjoying', 'it', '.', 'I', 'find', 'NLP', 'to', 'be', 'quite', 'enjoyable', '.']
19


In [17]:
# Stem list of words and join
stemmed_output = ' '.join([porter.stem(w) for w in word_list])

print(stemmed_output)
stemmed_word_list = nltk.word_tokenize(stemmed_output)
print(len(set(stemmed_word_list)))

I am find my way into AI , and I am realli enjoy it . I find nlp to be quit enjoy .
17


In [19]:
# Portuguese stemmer
nltk.download('rslp')
stemmer = nltk.stem.RSLPStemmer()
sentence = "Estou mesmo a gostar desta unidade curricular de IA."
word_list = nltk.word_tokenize(sentence)

print(word_list)

stemmed_output = ' '.join([stemmer.stem(w) for w in word_list])

print(stemmed_output)

[nltk_data] Downloading package rslp to /Users/eduribeiro/nltk_data...


['Estou', 'mesmo', 'a', 'gostar', 'desta', 'unidade', 'curricular', 'de', 'IA', '.']
est mesm a gost dest unidad curricul de ia .


[nltk_data]   Unzipping stemmers/rslp.zip.


In [None]:
from nltk.stem import WordNetLemmatizer 

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

sentence = "I am really enjoying these AI classes."

# Tokenize: Split the sentence into words
word_list = nltk.word_tokenize(sentence)

print(word_list)

# Lemmatize list of words and join
lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])

print(lemmatized_output)

In [None]:
### SENTENCE SEGMENTATION

In [None]:
from nltk.tokenize import sent_tokenize

text = "Hello. Are you Mr. Smith? Just to let you know that I have finished my M.Sc. and Ph.D. on AI. I loved it!"
sentences = sent_tokenize(text)

print(sentences)
print(len(sent_tokenize(text)))