In [17]:
import nltk

## Tokenization

#### Word Tokenization

In [18]:
from nltk.tokenize import word_tokenize
# Split one sentence into word-level tokens. Punctuation comes out as separate
# tokens, which is why ',' and '?' are their own items in the printed list.
# NOTE(review): no nltk.download(...) for tokenizer data appears before this
# cell; presumably the data was already installed locally — confirm for a
# fresh environment.
text = "Hello, how are you?"
tokens = word_tokenize(text)
print(tokens)

['Hello', ',', 'how', 'are', 'you', '?']


#### Sentence Tokenization

In [19]:
from nltk.tokenize import sent_tokenize
# Split a short paragraph into sentences. Each sentence keeps its closing
# punctuation, and the apostrophe inside "you're" does not split the sentence.
text = "Hello there. How are you doing? I hope you're well."
sentences = sent_tokenize(text)
print(sentences)

['Hello there.', 'How are you doing?', "I hope you're well."]


#### Regexp Tokenizer

In [20]:
from nltk.tokenize import regexp_tokenize
# Tokenize with a custom regular expression instead of a built-in tokenizer.
# Every match of the pattern becomes one token, so the e-mail address stays
# in one piece rather than being split at '@' or '.'.
text = "Email me at test@example.com"
tokens = regexp_tokenize(text,pattern=r'\S+') # \S+ = one or more non-whitespace characters
print(tokens)

['Email', 'me', 'at', 'test@example.com']


## Stop Words

Stopwords are common words that carry little meaningful information, such as "is", "the", "and", "in", and "to". They're often removed during text preprocessing.

#### Import and Download StopWords

In [21]:
from nltk.corpus import stopwords
# Download the stopwords corpus into the local nltk_data directory.
# The call is the last expression of the cell, so its return value is shown
# as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amann\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Get Stopwords List

In [22]:
# Collect the English stopwords into a set; it is used for `in` membership
# checks when filtering tokens.
# A set is unordered, so the order of the printed words is arbitrary.
stop_words = set(stopwords.words('english'))
print(stop_words)

{'hers', 'mightn', 'shan', 'than', 'nor', 'you', 'ourselves', "couldn't", 'did', 'above', 'why', 'other', 'because', 'once', 've', 'it', 'there', "don't", "i'm", 'between', 'its', 'yourself', 'more', 'does', 'or', 'that', 'itself', 'an', 'ma', "i'll", 'him', 'how', 'they', "didn't", 'those', 'can', 'on', "that'll", 'wouldn', 'own', "she's", 'he', 'his', "we'd", "you'd", 'needn', 'them', 'over', 'wasn', "i've", "we've", 'don', "wouldn't", 'were', 'was', 'being', 'out', 'through', 'while', 'only', 'herself', 'we', 'and', "he'll", 'doesn', 'having', 'but', 'your', 'won', 'didn', 'by', "he'd", "should've", 'whom', 'when', 'some', 'too', 'my', 'very', "i'd", 'has', 'both', 'be', "they're", "you're", "hasn't", 'this', 'any', 'again', 'doing', 'below', 'a', 'are', "shan't", 'of', "hadn't", "they'd", 'haven', 'about', 'yourselves', 'weren', "aren't", "you'll", "won't", 'before', 'i', "mightn't", 'theirs', "it'd", 'our', 'she', 'couldn', 'no', 'hasn', "we're", 'for', 'such', "shouldn't", 'll', 

#### Remove Stopwords from a Text

In [23]:
text = "This is an example showing off stop word filtering."
words = word_tokenize(text)
# Keep a token only if its lowercase form is not a stopword. Lowercasing makes
# capitalised tokens such as "This" match the lowercase stopword entries.
# Punctuation is not a stopword, so the trailing '.' stays in the result.
filtered = [
    token
    for token in words
    if token.lower() not in stop_words
]
print(filtered)

['example', 'showing', 'stop', 'word', 'filtering', '.']


## Stemming

Stemming is the process of reducing a word to its base or root form (stem), usually by chopping off suffixes.

#### PorterStemmer

In [24]:
from nltk.stem import PorterStemmer

# Stem several inflections of "play" with the Porter algorithm.
# Three of the four collapse to 'play'; 'player' is left unchanged.
stemmer = PorterStemmer()
words = ["playing", "played", "player", "plays"]
stems = list(map(stemmer.stem, words))
print(stems)

['play', 'play', 'player', 'play']


#### SnowballStemmer

In [25]:
from nltk.stem import SnowballStemmer

# Stem several forms of "run" with the English Snowball stemmer.
# Only suffixes are stripped: the irregular past tense 'ran' is not mapped to
# 'run', and 'runner' is left as is.
snowball = SnowballStemmer("english")
words = ["running", "ran", "runs", "runner"]
stems = [
    snowball.stem(form)
    for form in words
]
print(stems)

['run', 'ran', 'run', 'runner']


#### Stemming with regex

In [26]:
from nltk.stem import RegexpStemmer

# A purely pattern-based stemmer: remove a trailing -ing, -ed or -s.
# '-er' is not part of the pattern, so 'player' passes through untouched.
# Note: this rebinds `stemmer`, which held the PorterStemmer in an earlier cell.
stemmer = RegexpStemmer(r'(ing|ed|s)$')
words = ['playing', 'played', 'plays', 'player', 'jumps']
stems = list(map(stemmer.stem, words))
print(stems)

['play', 'play', 'play', 'player', 'jump']


## Lemmatization

Lemmatization also reduces a word to its base form (lemma), but unlike stemming it uses vocabulary and grammar rules (such as the word's part of speech), so the result is a real word.

In [27]:
from nltk.stem import WordNetLemmatizer
# The lemma depends on the part of speech passed via `pos`: the same surface
# form can map to different lemmas, as the four calls below show.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running"))    # → 'running' (no pos given; default POS is noun)
print(lemmatizer.lemmatize("running", pos="v"))  # → 'run' (pos="v": verb; base form of "running")
print(lemmatizer.lemmatize("better", pos="a"))   # → 'good' (pos="a": adjective)
print(lemmatizer.lemmatize("better",pos="r")) # → 'well' (pos="r": adverb)

running
run
good
well


## POS Tagging

In [28]:
from nltk import pos_tag

# Tokenize the sentence, then label every token with a part-of-speech tag.
# The result is a list of (token, tag) pairs, as the printed output shows.
# NOTE(review): in the recorded output 'brown' is tagged 'NN' (noun) although
# it acts as an adjective here — presumably a tagger mistake; do not read the
# tags as ground truth.
text = "The quick brown fox jumps over the lazy dog"
tokens = word_tokenize(text)
tags = pos_tag(tokens)
print(tags)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


## Named Entity Recognition

In [29]:
from nltk import ne_chunk

# Named-entity recognition as a three-stage pipeline: tokens -> POS tags ->
# chunk tree. Entities appear in the tree as labelled subtrees (for example
# PERSON or GPE); all other tokens stay as plain word/tag leaves.
text = "Barack Obama was born in Hawaii and became the President of the United States."

# Step 1 : Tokenize
tokens = word_tokenize(text)

# Step 2 : POS Tagging
pos_tags = pos_tag(tokens)

# Step 3 : Named Entity Chunking
named_entities = ne_chunk(pos_tags)

print(named_entities)

# Show GUI tree.
# draw() opens a separate desktop window, so it cannot work on a kernel that
# has no display or no tkinter installed (remote or headless servers). The text
# tree printed above already carries the result, so a failure here is reported
# and skipped instead of aborting "Run All".
# `Exception` is caught on purpose: the display error type lives in tkinter,
# which may itself be missing, so it cannot be named safely here.
try:
    named_entities.draw()
except Exception as exc:
    print(f"Skipping GUI tree ({type(exc).__name__}: {exc})")

(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE Hawaii/NNP)
  and/CC
  became/VBD
  the/DT
  President/NNP
  of/IN
  the/DT
  (GPE United/NNP States/NNPS)
  ./.)


## Spelling Correction

In [30]:
from textblob import TextBlob
# A sentence with deliberate misspellings to run through TextBlob's spelling
# correction.
incorrect_text = "ceertain conditionas duriing severl ggeneratoins aree moodified in the saame maner"
textBlb = TextBlob(incorrect_text)
# correct() yields a TextBlob holding the corrected text; as the last
# expression of the cell, it is displayed as the cell output.
textBlb.correct()

TextBlob("certain conditions during several generations are modified in the same manner")