## Chapter 10

In [3]:
import nltk
import nltk.data
# Tokenizer models used by word_tokenize and TextBlob(...).words.
# Older NLTK releases load 'punkt'; NLTK 3.9+ loads 'punkt_tab' instead,
# so fetch both to keep the notebook runnable on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

from textblob import TextBlob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\qorud\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 1.1. Tokenization

In [4]:
from nltk.tokenize import word_tokenize

# Sample sentence to split into word-level tokens
text = 'This is a tokenize test'

# The bare last expression is shown as the cell's result
word_tokenize(text)

['This', 'is', 'a', 'tokenize', 'test']

In [5]:
# Tokenize the same sentence through TextBlob's word accessor
tokenized_blob = TextBlob(text)
tokenized_blob.words

WordList(['This', 'is', 'a', 'tokenize', 'test'])

### 1.2. Stop words removal

In [6]:
# Fetch NLTK's stop-word corpus, which stopwords.words('english') reads in the next cell
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qorud\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sentence from which the English stop words will be filtered out
text = "S&P and NASDAQ are the two most popular indices in US"

# Keep the stop-word list as a set for the membership tests below
stop_words = set(stopwords.words('english'))
text_tokens = word_tokenize(text)

# The comparison is case-sensitive: tokens are matched exactly as tokenized
tokens_without_sw = [token for token in text_tokens if token not in stop_words]
print(tokens_without_sw)

['S', '&', 'P', 'NASDAQ', 'two', 'popular', 'indices', 'US']


### 1.3. Stemming

In [8]:
from nltk.stem.snowball import SnowballStemmer

text = "It's a Stemming testing"
parsed_text = word_tokenize(text)

# Initialize stemmer
stemmer = SnowballStemmer('english')

# Stem every token exactly once, then report only the tokens whose stem
# differs from their lower-cased form (a pure case change is not reported).
stems = [stemmer.stem(word) for word in parsed_text]
[(word, stem) for word, stem in zip(parsed_text, stems) if word.lower() != stem]

[('Stemming', 'stem'), ('testing', 'test')]

In [9]:
from nltk.stem.snowball import SnowballStemmer

# Same stemming check on a sentence containing an irregular form ("better")
text = "This is better"
parsed_text = word_tokenize(text)

# Initialize stemmer
stemmer = SnowballStemmer('english')

# Stem every token exactly once, then report only the tokens whose stem
# differs from their lower-cased form (a pure case change is not reported).
stems = [stemmer.stem(word) for word in parsed_text]
[(word, stem) for word, stem in zip(parsed_text, stems) if word.lower() != stem]

[]

### 1.4. Lemmatization

In [10]:
# Fetch the WordNet corpus; presumably the data WordNetLemmatizer relies on in the next cell
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\qorud\AppData\Roaming\nltk_data...


True

In [11]:
from nltk.stem import WordNetLemmatizer

text = "This is better"
parsed_data = word_tokenize(text)
lemmatizer = WordNetLemmatizer()


def changed_lemmas(tokens, pos, lemmatizer=lemmatizer):
    """Return (token, lemma) pairs for the tokens whose lemma differs from the token.

    tokens: iterable of word strings to lemmatize.
    pos: part-of-speech code passed to ``lemmatizer.lemmatize`` ("v", "a", ...).
    lemmatizer: object exposing ``lemmatize(word, pos=...)``; defaults to the
        WordNetLemmatizer created above.
    """
    # Lemmatize each token once and reuse the result for both the filter and the output
    pairs = ((token, lemmatizer.lemmatize(token, pos=pos)) for token in tokens)
    return [(token, lemma) for token, lemma in pairs if token != lemma]


# Lemmatize the same tokens once as verbs and once as adjectives
verb = changed_lemmas(parsed_data, "v")
adjective = changed_lemmas(parsed_data, "a")

verb + adjective

[('is', 'be'), ('better', 'good')]

### 1.5. PoS tagging

In [12]:
# Model behind the part-of-speech tags used in the next cell.
# NLTK 3.9+ loads 'averaged_perceptron_tagger_eng'; older releases load
# 'averaged_perceptron_tagger'. Fetch both so tagging works on either version.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\qorud\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [13]:
# Sentence to annotate with part-of-speech tags
text = 'Google is looking at buying U.K. startup for $1 billion'

# .tags gives (token, tag) pairs for the sentence
tagged_blob = TextBlob(text)
tagged_blob.tags

[('Google', 'NNP'),
 ('is', 'VBZ'),
 ('looking', 'VBG'),
 ('at', 'IN'),
 ('buying', 'VBG'),
 ('U.K.', 'NNP'),
 ('startup', 'NN'),
 ('for', 'IN'),
 ('1', 'CD'),
 ('billion', 'CD')]