In [2]:
import nltk
# Download the WordNet corpus; the WordNetLemmatizer cells further down
# look words up in it.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...


True

## Tokenization

In [3]:
# Sample sentence containing a possessive ('s), a contraction (n't) and a
# comma, so the tokenizers below can be compared on the same input.
text = "This is Andrew's text, isn't it"

In [4]:
# Whitespace tokenization: punctuation stays attached to the neighbouring
# word (e.g. 'text,' and "Andrew's" come out as single tokens).
tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokenizer.tokenize(text)

['This', 'is', "Andrew's", 'text,', "isn't", 'it']

In [6]:
# Word/punctuation tokenization: punctuation is split off into its own
# tokens, so "Andrew's" becomes 'Andrew', "'", 's' and "isn't" becomes
# 'isn', "'", 't'.
tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenizer.tokenize(text)

['This', 'is', 'Andrew', "'", 's', 'text', ',', 'isn', "'", 't', 'it']

In [8]:
# Treebank tokenization: the clitics are kept as units of their own
# ("'s" and "n't"), while the comma is still separated from 'text'.
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokenizer.tokenize(text)

['This', 'is', 'Andrew', "'s", 'text', ',', 'is', "n't", 'it']

# Token Normalization
## Stemming

In [12]:
# New sample input for the normalization demos: two irregular plurals
# (feet, wolves), a regular plural (cats) and a past-tense verb (talked).
# Note: this rebinds `text` and `tokenizer` from the tokenization section.
text = "feet wolves cats talked"
tokenizer = nltk.tokenize.TreebankWordTokenizer()
# `tokens` is reused by the stemming and lemmatization cells that follow.
tokens = tokenizer.tokenize(text)
tokens

['feet', 'wolves', 'cats', 'talked']

In [11]:
# Run the Porter stemmer over every token. The results need not be real
# words (see 'wolv'), and the irregular plural 'feet' is left unchanged.
stemmer = nltk.stem.PorterStemmer()
list(map(stemmer.stem, tokens))

['feet', 'wolv', 'cat', 'talk']

## Lemmatization

In [14]:
# Lemmatize every token with the WordNet lemmatizer using its default
# arguments. The plurals are resolved ('feet' -> 'foot', 'wolves' -> 'wolf'),
# while 'talked' comes back unchanged.
lemmatizer = nltk.stem.WordNetLemmatizer()
list(map(lemmatizer.lemmatize, tokens))

['foot', 'wolf', 'cat', 'talked']

In [23]:
# Pipeline A: lemmatize first, then stem the resulting lemmas.
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()

lemmas = list(map(lemmatizer.lemmatize, tokens))
print("first step; lemmatization:", lemmas)
print("Second step; Stemming:", list(map(stemmer.stem, lemmas)))

first step; lemmatization: ['foot', 'wolf', 'cat', 'talked']
Second step; Stemming: ['foot', 'wolf', 'cat', 'talk']


In [50]:
# Pipeline B: stem first, then lemmatize the stems. A stem that is not a
# dictionary word (e.g. 'wolv') passes through the lemmatizer unchanged.
stemmer = nltk.stem.PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

stems = list(map(stemmer.stem, tokens))
print("First step; Stemming:", stems)
lemmas = list(map(lemmatizer.lemmatize, stems))
print("Second step; lemmatization:", lemmas)


First step; Stemming: ['feet', 'wolv', 'cat', 'talk']
Second step; lemmatization: ['foot', 'wolv', 'cat', 'talk']


___
# TF-IDF example

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [47]:
# Toy review corpus for the TF-IDF demo, one document per entry.
# Casing is mixed and every string is kept verbatim.
texts = [
    "Good movie",
    "not a good Movie",
    "did not like",
    "i like it",
    "good one",
    "gOOd Movie",
    "i didn't like",
    "i did",
    "i did like",
    "Not what expectes",
]

In [48]:
# Configure the vectorizer:
#   ngram_range=(1, 2) -> use both unigrams and bigrams as features
#   min_df=2           -> drop terms found in fewer than 2 documents
#   max_df=0.5         -> drop terms found in more than half of the documents
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.5,
)

# Learn the vocabulary / IDF weights and vectorize the corpus in one step;
# the result is a sparse document-term matrix.
features = tfidf.fit_transform(texts)

In [49]:
# Show the TF-IDF matrix as a table: one row per document, one column per
# retained term.
#
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and only fall back to the old method on
# older installations.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# `toarray()` yields a plain ndarray (`todense()` returns the legacy
# np.matrix type); the resulting frame holds the same values.
pd.DataFrame(features.toarray(), columns=feature_names)



Unnamed: 0,did,good,good movie,like,movie,not
0,0.0,0.532231,0.598636,0.0,0.598636,0.0
1,0.0,0.456658,0.513635,0.0,0.513635,0.513635
2,0.598636,0.0,0.0,0.532231,0.0,0.598636
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0
5,0.0,0.532231,0.598636,0.0,0.598636,0.0
6,0.0,0.0,0.0,1.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0
8,0.747341,0.0,0.0,0.66444,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,1.0
