# Natural language preprocessing

In [19]:
import nltk

In [20]:
from nltk.corpus import brown

In [21]:
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [22]:
brown.categories()


['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [23]:
print(len(brown.categories()))

15


In [24]:
data = brown.sents(categories = 'adventure')

In [25]:
len(data)

4637

In [26]:
data = brown.sents(categories = 'fiction')

In [27]:
data

[['Thirty-three'], ['Scotty', 'did', 'not', 'go', 'back', 'to', 'school', '.'], ...]

In [28]:
" ".join(data[1])

'Scotty did not go back to school .'

## bag of words pipeline
get the data/corpus

tokenisation, stopword removal

stemming

building a vocab

classification

## 1 Tokenisation & Stopword Removal

In [29]:
document = '''it was a very pleasent day. the weather was cool and there were light showers.
i went to the market to buty some fruits'''

sentence = "Send all the 50 document related to chapters 1,2,3,4 at prateek@cb.com"

In [30]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [31]:
sents = sent_tokenize(document)
print(sents)

['it was a very pleasent day.', 'the weather was cool and there were light showers.', 'i went to the market to buty some fruits']


In [32]:
print(len(sents))


3


In [33]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'document',
 'related',
 'to',
 'chapters',
 '1,2,3,4',
 'at',
 'prateek@cb.com']

In [34]:
word = word_tokenize(sentence)

In [35]:
word

['Send',
 'all',
 'the',
 '50',
 'document',
 'related',
 'to',
 'chapters',
 '1,2,3,4',
 'at',
 'prateek',
 '@',
 'cb.com']

## Stopwords

In [36]:
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))


In [37]:
print(sw)

{'ll', 'won', 'yours', 'against', 'how', "aren't", 'being', "should've", "hadn't", 'mustn', 'yourself', 'after', 'there', 'down', 'at', 'i', 'that', 'who', "shan't", 'while', 'were', 'himself', 'm', 'they', 'because', 'an', 'with', 'each', 'under', 't', "that'll", 'further', 'until', 'then', 'so', 'aren', 'ourselves', 'during', 'has', 'was', 'them', 'those', 'before', 'both', 'no', 'me', 'too', 'nor', "you're", 'above', 'here', 'yourselves', 'hadn', 'between', "she's", "shouldn't", 'weren', "you'd", 'all', 'o', 'ours', 'herself', 'been', 'out', 'in', "haven't", "mustn't", 'as', "wouldn't", 'she', 'whom', 'have', 'hasn', 'these', 'doesn', 'their', 'which', 'theirs', 'only', 'are', "wasn't", "doesn't", 'to', 'shouldn', 'hers', 'what', 'into', 'shan', 'of', 'having', 'wasn', 'myself', 'or', 'about', 'will', 'any', 'very', 've', 'you', 'if', 'wouldn', 'had', 'some', 'needn', 'again', 'for', "you'll", 'ain', "hasn't", 'itself', "weren't", 'on', 'am', "don't", 'isn', 'themselves', 'through',

In [38]:
def remove_stopwords(text,stopwords):
    """Return the tokens of ``text`` that are not stopwords, in order.

    Args:
        text: Iterable of string tokens (e.g. the result of ``str.split``).
        stopwords: Collection of stopwords to drop (e.g. a set of words).

    Returns:
        list: The surviving tokens, in their original order and casing.

    A token is dropped when it is in ``stopwords`` either exactly as written
    or in lower-cased form, so capitalised tokens such as "The" are filtered
    by an all-lowercase stopword list.
    """
    useful_words = [
        w for w in text
        # Exact match keeps the original behaviour; the lower-cased check
        # additionally catches capitalised stopwords.
        if w not in stopwords and w.lower() not in stopwords
    ]
    return useful_words

In [39]:
# Demonstrate stopword removal on a whitespace-tokenised sentence.
# Note: 'not' is in the stopword set, so the negation is removed as well
# and only 'bothered' and 'much' survive.
text = "i am not bothered about her very much".split()
useful_text = remove_stopwords(text,sw)
print(useful_text)

['bothered', 'much']


In [40]:
'not' in sw

True

## Tokenization using Regular Expression 

In [41]:
sentence = "Send all the 50 document related to chapters 1,2,3,4 at prateek@cb.com"
from nltk.tokenize import RegexpTokenizer

In [42]:
# The character class has no quantifier, so the pattern matches one letter at
# a time: every letter becomes its own token, and digits, punctuation and
# spaces are skipped.
tokenizer = RegexpTokenizer('[a-zA-Z]')
useful_text = tokenizer.tokenize(sentence)

In [43]:
useful_text

['S',
 'e',
 'n',
 'd',
 'a',
 'l',
 'l',
 't',
 'h',
 'e',
 'd',
 'o',
 'c',
 'u',
 'm',
 'e',
 'n',
 't',
 'r',
 'e',
 'l',
 'a',
 't',
 'e',
 'd',
 't',
 'o',
 'c',
 'h',
 'a',
 'p',
 't',
 'e',
 'r',
 's',
 'a',
 't',
 'p',
 'r',
 'a',
 't',
 'e',
 'e',
 'k',
 'c',
 'b',
 'c',
 'o',
 'm']

In [44]:
# '+' groups consecutive matching characters into whole tokens. '@' and '.'
# are part of the class so the e-mail address survives as a single token;
# digits and commas are not, so '50' and '1,2,3,4' are dropped.
# Side effect: a sentence-final '.' stays attached to the preceding word.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text = tokenizer.tokenize(sentence)

In [45]:
useful_text

['Send',
 'all',
 'the',
 'document',
 'related',
 'to',
 'chapters',
 'at',
 'prateek@cb.com']

## Stemming
1 A process that transforms particular words (verbs, plurals) into their root (stem) form

2 It preserves the semantics of the sentence without increasing the number of unique tokens

Example: jumps, jumping, jumped, jump ==> jump

In [46]:
text   = """Foxes love to make jumpes. the quick brown fox was seen jumping over the
lovely dog from a 6th feet high wall"""

In [47]:
from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
#Snowball Stemmer ,Porter ,Lancaster Stemmer

In [48]:
ps = PorterStemmer()

In [49]:
ps.stem('jumping')

'jump'

In [50]:
ps.stem('lovely')

'love'

In [51]:
ps.stem('loving')

'love'

In [52]:
ps.stem('jumped')

'jump'

In [53]:
# let's work with snowball stemmer
ss = SnowballStemmer('english')

In [54]:
ss.stem('jumping')

'jump'

In [55]:
'''# Lemitization
from nltk.stem import WordNetLemmatizer
wn = WordNetLemmatizer()
wn.lemmatize('jumping')'''

"# Lemitization\nfrom nltk.stem import WordNetLemmatizer\nwn = WordNetLemmatizer()\nwn.lemmatize('jumping')"

## Building a vocab & vectorization

In [97]:
# Sample Corpus - Contains 4 Documents, each document can have 1 or more sentences
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]


In [98]:
from sklearn.feature_extraction.text import CountVectorizer

In [99]:
cv = CountVectorizer()

In [100]:
vectorized_corpus = cv.fit_transform(corpus)

In [101]:
vectorized_corpus = vectorized_corpus.toarray()

In [102]:
vectorized_corpus[0]

array([0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 1, 0, 2])

In [103]:
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [104]:
len(cv.vocabulary_.keys())

42

In [107]:
# Reverse mapping: pick the count vector at index 2 of the vectorized corpus
# so it can be mapped back to the terms it contains.
numbers = vectorized_corpus[2]
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [108]:
# inverse_transform expects a 2-D (n_samples, n_features) input, so present
# the single document vector as a one-row matrix before mapping it back to
# the terms that have non-zero counts.
s = cv.inverse_transform(numbers.reshape(1, -1))
print(s)

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U9')]


## Vectorization with Stopword Removal

In [109]:
def myTokenizer(document):
    """Tokenise ``document`` and drop stopwords, for use with CountVectorizer.

    The text is lower-cased, split with the notebook-level ``tokenizer`` and
    then filtered against the notebook-level stopword set ``sw``.

    NOTE(review): with the '[a-zA-Z@.]+' pattern assigned to ``tokenizer``
    earlier in this notebook, sentence-final periods stay attached to tokens
    (e.g. 'people.').
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)
    # Filter stopwords before handing the tokens back to the vectorizer.
    return remove_stopwords(tokens, sw)
    

In [110]:
#myTokenizer(sentence)
#print(sentence)

In [111]:
# A custom tokenizer replaces the default regex-based tokenisation, so
# token_pattern is unused; set it to None explicitly to avoid scikit-learn's
# "token_pattern will not be used" warning.
cv = CountVectorizer(tokenizer=myTokenizer, token_pattern=None)

In [112]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [113]:
print(vectorized_corpus)


[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]


In [114]:
print(len(vectorized_corpus[0]))

33


In [115]:
cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka.',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

In [118]:

# For Test Data
test_corpus = [
        'Indian cricket rock !',        
]

In [119]:
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

### More ways to Create features 
1 Unigram - every word as a feature

2 Bigram 

3 Trigram

4 n-grams

5 TF-IDF Normalisation 

In [120]:
sent_1  = ["this is good movie"]
sent_2 = ["this is good movie but actor is not present"]
sent_3 = ["this is not good movie"]

In [121]:
cv = CountVectorizer(ngram_range=(1,3))

In [122]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()


array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]])

In [123]:
cv.vocabulary_

{'this': 20,
 'is': 9,
 'good': 6,
 'movie': 14,
 'this is': 21,
 'is good': 10,
 'good movie': 7,
 'this is good': 22,
 'is good movie': 11,
 'but': 3,
 'actor': 0,
 'not': 17,
 'present': 19,
 'movie but': 15,
 'but actor': 4,
 'actor is': 1,
 'is not': 12,
 'not present': 18,
 'good movie but': 8,
 'movie but actor': 16,
 'but actor is': 5,
 'actor is not': 2,
 'is not present': 13}

## Tf-idf Normalisation
1 Avoid features that occur very often, because they carry less information

2 The information a term carries decreases as the number of documents it occurs in increases

3 So we define another quantity, term frequency-inverse document frequency (TF-IDF), which associates a weight with every term: the weight grows with the term's count within a document and shrinks as the term appears in more documents

In [130]:
sent_1  = "this is good movie"
sent_2 = "this was good movie"
sent_3 = "this is not good movie"

corpus = [sent_1,sent_2,sent_3]

In [131]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [132]:
tfidf = TfidfVectorizer()

In [133]:
vc = tfidf.fit_transform(corpus).toarray()

In [134]:

print(vc)

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]


In [135]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}