In [2]:
# One-time setup if NLTK is not installed (run in its own cell): %pip install nltk

In [3]:
# Three short sentences used as the running example for tokenization.
example = (
    'It was a very pleasant day. '
    'The weather was cool and there were light showers. '
    'I went to the market to buy some fruits.'
)



In [4]:
# Show the raw example text before it is tokenized.
print(example)

It was a very pleasant day. The weather was cool and there were light showers. I went to the market to buy some fruits.


In [5]:
from nltk.tokenize import word_tokenize

In [10]:
# Split the example into word and punctuation tokens.
# NOTE(review): word_tokenize relies on NLTK's 'punkt' data, which this notebook only
# downloads in a later cell — on a fresh environment that download must run first.
words = word_tokenize(example)

In [11]:
# Inspect the tokens; punctuation such as '.' comes back as separate entries.
words

['It',
 'was',
 'a',
 'very',
 'pleasant',
 'day',
 '.',
 'The',
 'weather',
 'was',
 'cool',
 'and',
 'there',
 'were',
 'light',
 'showers',
 '.',
 'I',
 'went',
 'to',
 'the',
 'market',
 'to',
 'buy',
 'some',
 'fruits',
 '.']

## Stopwords Removal

In [12]:
import nltk

In [13]:
# NOTE(review): called with no arguments, nltk.download() opens NLTK's interactive
# downloader, which waits for user input and so stalls "Restart & Run All".
# The specific packages are fetched by name in the following cells — confirm this
# call is still needed.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [14]:
# Fetch the stopword lists read through nltk.corpus.stopwords below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/mohit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Fetch the 'punkt' tokenizer data.
# NOTE(review): word_tokenize is already called in an earlier cell, so on a fresh
# environment this download has to run before that cell.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/mohit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
# NOTE(review): the Brown corpus is not referenced by any cell visible here —
# confirm it is needed before keeping this download.
nltk.download('brown')

[nltk_data] Downloading package brown to /Users/mohit/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [17]:
from nltk.corpus import stopwords

In [26]:
# English stopword list; the functions below read it as the notebook-level name `sw`.
sw = stopwords.words('english')

In [27]:
# Print the whole list; the entries shown in the output are lower-case.
print(sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [32]:
def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stopwords.

    Parameters
    ----------
    text : iterable of str
        Already-tokenized words (e.g. the output of ``word_tokenize``).
    stop_words : iterable of str, optional
        Lower-case words to drop. When omitted, the notebook-level ``sw``
        list is used, which matches the original behavior.

    Returns
    -------
    list of str
        The surviving tokens in their original order and casing. Tokens are
        compared in lower case, so 'The' is removed when 'the' is a stopword.
        Punctuation tokens are kept unless they appear in ``stop_words``.
    """
    if stop_words is None:
        stop_words = sw
    # A set gives constant-time membership tests instead of scanning the list per token.
    stop_set = set(stop_words)
    return [word for word in text if word.lower() not in stop_set]

In [34]:
# Apply the filter to the example tokens; note that '.' tokens survive in the output.
remove_stopwords(words)

['pleasant',
 'day',
 '.',
 'weather',
 'cool',
 'light',
 'showers',
 '.',
 'went',
 'market',
 'buy',
 'fruits',
 '.']

In [38]:
# Tokenize a sentence containing numbers and an e-mail address, then strip stopwords,
# printing the token list before and after so the two can be compared.
sentence = word_tokenize(
    "Send all the 50 documents related to chapters 1,2,3 at mohit@codingminutes.com"
)
print(sentence)

sentence = remove_stopwords(sentence)
print(sentence)

['Send', 'all', 'the', '50', 'documents', 'related', 'to', 'chapters', '1,2,3', 'at', 'mohit', '@', 'codingminutes.com']
['Send', '50', 'documents', 'related', 'chapters', '1,2,3', 'mohit', '@', 'codingminutes.com']


### Stemming

In [53]:
# Sample text with inflected forms (Foxes, jumps, jumping, lovely) for the stemming demo.
text = "Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog from a 6ft feet high wall"

In [43]:
# NOTE(review): the saved output below lacks the space after "jumps." that the current
# `text` literal has, so it comes from an earlier version of `text` — re-run to refresh.
print(text)

Foxes love to make jumps.The quick brown fox was seen jumping over the lovely dog from a 6ft feet high wall


In [44]:
from nltk.stem import SnowballStemmer

In [45]:
# Notebook-level English Snowball stemmer, reused by the helper functions below.
ss = SnowballStemmer(language='english')

In [46]:
# Past-tense form: the output shows it reduces to 'love'.
ss.stem('loved')

'love'

In [47]:
# The base form comes back unchanged.
ss.stem('love')

'love'

In [48]:
# The '-ing' form reduces to 'jump'.
ss.stem('jumping')

'jump'

In [49]:
# The '-ed' form reduces to the same stem, 'jump'.
ss.stem('jumped')

'jump'

In [62]:
def stemming(data):
    """Tokenize ``data`` and return the stem of every token.

    ``data`` is a raw string. It is split with ``word_tokenize`` and each
    token (punctuation included) is passed through the notebook-level
    stemmer ``ss``. The stems are returned as a list in token order.
    """
    return [ss.stem(token) for token in word_tokenize(data)]

In [63]:
# Stem the sample text; in the output 'Foxes' becomes 'fox' and 'lovely' becomes 'love'.
stemming(text)

['fox',
 'love',
 'to',
 'make',
 'jump',
 '.',
 'the',
 'quick',
 'brown',
 'fox',
 'was',
 'seen',
 'jump',
 'over',
 'the',
 'love',
 'dog',
 'from',
 'a',
 '6ft',
 'feet',
 'high',
 'wall']

# Data Cleaning/Preprocessing

In [65]:
# Four short example documents used for the cleaning and vectorization demos.
# The text is kept verbatim (including 'will wins' and 'laurate') because the
# saved outputs further down were computed from exactly these strings.
corpus = [
    'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
    'We will win next Lok Sabha Elections, says confident Indian PM',
    'The nobel laurate won the hearts of the people.',
    'The movie Raazi is an exciting Indian Spy thriller based upon a real story.',
]

In [80]:
def data_cleaning(document):
    """Normalize one raw document into a space-separated string of stems.

    Steps: lower-case the text, tokenize it, drop tokens that are in the
    notebook-level stopword list ``sw`` or are only one character long,
    stem the remaining tokens with ``ss``, and join the stems with spaces.
    """
    tokens = word_tokenize(document.lower())

    stems = []
    for token in tokens:
        # The length check discards single-character tokens such as '.' and ','.
        if len(token) <= 1 or token in sw:
            continue
        stems.append(ss.stem(token))

    # Return a single string rather than a token list.
    return " ".join(stems)

In [81]:
# Spot-check the pipeline on a single sentence before running it over the corpus.
data_cleaning('Indian loving cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.')

'indian love cricket team win world cup say capt virat koh world cup held sri lanka'

In [84]:
# Clean every document in corpus order. A comprehension replaces the append loop
# and avoids leaving a throwaway temporary in the notebook namespace.
cleaned_dataset = [data_cleaning(document) for document in corpus]

In [85]:
# One cleaned, stemmed string per original document.
cleaned_dataset

['indian cricket team win world cup say capt virat koh world cup held sri lanka',
 'win next lok sabha elect say confid indian pm',
 'nobel laurat heart peopl',
 'movi raazi excit indian spi thriller base upon real stori']

# Building a Vocab & Vectorization

In [87]:
# Bag of words
from sklearn.feature_extraction.text import CountVectorizer

In [89]:
# Count-based vectorizer created with its default settings.
cv = CountVectorizer()

In [92]:
# Learn the vocabulary and build the dense document-term count matrix.
# NOTE(review): this vectorizes the raw `corpus`, not `cleaned_dataset`, so the cleaning
# step above is unused and the vocabulary keeps stopwords and unstemmed forms
# ('an', 'at', 'the', 'win'/'wins'/'won' in the columns below) — confirm this is intended.
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [93]:
# .toarray() produced a dense NumPy array (see output).
type(vectorized_corpus)

numpy.ndarray

In [94]:
# One row per document, one column per vocabulary term.
vectorized_corpus.shape

(4, 42)

In [95]:
# Raw term counts; the matrix is small enough here to display in full.
vectorized_corpus

array([[0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 1, 0, 2],
       [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])

In [98]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2. Prefer get_feature_names_out() and fall back only on older installs.
# list(...) keeps `feature_names` a plain list, as the old method returned.
if hasattr(cv, "get_feature_names_out"):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()

In [99]:
import pandas as pd

In [101]:
# Label each count column with its vocabulary term so the matrix is readable.
df = pd.DataFrame(data=vectorized_corpus, columns=feature_names)

In [102]:
# Document-term matrix: one row per document, one column per vocabulary word.
df

Unnamed: 0,an,at,based,be,capt,confident,cricket,cup,elections,exciting,...,the,thriller,upon,virat,we,will,win,wins,won,world
0,0,1,0,1,1,0,1,2,0,0,...,0,0,0,1,0,2,0,1,0,2
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,1,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,1,0
3,1,0,1,0,0,0,0,0,0,1,...,1,1,1,0,0,0,0,0,0,0


In [103]:
# Probe sentence mixing words found in the corpus (confident, virat, an, at, world)
# with words that do not appear in it (mohit, icecream, mobile).
test = "mohit confident virat an at world icecream mobile"

In [107]:
# transform (not fit_transform) reuses the already-fitted vectorizer, so the result
# has the same 42 columns and words it has never seen contribute nothing.
test_vectorized = cv.transform([test]).toarray()

In [108]:
# Only five entries are non-zero, although the probe sentence has eight words.
test_vectorized

array([[1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]])

In [109]:
# A single row with the same number of columns as the fitted corpus matrix.
test_vectorized.shape

(1, 42)