## Corpus

In [1]:
import nltk
from nltk.corpus import brown

In [2]:
# e.g. given a piece of text, a typical task is to find which of these categories it belongs to
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [3]:
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [4]:
# To find sentences that belong to class mystery
data = brown.sents(categories=["mystery"])

In [5]:
# each list is a sentence
data

[['There', 'were', 'thirty-eight', 'patients', 'on', 'the', 'bus', 'the', 'morning', 'I', 'left', 'for', 'Hanover', ',', 'most', 'of', 'them', 'disturbed', 'and', 'hallucinating', '.'], ['An', 'interne', ',', 'a', 'nurse', 'and', 'two', 'attendants', 'were', 'in', 'charge', 'of', 'us', '.'], ...]

In [6]:
# number of sentences
len(data)

3886

In [7]:
data[0]

['There',
 'were',
 'thirty-eight',
 'patients',
 'on',
 'the',
 'bus',
 'the',
 'morning',
 'I',
 'left',
 'for',
 'Hanover',
 ',',
 'most',
 'of',
 'them',
 'disturbed',
 'and',
 'hallucinating',
 '.']

In [8]:
# Join the tokens of the sentence back into one string, separated by single spaces
" ".join(data[0])

'There were thirty-eight patients on the bus the morning I left for Hanover , most of them disturbed and hallucinating .'

## Tokenization

In [9]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [10]:
document = """ It was a very good movie. The cast was amazing and I liked the story.
I went to the movie hall to see it.
"""

sentence = "Code for Cause is too OP kunal@codeforcause.org"

In [11]:
sents = sent_tokenize(document)
print(sents)

[' It was a very good movie.', 'The cast was amazing and I liked the story.', 'I went to the movie hall to see it.']


In [12]:
len(sents)

3

In [13]:
words = word_tokenize(sentence) # also splits at special characters
                                # e.g. kunal@codeforcause.org is broken down into 'kunal', '@' and 'codeforcause.org'
print(words)
print(len(words))

['Code', 'for', 'Cause', 'is', 'too', 'OP', 'kunal', '@', 'codeforcause.org']
9


## Stop Word Removal

In [14]:
from nltk.corpus import stopwords

In [15]:
# Use a set for fast membership tests; note that a set has no guaranteed order
swords = set(stopwords.words('english'))

In [16]:
swords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [17]:
text = "i am not a very good cricket player".split()
print(text)

['i', 'am', 'not', 'a', 'very', 'good', 'cricket', 'player']


In [18]:
def remove_stopwords(text, stopwords):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable
        Tokens to filter (the notebook passes lists of strings).
    stopwords : container
        Tokens to drop; membership is checked with ``in``.

    Returns
    -------
    list
        The kept tokens in their original order. Matching is exact, so it
        is case-sensitive.
    """
    kept = []
    for token in text:
        if token in stopwords:
            # Stop word: skip it.
            continue
        kept.append(token)
    return kept

In [19]:
useful_word = remove_stopwords(text,swords)

In [20]:
useful_word

['good', 'cricket', 'player']

In [21]:
# Tokenization using RegEx

In [22]:
sent = "Code for Cause is too OP kunal@codeforcause.org"

In [23]:
from nltk.tokenize import RegexpTokenizer

In [24]:
# The pattern matches runs of ASCII letters, '@' and '.', so the e-mail address
# stays a single token; any other character (spaces, digits, ...) separates tokens.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful = tokenizer.tokenize(sent)
useful

['Code', 'for', 'Cause', 'is', 'too', 'OP', 'kunal@codeforcause.org']

## Stemming

In [25]:
# nltk provides us: Porter, Snowball, Lancaster stemmers

In [26]:
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer

In [27]:
ps = PorterStemmer()

In [28]:
ps.stem('running')

'run'

In [29]:
# SnowballStemmer = multilingual; it supports other languages as well.

## Building Vocabulary

In [30]:
corpus = [
    'Dan Morgan told himself he would forget Ann Turner.',
    'Sometimes he woke up in the middle of the night thinking of Ann , and then could not get back to sleep .',
    'His plans and dreams had revolved around her so much and for so long that now he felt as if he had nothing .',
    'He found that if he was tired enough at night , he went to sleep simply because he was too exhausted to stay awake .'
]

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
# Convert a collection of text documents to a matrix of token counts

# This implementation produces a sparse representation of the counts.

# If you do not provide an a-priori dictionary and you do not use an analyzer
# that does some kind of feature selection then the number of features will
# be equal to the vocabulary size found by analyzing the data.
cv = CountVectorizer()

In [33]:
# Learn the vocabulary dictionary and return document-term matrix. 
# This is equivalent to fit followed by transform, but more efficiently
# implemented.

# Parameters
# ----------
# raw_documents : iterable
  
# Returns
# -------
# X : array of shape (n_samples, n_features)
#     Document-term matrix.

vc = cv.fit_transform(corpus)

In [34]:
# 4 => number of sentences/examples
# 55 => number of features, i.e. the vocabulary size (unique lowercased tokens; stop words are still included)
vc

<4x55 sparse matrix of type '<class 'numpy.int64'>'
	with 65 stored elements in Compressed Sparse Row format>

In [35]:
# Return a dense ndarray representation of this matrix.
# Returns
# -------
# arr : ndarray, 2-dimensional
#    An array with the same shape and containing the same
#    data represented by the sparse matrix
vc.toarray()

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1],
       [1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 2, 1,
        1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0],
       [2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 2, 2, 1, 0,
        1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 2, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 4, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
        0, 1, 2, 0, 1, 0, 0, 2, 1, 0, 0]], dtype=int64)

In [36]:
vc

<4x55 sparse matrix of type '<class 'numpy.int64'>'
	with 65 stored elements in Compressed Sparse Row format>

In [37]:
# This is the vocabulary built from the sentences: a mapping from token to column index.
# Tokens are stored lowercased, e.g. 'dan' is at index 9 and 'morgan' is at index 27.
print(cv.vocabulary_)
# The trailing underscore marks an attribute that only exists after fitting (here, after fit_transform)
print(len(cv.vocabulary_))

{'dan': 9, 'morgan': 27, 'told': 47, 'himself': 21, 'he': 19, 'would': 54, 'forget': 15, 'ann': 1, 'turner': 49, 'sometimes': 39, 'woke': 53, 'up': 50, 'in': 24, 'the': 42, 'middle': 26, 'of': 33, 'night': 29, 'thinking': 44, 'and': 0, 'then': 43, 'could': 8, 'not': 30, 'get': 17, 'back': 6, 'to': 46, 'sleep': 37, 'his': 22, 'plans': 34, 'dreams': 10, 'had': 18, 'revolved': 35, 'around': 2, 'her': 20, 'so': 38, 'much': 28, 'for': 14, 'long': 25, 'that': 41, 'now': 32, 'felt': 13, 'as': 3, 'if': 23, 'nothing': 31, 'found': 16, 'was': 51, 'tired': 45, 'enough': 11, 'at': 4, 'went': 52, 'simply': 36, 'because': 7, 'too': 48, 'exhausted': 12, 'stay': 40, 'awake': 5}
55


In [38]:
# 0=> First sentence
# It means in First sentence, the word present at 9th index in the vocabulary appears one time
print(vc[0])

  (0, 9)	1
  (0, 27)	1
  (0, 47)	1
  (0, 21)	1
  (0, 19)	1
  (0, 54)	1
  (0, 15)	1
  (0, 1)	1
  (0, 49)	1


In [39]:
# Another example: In Second sentence, element at 33rd index is present twice.
# From vocab, element at 33rd index is 'of'.
# We can see in second sentence, 'of' appears twice
print(vc)

  (0, 9)	1
  (0, 27)	1
  (0, 47)	1
  (0, 21)	1
  (0, 19)	1
  (0, 54)	1
  (0, 15)	1
  (0, 1)	1
  (0, 49)	1
  (1, 19)	1
  (1, 1)	1
  (1, 39)	1
  (1, 53)	1
  (1, 50)	1
  (1, 24)	1
  (1, 42)	2
  (1, 26)	1
  (1, 33)	2
  (1, 29)	1
  (1, 44)	1
  (1, 0)	1
  (1, 43)	1
  (1, 8)	1
  (1, 30)	1
  (1, 17)	1
  :	:
  (2, 25)	1
  (2, 41)	1
  (2, 32)	1
  (2, 13)	1
  (2, 3)	1
  (2, 23)	1
  (2, 31)	1
  (3, 19)	4
  (3, 29)	1
  (3, 46)	2
  (3, 37)	1
  (3, 41)	1
  (3, 23)	1
  (3, 16)	1
  (3, 51)	2
  (3, 45)	1
  (3, 11)	1
  (3, 4)	1
  (3, 52)	1
  (3, 36)	1
  (3, 7)	1
  (3, 48)	1
  (3, 12)	1
  (3, 40)	1
  (3, 5)	1


In [40]:
# Previously, just showed the value, not assigned it
vc = vc.toarray()

In [41]:
numbers = vc[2] 

In [42]:
# The vocabulary indices are assigned in sorted (alphabetical) order of the tokens.
# e.g. here, the first value of the third sentence (vc[2]) is 2.
# The first value (index 0) corresponds to 'and' according to the vocabulary,
# and in the third sentence 'and' does appear twice.
print(numbers)

[2 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 2 2 1 0 1 1 0 1 0 0 1 0 0 1 1 0 1 1 0
 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [43]:
print(len(numbers))

55


In [44]:
def myTokenizer(document):
    """Split ``document`` into lowercase tokens and drop the stop words.

    Uses the notebook-level ``tokenizer`` (RegexpTokenizer), the
    ``remove_stopwords`` helper and the ``swords`` stop-word set.

    NOTE: the tokenizer pattern allows '.', so tokens such as '.' and
    'turner.' are kept and end up in the vocabulary.
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)
    # The raw tokens still contain stop words; filter them out before returning.
    return remove_stopwords(tokens, swords)

In [45]:
myTokenizer('This is a random text')

['random', 'text']

In [46]:
cv = CountVectorizer(tokenizer=myTokenizer)

In [47]:
# Vectorized corpus
vc = cv.fit_transform(corpus).toarray()

In [50]:
# Remember: in this bag-of-words representation the order of the words in a sentence doesn't matter
print(vc)

[[0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1]
 [1 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0]
 [1 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 1 0 0]]


In [48]:
# Now the length of vocabulary has been reduced
len(vc[0])

33

In [49]:
# This has been created using fit_transform
# Fit is going to learn from the data
# Transform is going to convert it into Vectorized format
print(cv.vocabulary_)

{'dan': 6, 'morgan': 16, 'told': 28, 'would': 32, 'forget': 11, 'ann': 1, 'turner.': 29, 'sometimes': 24, 'woke': 31, 'middle': 15, 'night': 18, 'thinking': 26, 'could': 5, 'get': 13, 'back': 4, 'sleep': 23, '.': 0, 'plans': 20, 'dreams': 7, 'revolved': 21, 'around': 2, 'much': 17, 'long': 14, 'felt': 10, 'nothing': 19, 'found': 12, 'tired': 27, 'enough': 8, 'went': 30, 'simply': 22, 'exhausted': 9, 'stay': 25, 'awake': 3}


In [51]:
# We call fit_transform only once, on the training data.
# Don't call it on the test data (use transform instead); otherwise it will learn from it and create a new vocabulary.
len(cv.transform([sent]).toarray()[0])

33

In [52]:
cv.vocabulary_

{'dan': 6,
 'morgan': 16,
 'told': 28,
 'would': 32,
 'forget': 11,
 'ann': 1,
 'turner.': 29,
 'sometimes': 24,
 'woke': 31,
 'middle': 15,
 'night': 18,
 'thinking': 26,
 'could': 5,
 'get': 13,
 'back': 4,
 'sleep': 23,
 '.': 0,
 'plans': 20,
 'dreams': 7,
 'revolved': 21,
 'around': 2,
 'much': 17,
 'long': 14,
 'felt': 10,
 'nothing': 19,
 'found': 12,
 'tired': 27,
 'enough': 8,
 'went': 30,
 'simply': 22,
 'exhausted': 9,
 'stay': 25,
 'awake': 3}

In [53]:
# Counter-example: calling fit_transform again re-learns the vocabulary from `sent` alone,
# so the number of features changes and the previously learned vocabulary is replaced.
len(cv.fit_transform([sent]).toarray()[0])

4

In [54]:
cv.vocabulary_

{'code': 1, 'cause': 0, 'op': 3, 'kunal@codeforcause.org': 2}