# 1. Corpus

In [1]:
import nltk

In [2]:
# dependency
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Fetch only the corpora this notebook uses. A bare nltk.download() opens the
# interactive downloader and stalls "Restart & Run All".
nltk.download('brown')      # needed by brown.categories() / brown.sents()
nltk.download('stopwords')  # needed by stopwords.words('english')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [4]:
from nltk.corpus import brown

In [5]:
brown?

In [6]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [7]:
data = brown.sents(categories="science_fiction")

In [8]:
" ".join(data[1])

"Self's integrity was and is and ever had been ."

# 2 Bag of words pipeline
      convert text into numeric data
- get the data/corpus
        data acquisition
- tokenisation
        break down corpus -> sentences -> words(smallest tokens)
- stopword removal
        remove words which are not relevant to the task 
- stemming / lemmatization
        convert different forms of a word to the root form (loving, lover, loved -> love)
        preserve the semantics of a sentence without increasing the number of unique
        tokens
- building a vocab
        build a common vocab containing a list of all unique words
- vectorisation
        the easiest way is count/binary vectorization (based only on frequency), along with
        the labels
- classification
        apply a classification algorithm on the vectorized data
        

## 1. data corpus

In [9]:
document = """ it is a nice data . 
            i love eating pizza . 
            i love playing cricket"""

sentence = "mumbai indian won in 2019, 2017, 2015 and 2013"

## 2. tokenizer

In [10]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [11]:
word_tokenize?

In [12]:
sents = sent_tokenize(document)
print(sents)

[' it is a nice data .', 'i love eating pizza .', 'i love playing cricket']


In [13]:
words = word_tokenize(document)
print(words)

['it', 'is', 'a', 'nice', 'data', '.', 'i', 'love', 'eating', 'pizza', '.', 'i', 'love', 'playing', 'cricket']


## 3. stopwords removal

In [14]:
from nltk.corpus import stopwords

In [15]:
sw = set(stopwords.words('english'))

In [16]:
def remove_stopwords(text,stopwords):
    """Return the tokens of ``text`` that are not in ``stopwords``, order preserved.

    text      -- iterable of tokens (e.g. the output of ``word_tokenize``)
    stopwords -- container supporting ``in``; tokens found in it are dropped
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [17]:
text = remove_stopwords(word_tokenize(document),sw)

In [18]:
text

['nice',
 'data',
 '.',
 'love',
 'eating',
 'pizza',
 '.',
 'love',
 'playing',
 'cricket']

## Tokenisation using regular expression
- to build a custom tokenizer instead of using word and sentence tokenizer from nltk

In [19]:
sentence = "mumbai indian won in 2019, 2017, 2015 and 2013 in India"
from nltk.tokenize import RegexpTokenizer

In [20]:
tokenizer = RegexpTokenizer('[a-zA-Z,]+')
useful_text = tokenizer.tokenize(sentence)

In [21]:
useful_text

['mumbai', 'indian', 'won', 'in', ',', ',', 'and', 'in', 'India']

## 4. Stemming
    NLTK
        snowball stemmer - multilingual stemmer (english,german etc)
        porter stemmer
        lancaster stemmer
    each stemmer is based on different rules (regex); you can make a custom stemmer as
    well

In [22]:
text = """ foxes love to make jumps. the fox was seen jumping over the lovely dog 
            from a high 6ft wall"""

In [23]:
from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [24]:
ps = PorterStemmer()
ss = SnowballStemmer('english')

In [25]:
# examples
print(ps.stem('jumps'))
print(ps.stem('jumping'))
print(ps.stem('lovers'))
print(ps.stem('loving'))

jump
jump
lover
love


In [26]:
# examples
print(ss.stem('jumps'))
print(ss.stem('jumping'))
print(ss.stem('lovers'))
print(ss.stem('loving'))

jump
jump
lover
love


## Lemmatization

In [27]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [28]:
from nltk.stem import WordNetLemmatizer
wn = WordNetLemmatizer()

In [31]:
# No `pos` argument is given, so WordNet treats each word as a noun (NLTK's
# documented default): plural nouns are reduced ('jumps', 'lovers') while verb
# forms such as 'jumping' / 'loving' come back unchanged. Pass pos='v' to
# lemmatize them as verbs.
print(wn.lemmatize('jumps'))
print(wn.lemmatize('jumping'))
print(wn.lemmatize('lovers'))
print(wn.lemmatize('loving'))

jump
jumping
lover
loving


## 5. Building vocab and vectorization

In [37]:
corpus = [
    "Oxford vaccine trial pause won't have impact on Indian trials: Serum Institute CEO",
    "Police intensify crackdown on illegal arms trade in Bihar ahead of elections",
    "IPL 2020: Virat Kohli enjoys 'another productive training' ahead of SRH clash. IPL 2020: Deepak Chahar back to team hotel after returning negative twice",
    "Decided to play cricket for Australia when I was 4-year-old: Steve Smith"
]

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
# basic count vectorizer
cv = CountVectorizer()

In [41]:
# training and transforming on our corpus
vectorized_corpus = cv.fit_transform(corpus)

In [42]:
vectorized_corpus

<4x55 sparse matrix of type '<class 'numpy.int64'>'
	with 59 stored elements in Compressed Sparse Row format>

In [43]:
# learned vector
vectorized_corpus.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
        1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [2, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
        0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1]], dtype=int64)

In [45]:
# word and index mapping
print(cv.vocabulary_)

{'oxford': 32, 'vaccine': 49, 'trial': 46, 'pause': 33, 'won': 53, 'have': 18, 'impact': 21, 'on': 31, 'indian': 23, 'trials': 47, 'serum': 38, 'institute': 24, 'ceo': 8, 'police': 35, 'intensify': 25, 'crackdown': 11, 'illegal': 20, 'arms': 4, 'trade': 44, 'in': 22, 'bihar': 7, 'ahead': 2, 'of': 29, 'elections': 15, 'ipl': 26, '2020': 0, 'virat': 50, 'kohli': 27, 'enjoys': 16, 'another': 3, 'productive': 36, 'training': 45, 'srh': 40, 'clash': 10, 'deepak': 14, 'chahar': 9, 'back': 6, 'to': 43, 'team': 42, 'hotel': 19, 'after': 1, 'returning': 37, 'negative': 28, 'twice': 48, 'decided': 13, 'play': 34, 'cricket': 12, 'for': 17, 'australia': 5, 'when': 52, 'was': 51, 'year': 54, 'old': 30, 'steve': 41, 'smith': 39}


In [46]:
# inverse transformation
cv.inverse_transform(vectorized_corpus[0])

[array(['oxford', 'vaccine', 'trial', 'pause', 'won', 'have', 'impact',
        'on', 'indian', 'trials', 'serum', 'institute', 'ceo'],
       dtype='<U10')]

In [47]:
# size of the learned vocabulary (number of unique tokens)
len(cv.vocabulary_)

55

## Vectorisation with stopword removal

In [52]:
def myTokenizer(document):
    """Tokenizer callback for ``CountVectorizer``.

    Lower-cases ``document``, splits it with NLTK's ``word_tokenize`` and
    drops every token found in the module-level stopword set ``sw``.
    Punctuation tokens are not stopwords, so they are kept.
    """
    tokens = word_tokenize(document.lower())
    return remove_stopwords(tokens, sw)
    

In [54]:
myTokenizer("IPL 2020: Virat Kohli enjoys 'another productive training' ahead of SRH clash.")

['ipl',
 '2020',
 ':',
 'virat',
 'kohli',
 'enjoys',
 "'another",
 'productive',
 'training',
 "'",
 'ahead',
 'srh',
 'clash',
 '.']

In [55]:
# custom count vectorizer: tokenisation is delegated to myTokenizer.
# token_pattern=None makes it explicit that the default regex is unused
# (scikit-learn warns when both a tokenizer and a token_pattern are set).
cv = CountVectorizer(tokenizer=myTokenizer, token_pattern=None)
vectorized_corpus = cv.fit_transform(corpus)

In [56]:
vectorized_corpus.toarray()[0]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1], dtype=int64)

### 49 < 55: reduced dimensionality (we are only keeping the meaningful words)

In [57]:
# number of features after stopword removal (down from 55 with the default vectorizer);
# read it from the sparse matrix shape instead of densifying the whole matrix
vectorized_corpus.shape[1]

49

In [58]:
cv.inverse_transform(vectorized_corpus[0])

[array(['oxford', 'vaccine', 'trial', 'pause', 'wo', "n't", 'impact',
        'indian', 'trials', ':', 'serum', 'institute', 'ceo'], dtype='<U10')]

In [63]:
### we have to convert the test corpus into a format compatible with the train corpus

In [59]:
test = [
    "virat and deepak with srh"
]

In [61]:
test = cv.transform(test)

In [62]:
test.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0]], dtype=int64)

In [65]:
print(cv.vocabulary_)

{'oxford': 30, 'vaccine': 46, 'trial': 43, 'pause': 31, 'wo': 48, "n't": 28, 'impact': 22, 'indian': 23, 'trials': 44, ':': 5, 'serum': 36, 'institute': 24, 'ceo': 11, 'police': 33, 'intensify': 25, 'crackdown': 14, 'illegal': 21, 'arms': 7, 'trade': 41, 'bihar': 10, 'ahead': 6, 'elections': 18, 'ipl': 26, '2020': 3, 'virat': 47, 'kohli': 27, 'enjoys': 19, "'another": 1, 'productive': 34, 'training': 42, "'": 0, 'srh': 38, 'clash': 13, '.': 2, 'deepak': 17, 'chahar': 12, 'back': 9, 'team': 40, 'hotel': 20, 'returning': 35, 'negative': 29, 'twice': 45, 'decided': 16, 'play': 32, 'cricket': 15, 'australia': 8, '4-year-old': 4, 'steve': 39, 'smith': 37}


# More ways to create features
- Unigram (done above)
        every word is treated as a feature
- bigram
        two consecutive words are treated as a feature helpful in capturing things like 
        negation 
- trigram
        three consecutive words are treated as a feature
- n-gram
       lower_window to upper_window (1,2,3...n): all combinations of consecutive words
       are treated as features, up to the nth gram
- tf-idf
    

### bi gram and tri gram

In [66]:
# bigram
cv = CountVectorizer(ngram_range=(2,2))

In [67]:
docs = [
    "this is a good movie",
    "this is not a good movie"
]

In [68]:
cv.fit_transform(docs)

<2x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [69]:
cv.vocabulary_

{'this is': 4, 'is good': 1, 'good movie': 0, 'is not': 2, 'not good': 3}

### n-gram

In [71]:
# n-grams
cv = CountVectorizer(ngram_range=(1,3))

In [72]:
cv.fit_transform(docs)

<2x15 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [73]:
cv.vocabulary_

{'this': 11,
 'is': 2,
 'good': 0,
 'movie': 7,
 'this is': 12,
 'is good': 3,
 'good movie': 1,
 'this is good': 13,
 'is good movie': 4,
 'not': 8,
 'is not': 5,
 'not good': 9,
 'this is not': 14,
 'is not good': 6,
 'not good movie': 10}

## Tf-idf Normalisation
   - Avoid features that occur very often in the corpus, because they carry very little information
   - information provided by a feature decreases as the number of occurence increases across different type of documents (en
   - we will give more importance to words/features that better define the type of document
   - term frequency - inverse document frequency
            associates a weight with every term

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [76]:
corpus = [
    "this is a good movie",
    "this was a very good movie",
    "this was not a good movie",
    "this was a pathetic movie, not a good experience"
]

In [77]:
tfidf = TfidfVectorizer()

In [78]:
vc = tfidf.fit_transform(corpus)

In [80]:
# weigths given to features
print(vc.toarray())

[[0.         0.38713857 0.74187006 0.38713857 0.         0.
  0.38713857 0.         0.        ]
 [0.         0.34989318 0.         0.34989318 0.         0.
  0.34989318 0.67049706 0.42796959]
 [0.         0.38408524 0.         0.38408524 0.58028582 0.
  0.38408524 0.         0.46979139]
 [0.50991489 0.26609474 0.         0.26609474 0.40202276 0.50991489
  0.26609474 0.         0.32547207]]


In [82]:
tfidf.vocabulary_

{'this': 6,
 'is': 2,
 'good': 1,
 'movie': 3,
 'was': 8,
 'very': 7,
 'not': 4,
 'pathetic': 5,
 'experience': 0}

## All these different methods of vectorisation can be used to produce features for any classification method