### Bag-of-words approach: takes all the text, collects the unique words into a vocabulary, and then represents each document as a vector that records which vocabulary words occur in it.
### Application: Document classification

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
train_x = ['I love the book', 'this is a great book', 'the fit is great', 'I love the shoes']

In [3]:
# binary=True keeps every vector entry as 1 or 0 (word present / absent). Without it, a word that
# appeared twice in a sentence would get the entry 2 in the corresponding vector position.
vectorizer = CountVectorizer(binary = True)

In [4]:
vectors = vectorizer.fit_transform(train_x) 

In [5]:
vectors.toarray() # converting sparse matrix into normal array using toarray() method

array([[1, 0, 0, 0, 1, 0, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 1, 1, 0]], dtype=int64)

In [6]:
# To get the set of unique features (the vocabulary behind the vector columns).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; get_feature_names_out()
# is its replacement (requires scikit-learn >= 1.0). It returns a NumPy array, so convert it to a
# plain list to keep the familiar list output.
vectorizer.get_feature_names_out().tolist()

['book', 'fit', 'great', 'is', 'love', 'shoes', 'the', 'this']

In [7]:
# Next, build a simple model that sorts the sentences above into two categories: books and clothing.
class Category:
    """String labels for the two document classes."""

    BOOKS, CLOTHING = "BOOKS", "CLOTHING"

In [8]:
# Training labels: one per sentence in train_x, in the same order.
train_y = [
    Category.BOOKS,
    Category.BOOKS,
    Category.CLOTHING,
    Category.CLOTHING,
]

In [9]:
# A linear SVM is a simple and strong baseline for classifying text
from sklearn import svm

In [10]:
from sklearn.svm import SVC

In [11]:
model = SVC(kernel = 'linear')

In [78]:
model.fit(vectors,train_y)

SVC(kernel='linear')

In [79]:
test = vectorizer.transform(['I like the book'])

In [80]:
model.predict(test.toarray())

array(['BOOKS'], dtype='<U8')

In [15]:
test_1 = vectorizer.transform(['cardigans are great'])

In [16]:
model.predict(test_1.toarray())

array(['CLOTHING'], dtype='<U8')

In [17]:
# The approach above is called unigram because every token is a single word.
# We can also use a bigram approach, where a token is a pair of adjacent words. This is highly useful in sentiment analysis:
# 'great' on its own might be classified as positive, but what if the sentence actually said 'not great'? In such cases the bigram approach helps.
# Note that ngram_range is an argument of the CountVectorizer constructor, not of fit_transform:
# vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2)) -> tokens with one and two words
# vectors = vectorizer.fit_transform(train_x)

### Word2Vec (word vectors) approach: It tries to capture the semantic meaning of words in the vector. It learns that meaning from the window of text surrounding each word. For example, in "Best book I've read in years", the window around "book" could be "Best book I've read".

In [18]:
#!pip install spacy
import spacy

In [19]:
import sys
!{sys.executable} -m spacy download en_core_web_md

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_md')


In [20]:
# Load the installed model package by name instead of an absolute, machine-specific path, so the
# notebook also runs on other machines once the model download has succeeded.
# 'md' stands for the medium-sized model.
nlp = spacy.load('en_core_web_md')

In [21]:
docs = [nlp(text) for text in train_x]

In [22]:
docs

[I love the book, this is a great book, the fit is great, I love the shoes]

In [23]:
print(docs[0].vector) # this gives the vector representation of the text 'I love the book'

[ 0.08563001  0.313255   -0.2392405  -0.17215225  0.1418515   0.1970548
  0.04868999 -0.12744625  0.05947001  2.1347     -0.61964     0.01162549
  0.29980502 -0.125354    0.017935   -0.1355105  -0.27094752  1.1129825
 -0.16986902 -0.0266875   0.14768225 -0.16372526  0.121907   -0.06876825
 -0.061945    0.08704174 -0.2005705  -0.24039775 -0.0675595   0.0926495
 -0.13526568  0.24121101 -0.20299     0.30007     0.11574501  0.055062
  0.013516   -0.0664179  -0.3380587  -0.17823698 -0.01039225  0.03333575
 -0.10241525 -0.093445    0.09327275  0.20661727 -0.15074751  0.14018372
  0.23520125 -0.05192125 -0.0999365  -0.1212635  -0.05895525 -0.005062
  0.06003174  0.01213001 -0.11257375 -0.24570274  0.00678    -0.1888345
 -0.09276348 -0.25614128 -0.20717824  0.0858725  -0.02215025 -0.303222
 -0.00274375  0.11888     0.02695867  0.20738849  0.02140525 -0.0175935
  0.1513575  -0.0032025   0.20425075  0.16609626 -0.084585   -0.0744465
 -0.1083965   0.14420825  0.13595775  0.2158625   0.15477975 -0

In [24]:
# One document vector per training sentence; iterate over the docs directly rather than by index.
train_vectors = [doc.vector for doc in docs]

In [25]:
train_vectors

[array([ 0.08563001,  0.313255  , -0.2392405 , -0.17215225,  0.1418515 ,
         0.1970548 ,  0.04868999, -0.12744625,  0.05947001,  2.1347    ,
        -0.61964   ,  0.01162549,  0.29980502, -0.125354  ,  0.017935  ,
        -0.1355105 , -0.27094752,  1.1129825 , -0.16986902, -0.0266875 ,
         0.14768225, -0.16372526,  0.121907  , -0.06876825, -0.061945  ,
         0.08704174, -0.2005705 , -0.24039775, -0.0675595 ,  0.0926495 ,
        -0.13526568,  0.24121101, -0.20299   ,  0.30007   ,  0.11574501,
         0.055062  ,  0.013516  , -0.0664179 , -0.3380587 , -0.17823698,
        -0.01039225,  0.03333575, -0.10241525, -0.093445  ,  0.09327275,
         0.20661727, -0.15074751,  0.14018372,  0.23520125, -0.05192125,
        -0.0999365 , -0.1212635 , -0.05895525, -0.005062  ,  0.06003174,
         0.01213001, -0.11257375, -0.24570274,  0.00678   , -0.1888345 ,
        -0.09276348, -0.25614128, -0.20717824,  0.0858725 , -0.02215025,
        -0.303222  , -0.00274375,  0.11888   ,  0.0

In [26]:
# Linear SVM trained on the document vectors in train_vectors.
# Note: despite its name, `word2vec` is the classifier, not an embedding model.
word2vec = svm.SVC(kernel = 'linear')
word2vec.fit(train_vectors,train_y)

SVC(kernel='linear')

In [27]:
word2vec.predict([nlp('This is an awesome story').vector]) # this approach is now catching the semantics also

array(['BOOKS'], dtype='<U8')

In [28]:
word2vec.predict([nlp('These earings hurt bro').vector])

array(['CLOTHING'], dtype='<U8')

### Regexes

In [29]:
# example 1: starts with 'ab', followed by any run of non-whitespace characters (\S*), and ends with 'cd'
import re
exp = re.compile(r'^ab\S*cd$')

In [30]:
phrases = ['aabbcd','abcbde','aahghf','abcd']

In [31]:
# Number of phrases that match the pattern (each True counts as 1 when summed).
count = sum(exp.match(candidate) is not None for candidate in phrases)

In [32]:
count

1

In [33]:
# example 2: check whether the text contains either 'read', 'story' or 'book'
exp_1 = re.compile(r'read|story|book')

In [34]:
# Neutral sample text containing none of the three words, so nothing is printed.
text = 'Surprise party tonight'
if exp_1.search(text):
    print('YES')

In [35]:
# Prints YES even though none of read/story/book appears as a word of its own: the pattern also
# matches inside longer words ('read' in 'treaded', 'story' in 'history').
test = 'I treaded that history'
if exp_1.search(test):
    print('YES')

YES


In [36]:
# To overcome the drawback of matching inside longer words, use the word boundary '\b'.
# The alternatives are grouped so that a boundary is required on BOTH sides of every word, and the
# pattern contains no literal spaces (a space inside a regex must match a real space in the text,
# which would make e.g. 'a story' at the end of a string fail to match).
exp_2 = re.compile(r'\b(?:read|story|book)\b')

In [37]:
# With word boundaries in place, 'treaded' and 'history' no longer count as hits.
test_ = 'I treaded that history'
print('YES' if exp_2.search(test_) else 'No')
    

No


### Stemming and Lemmatization - Used for normalizing the text.

In [38]:
# Stemming: books -> book, reading -> read, stories -> stori (this is where stemming falls short: it does not
# guarantee an actual English word)
# Lemmatization is a better alternative and guarantees an actual English word

In [39]:
import nltk

In [40]:
nltk.download('stopwords') # stop-word lists; examples of stop words: this, that, the, a, an etc.
nltk.download('wordnet') # WordNet lexical database (presumably what WordNetLemmatizer relies on)
nltk.download('punkt') # tokenizer data (presumably what word_tokenize relies on)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anuj8\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anuj8\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anuj8\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [41]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [42]:
stemmer = PorterStemmer()

In [43]:
phrase = 'Reading the books'

In [44]:
x = word_tokenize(phrase)

In [45]:
# Print the stem of every token, one per line.
for token in x:
    stem = stemmer.stem(token)
    print(stem)


read
the
book


In [46]:
from nltk.stem import WordNetLemmatizer

In [47]:
lemmatizer = WordNetLemmatizer()

In [48]:
y = 'I hate love stories'
y1 = word_tokenize(y)

In [49]:
# pos is the part of speech each word is treated as ('n' = noun); choose it to suit the text.
for word in y1:
    lemma = lemmatizer.lemmatize(word, pos='n')
    print(lemma)
    

I
hate
love
story


In [50]:
from nltk.corpus import stopwords

In [53]:
words = stopwords.words('english')

In [54]:
words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [55]:
phrase = 'Here is an example demonstrating removal of stopwords'

In [56]:
tokens = word_tokenize(phrase)

In [58]:
clean = []
for token in tokens:
    if token not in words:
        clean.append(token)

In [59]:
clean

['Here', 'example', 'demonstrating', 'removal', 'stopwords']

In [61]:
" ".join(clean)

'Here example demonstrating removal stopwords'

### Other(spell correction, sentiment, part of speech tagging)

In [63]:
from textblob import TextBlob

In [74]:
# The text we are working on always needs to be wrapped in a TextBlob object first
phrase = 'this is a very good but bad phrase'
tb_phrase = TextBlob(phrase)

In [75]:
tb_phrase.correct()

TextBlob("this is a very good but bad phrase")

In [76]:
tb_phrase.tags

[('this', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('but', 'CC'),
 ('bad', 'JJ'),
 ('phrase', 'NN')]

In [77]:
tb_phrase.sentiment

Sentiment(polarity=0.10500000000000004, subjectivity=0.7233333333333334)