### 7.2. Bag of Words Approach

In [14]:
corpus = ["He likes to watch movies",
          "French movies are good to watch movies",
          "Do you like French movies?"]


from sklearn.feature_extraction.text import CountVectorizer
bog_vectorizer = CountVectorizer()

bog_vectorizer.fit(corpus)
print(bog_vectorizer.vocabulary_)

{'he': 4, 'likes': 6, 'to': 8, 'watch': 9, 'movies': 7, 'french': 2, 'are': 0, 'good': 3, 'do': 1, 'you': 10, 'like': 5}


In [15]:
bog_vectors =  bog_vectorizer.transform(corpus)
for i in range(len(corpus)):
    print(corpus[i],"-->",bog_vectors[i].toarray())

He likes to watch movies --> [[0 0 0 0 1 0 1 1 1 1 0]]
French movies are good to watch movies --> [[1 0 1 1 0 0 0 2 1 1 0]]
Do you like French movies? --> [[0 1 1 0 0 1 0 1 0 0 1]]


#### Removing Stop Words

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
bog_vectorizer = CountVectorizer(stop_words=["to", "you", "are", "he", "do"])

bog_vectorizer.fit(corpus)
print(bog_vectorizer.vocabulary_)

{'likes': 3, 'watch': 5, 'movies': 4, 'french': 0, 'good': 1, 'like': 2}


In [18]:
bog_vectors =  bog_vectorizer.transform(corpus)
for i in range(len(corpus)):
    print(corpus[i],"-->",bog_vectors[i].toarray())

He likes to watch movies --> [[0 0 0 1 1 1]]
French movies are good to watch movies --> [[1 1 0 0 2 1]]
Do you like French movies? --> [[1 0 1 0 1 0]]


### 7.3. N-Grams Approach

In [21]:
corpus = ["He likes to watch movies",
          "French movies are good to watch movies",
          "Do you like French movies?"]


from sklearn.feature_extraction.text import CountVectorizer
ng_vectorizer = CountVectorizer(ngram_range=(2,2))

ng_vectorizer.fit(corpus)
print(ng_vectorizer.vocabulary_)

{'he likes': 4, 'likes to': 6, 'to watch': 8, 'watch movies': 9, 'french movies': 2, 'movies are': 7, 'are good': 0, 'good to': 3, 'do you': 1, 'you like': 10, 'like french': 5}


In [22]:
ng_vectors =  ng_vectorizer.transform(corpus)
for i in range(len(corpus)):
    print(corpus[i],"-->",ng_vectors[i].toarray())

He likes to watch movies --> [[0 0 0 1 1 1]]
French movies are good to watch movies --> [[1 1 0 0 2 1]]
Do you like French movies? --> [[1 0 1 0 1 0]]


### 7.4. TF-IDF Approach

In [37]:
corpus = ["He likes to watch movies",
          "French movies are good to watch movies",
          "Do you like French movies?"]


from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

tfidf_vectorizer.fit(corpus)
print(tfidf_vectorizer.vocabulary_)

{'likes': 3, 'watch': 5, 'movies': 4, 'french': 0, 'good': 1, 'like': 2}


In [38]:
tfidf_vectors =  tfidf_vectorizer.transform(corpus)
for i in range(len(corpus)):
    print(corpus[i],"-->",tfidf_vectors[i].toarray())

He likes to watch movies --> [[0.         0.         0.         0.72033345 0.42544054 0.54783215]]
French movies are good to watch movies --> [[0.40352536 0.53058735 0.         0.         0.62674687 0.40352536]]
Do you like French movies? --> [[0.54783215 0.         0.72033345 0.         0.42544054 0.        ]]


In [39]:
print(tfidf_vectorizer.get_feature_names())

['french', 'good', 'like', 'likes', 'movies', 'watch']


In [41]:
import pandas as pd
df=pd.DataFrame(tfidf_vectorizer.idf_,index=['french', 'good', 'like', 'likes', 'movies', 'watch'],columns=['idf'])
df

Unnamed: 0,idf
french,1.287682
good,1.693147
like,1.693147
likes,1.693147
movies,1.0
watch,1.287682


In [42]:
import pandas as pd
df=pd.DataFrame(tfidf_vectorizer.idf_,index=['french', 'good', 'like', 'likes', 'movies', 'watch'],columns=['tf'])
df

Unnamed: 0,tf
french,1.287682
good,1.693147
like,1.693147
likes,1.693147
movies,1.0
watch,1.287682


In [43]:
df1=pd.DataFrame(tfidf_vectors.todense(),columns=['french', 'good', 'like', 'likes', 'movies', 'watch'])
df1

Unnamed: 0,french,good,like,likes,movies,watch
0,0.0,0.0,0.0,0.720333,0.425441,0.547832
1,0.403525,0.530587,0.0,0.0,0.626747,0.403525
2,0.547832,0.0,0.720333,0.0,0.425441,0.0


### 7.5. Word2Vec

In [45]:
# Open a file: file
file = open("E:\Datasets\simple_textfile.txt",mode='r')
 

corpus = file.read()
 
file.close()

In [48]:
import re
processed_corpus = corpus.lower()
processed_corpus = re.sub('[^a-zA-Z]', ' ', processed_corpus)
processed_corpus = re.sub(r'\s+', ' ', processed_corpus)

In [50]:
import nltk

all_sentences = nltk.sent_tokenize(processed_corpus)

all_tokens = [nltk.word_tokenize(sent) for sent in all_sentences]

# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_tokens)):
    all_tokens[i] = [w for w in all_tokens[i] if w not in stopwords.words('english')]

In [56]:
from gensim.models import Word2Vec

custom_embeddings = Word2Vec(all_tokens, size = 50, min_count=2)

In [72]:
embedding = custom_embeddings.wv['language']
print(embedding)

[-0.00426598  0.0066667  -0.00994872 -0.0018594  -0.00368378 -0.00152758
 -0.00074282  0.00785413 -0.00063565  0.00349408  0.00979154 -0.00473177
  0.00720627 -0.00183818 -0.00537117  0.00785426  0.00251265  0.00769123
 -0.00811501 -0.00216821 -0.00799533  0.00863134 -0.00185704  0.00969618
  0.0028168  -0.00972727  0.00881161  0.00715959  0.00634548  0.00720468
  0.00580982  0.00541707  0.00326962  0.00124786 -0.00173341 -0.00210726
  0.00082567  0.00168776  0.00956614  0.007015   -0.00264531 -0.00101851
 -0.00378467 -0.0045118   0.00485406  0.00942281 -0.00058634  0.00399187
 -0.00708318 -0.00132198]


In [76]:
embedding = custom_embeddings.wv.most_similar("processing")
print(embedding)

[('research', 0.23508717119693756), ('words', 0.20266640186309814), ('g', 0.2025955319404602), ('soft', 0.17397646605968475), ('accurate', 0.15688738226890564), ('simply', 0.1476944088935852), ('large', 0.13822558522224426), ('made', 0.1043497771024704), ('data', 0.08629055321216583), ('systems', 0.08272531628608704)]


In [58]:
custom_embeddings.save("E:\Datasets\custom_embeddings.model")

In [59]:
custom_embeddings_loaded = Word2Vec.load("E:\Datasets\custom_embeddings.model")
embedding_loaded = custom_embeddings_loaded.wv['language']
print(embedding_loaded)

[-0.00426598  0.0066667  -0.00994872 -0.0018594  -0.00368378 -0.00152758
 -0.00074282  0.00785413 -0.00063565  0.00349408  0.00979154 -0.00473177
  0.00720627 -0.00183818 -0.00537117  0.00785426  0.00251265  0.00769123
 -0.00811501 -0.00216821 -0.00799533  0.00863134 -0.00185704  0.00969618
  0.0028168  -0.00972727  0.00881161  0.00715959  0.00634548  0.00720468
  0.00580982  0.00541707  0.00326962  0.00124786 -0.00173341 -0.00210726
  0.00082567  0.00168776  0.00956614  0.007015   -0.00264531 -0.00101851
 -0.00378467 -0.0045118   0.00485406  0.00942281 -0.00058634  0.00399187
 -0.00708318 -0.00132198]


### 7.6 Using Pretrained Word Embeddings

In [63]:
from gensim.scripts.glove2word2vec import glove2word2vec

glove2word2vec(glove_input_file="E:\Datasets\glove.6B.100d.txt", word2vec_output_file="E:\Datasets\gensim_glove_vectors.txt")

(400000, 100)

In [64]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("E:\Datasets\gensim_glove_vectors.txt", binary=False)


In [70]:
glove_model['language']

array([ 0.18519 ,  0.34111 ,  0.36097 ,  0.27093 , -0.031335,  0.83923 ,
       -0.50534 , -0.80062 ,  0.40695 ,  0.82488 , -0.98239 , -0.6354  ,
       -0.21382 ,  0.079889, -0.29557 ,  0.17075 ,  0.17479 , -0.74214 ,
       -0.2677  ,  0.21074 , -0.41795 ,  0.027713,  0.71123 ,  0.2063  ,
       -0.12266 , -0.80088 ,  0.22942 ,  0.041037, -0.56901 ,  0.097472,
       -0.59139 ,  1.0524  , -0.66803 , -0.70471 ,  0.69757 , -0.11137 ,
       -0.27816 ,  0.047361,  0.020305, -0.184   , -1.0254  ,  0.11297 ,
       -0.79547 ,  0.41642 , -0.2508  , -0.3188  ,  0.37044 , -0.26873 ,
       -0.36185 , -0.096621, -0.029956,  0.67308 ,  0.53102 ,  0.62816 ,
       -0.11507 , -1.5524  , -0.30628 , -0.4253  ,  1.8887  ,  0.3247  ,
        0.60202 ,  0.81163 , -0.46029 , -1.4061  ,  0.80229 ,  0.2019  ,
        0.60938 ,  0.063545,  0.21925 , -0.043372, -0.36648 ,  0.61308 ,
        1.0207  , -0.39014 ,  0.1717  ,  0.61272 , -0.80342 ,  0.71295 ,
       -1.0938  , -0.50546 , -0.99668 , -1.6701  , 

In [68]:
glove_model.most_similar('intelligence')

[('cia', 0.742180585861206),
 ('information', 0.7210196256637573),
 ('security', 0.6963101625442505),
 ('fbi', 0.6962289810180664),
 ('military', 0.6934822201728821),
 ('secret', 0.6893364191055298),
 ('counterterrorism', 0.6762625575065613),
 ('pentagon', 0.6651185154914856),
 ('defense', 0.6564568281173706),
 ('agents', 0.6406551599502563)]

### Exercise 7.1

**Question 1:** Which of the following is not a disadvantage of Bag of Words and NGrams Approach

A. Results in a huge sparse matrix

B. Context information is not retained

C. Requires huge amount of data to train

D. None of the above

**Answer:** C


**Question 2:** Which attribute is used to specify range of N-Grams via Sklearn's CountVectorizer

A. ngrams

B. ng_rage

C. ngrams_range

D. ngram_range

**Answer:** D



**Question 2:** Suppose you develop a custom word2vec model "GensimModel" with Gensim. How will you display words similar to "Machine"?

A. GensimModel.wv.most_similar("Machine")

B. GensimModel.most_similar("Machine")

C. GensimModel.wv.similar("Machine")

D. GensimModel.similar("Machine")

**Answer:** A


### Exercise 7.2

Using the following corpus, create bag of words and TFIDF models without stopwords. Display the original words and the bag of words and TFIDF vectors:

dataset = [

    'This movie is excellent',
    'I loved the movie, it was fantastic',
    'The film is brilliant, you should watch',
    'Wonderful movie',
    'one of the best films ever',
    'fantastic film to watch',
    'great movie',
    'Acting and direction is brilliant'
]


**Solution**

In [82]:
dataset = [

    'This movie is excellent',
    'I loved the movie, it was fantastic',
    'The film is brilliant, you should watch',
    'Wonderful movie',
    'one of the best films ever',
    'fantastic film to watch',
    'great movie',
    'Acting and direction is brilliant'
]


## Bag of Words Approach

from sklearn.feature_extraction.text import CountVectorizer
bog_vectorizer = CountVectorizer(stop_words='english')

bog_vectorizer.fit(dataset)

bog_vectors =  bog_vectorizer.transform(dataset)
for i in range(len(dataset)):
    print(dataset[i],"-->",bog_vectors[i].toarray())

This movie is excellent --> [[0 0 0 0 1 0 0 0 0 0 1 0 0]]
I loved the movie, it was fantastic --> [[0 0 0 0 0 1 0 0 0 1 1 0 0]]
The film is brilliant, you should watch --> [[0 0 1 0 0 0 1 0 0 0 0 1 0]]
Wonderful movie --> [[0 0 0 0 0 0 0 0 0 0 1 0 1]]
one of the best films ever --> [[0 1 0 0 0 0 0 1 0 0 0 0 0]]
fantastic film to watch --> [[0 0 0 0 0 1 1 0 0 0 0 1 0]]
great movie --> [[0 0 0 0 0 0 0 0 1 0 1 0 0]]
Acting and direction is brilliant --> [[1 0 1 1 0 0 0 0 0 0 0 0 0]]


In [86]:
## TFIDF Approach

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

tfidf_vectorizer.fit(dataset)

tfidf_vectors =  tfidf_vectorizer.transform(dataset)
for i in range(len(dataset)):
    print(dataset[i],"-->",tfidf_vectors[i].toarray())

This movie is excellent --> [[0.         0.         0.         0.         0.84453372 0.
  0.         0.         0.         0.         0.53550237 0.
  0.        ]]
I loved the movie, it was fantastic --> [[0.         0.         0.         0.         0.         0.57771936
  0.         0.         0.         0.68933838 0.43709603 0.
  0.        ]]
The film is brilliant, you should watch --> [[0.         0.         0.57735027 0.         0.         0.
  0.57735027 0.         0.         0.         0.         0.57735027
  0.        ]]
Wonderful movie --> [[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.53550237 0.
  0.84453372]]
one of the best films ever --> [[0.         0.70710678 0.         0.         0.         0.
  0.         0.70710678 0.         0.         0.         0.
  0.        ]]
fantastic film to watch --> [[0.         0.         0.         0.         0.         0.57735027
  0.57735027 0.         0.         0.         0.  