## Bag of words

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Toy corpus: three short sentences, each treated as one document.
sents = [
    'coronavirus is a highly infectious disease',
    'coronavirus affects older people the most',
    'older people are at high risk due to this disease',
]

In [4]:
# Bag-of-words vectorizer created with its default settings (no arguments passed).
cv = CountVectorizer()

In [5]:
# Fit the vectorizer on the corpus and convert the resulting sparse
# matrix to a dense array in one step, so X is only ever bound once.
X = cv.fit_transform(sents).toarray()

In [6]:
# Show the dense count matrix: one row per sentence, one column per vocabulary term.
X

array([[0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1]])

In [7]:
# Alphabetical list of the learned terms (iterating the vocabulary dict yields its keys).
sorted(cv.vocabulary_)

['affects',
 'are',
 'at',
 'coronavirus',
 'disease',
 'due',
 'high',
 'highly',
 'infectious',
 'is',
 'most',
 'older',
 'people',
 'risk',
 'the',
 'this',
 'to']

In [8]:
# ngram_range=(2, 2) restricts the features to two-word sequences (bigrams).
# Note: this rebinds `cv`, replacing the single-word vectorizer defined earlier.
cv = CountVectorizer(ngram_range=(2, 2))

In [9]:
# Fit the vectorizer currently bound to `cv` on the corpus and densify
# the sparse result in a single expression.
X = cv.fit_transform(sents).toarray()

In [10]:
# Alphabetical list of the learned terms (iterating the vocabulary dict yields its keys).
sorted(cv.vocabulary_)

['affects older',
 'are at',
 'at high',
 'coronavirus affects',
 'coronavirus is',
 'due to',
 'high risk',
 'highly infectious',
 'infectious disease',
 'is highly',
 'older people',
 'people are',
 'people the',
 'risk due',
 'the most',
 'this disease',
 'to this']

In [11]:
# Show the dense count matrix produced by the most recent fit_transform call.
X

array([[0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1]])

## TF-IDF - Term frequency and inverse document frequency

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Same three-sentence corpus, restated so this section can be read on its own.
sents = [
    'coronavirus is a highly infectious disease',
    'coronavirus affects older people the most',
    'older people are at high risk due to this disease',
]

In [14]:
# TF-IDF vectorizer created with its default settings (no arguments passed).
tfidf = TfidfVectorizer()

In [15]:
# Fit the vectorizer on the corpus and transform it in the same call;
# the result is kept as returned (it is densified only for display later).
transformed = tfidf.fit_transform(sents)

In [16]:
# Densify for display: one row per sentence, one column per vocabulary term.
transformed.toarray()

array([[0.        , 0.        , 0.        , 0.37302199, 0.37302199,
        0.        , 0.        , 0.49047908, 0.49047908, 0.49047908,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.45954803, 0.        , 0.        , 0.34949812, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.45954803, 0.34949812, 0.34949812, 0.        , 0.45954803,
        0.        , 0.        ],
       [0.        , 0.338348  , 0.338348  , 0.        , 0.25732238,
        0.338348  , 0.338348  , 0.        , 0.        , 0.        ,
        0.        , 0.25732238, 0.25732238, 0.338348  , 0.        ,
        0.338348  , 0.338348  ]])

In [17]:
import pandas as pd

In [18]:
# Table of TF-IDF weights for the first sentence only (row 0 of `transformed`),
# labelled by vocabulary term and ordered from the highest weight to the lowest.
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so get_feature_names_out() is used instead. toarray().ravel() yields a plain
# 1-D ndarray rather than the np.matrix that todense() returns.
df = pd.DataFrame(
    transformed[0].toarray().ravel(),
    index=tfidf.get_feature_names_out(),
    columns=["TF-IDF"],
).sort_values('TF-IDF', ascending=False)



In [19]:
# Show the per-term TF-IDF weights for the first sentence, highest weight first.
df

Unnamed: 0,TF-IDF
infectious,0.490479
highly,0.490479
is,0.490479
coronavirus,0.373022
disease,0.373022
older,0.0
this,0.0
the,0.0
risk,0.0
people,0.0


## Word2Vec

In [20]:
# CBOW (continuous bag-of-words)

In [21]:
from gensim import models

In [22]:
import gensim.downloader as api

# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE(review): presumably downloads the model on first use and reuses a local copy
# afterwards, so the first run may be slow — confirm.
wv = api.load('word2vec-google-news-300')


In [23]:
# Look up the embedding stored under the key 'healthy'.
vect = wv['healthy']

In [24]:
# Show the raw embedding values for 'healthy'.
vect

array([-0.14550781,  0.24804688, -0.15136719,  0.24609375, -0.14746094,
        0.04296875,  0.03515625, -0.01019287,  0.06103516, -0.12597656,
       -0.16796875, -0.078125  ,  0.12792969,  0.21777344, -0.08447266,
        0.01904297,  0.1484375 ,  0.13085938, -0.14941406,  0.12451172,
        0.18847656,  0.09082031, -0.18359375,  0.14453125, -0.10009766,
       -0.07714844, -0.3515625 ,  0.09082031,  0.00494385, -0.02148438,
       -0.02270508, -0.07080078,  0.19824219, -0.14648438, -0.11376953,
        0.08300781, -0.03857422, -0.15136719, -0.15429688,  0.25195312,
        0.12988281, -0.2734375 , -0.14550781,  0.03613281, -0.02441406,
       -0.05004883,  0.09667969,  0.05786133,  0.13085938,  0.04296875,
       -0.06079102, -0.06347656, -0.20703125,  0.08740234,  0.03466797,
       -0.06982422,  0.05371094, -0.31445312, -0.12988281, -0.20507812,
       -0.08886719,  0.13476562, -0.33984375,  0.09277344,  0.16210938,
        0.03271484, -0.05981445,  0.05297852, -0.03295898,  0.03

In [28]:
# Check the embedding's dimensionality (a single 300-element vector).
vect.shape

(300,)

In [37]:
# Reset the corpus to raw strings before the sentences are split into tokens.
sents = [
    'coronavirus is a highly infectious disease',
    'coronavirus affects older people the most',
    'older people are at high risk due to this disease',
]

In [38]:
# Split every sentence on whitespace, turning the list of strings into a list of token lists.
# This rebinds `sents`, so the cell expects string elements each time it runs;
# re-run the cell that defines the raw sentences before running this one again.
sents = [sentence.split() for sentence in sents]

In [39]:
# Show the tokenized corpus: one list of word tokens per sentence.
sents

[['coronavirus', 'is', 'a', 'highly', 'infectious', 'disease'],
 ['coronavirus', 'affects', 'older', 'people', 'the', 'most'],
 ['older',
  'people',
  'are',
  'at',
  'high',
  'risk',
  'due',
  'to',
  'this',
  'disease']]