## Bag of Words

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Toy corpus used throughout the notebook: four short sentences in the
# "text" column, each paired with a binary label in the "output" column.
df = pd.DataFrame(
    {
        "text": [
            "Word is very beautiful",
            "Everybody loves to travel",
            "people write travel diaries",
            "I love going out with friends",
        ],
        "output": [1, 1, 0, 0],
    }
)

df

Unnamed: 0,text,output
0,Word is very beautiful,1
1,Everybody loves to travel,1
2,people write travel diaries,0
3,I love going out with friends,0


In [None]:
# Bag-of-words vectorizer with default settings (no arguments passed).
# The vocabulary printed further down shows tokens are lower-cased
# ("Word" -> 'word', "Everybody" -> 'everybody').
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [None]:
# Learn the vocabulary from the "text" column and encode every sentence
# as a row of token counts in a single step.
bow = cv.fit_transform(df["text"])

In [None]:
# Show the sparse-matrix summary: 4 documents x 16 vocabulary terms.
bow

<4x16 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [None]:
# Mapping from each learned token to its column index in the count matrix.
cv.vocabulary_

{'word': 14,
 'is': 5,
 'very': 12,
 'beautiful': 0,
 'everybody': 2,
 'loves': 7,
 'to': 10,
 'travel': 11,
 'people': 9,
 'write': 15,
 'diaries': 1,
 'love': 6,
 'going': 4,
 'out': 8,
 'with': 13,
 'friends': 3}

In [None]:
# Dense view of the counts: one row per sentence, one column per vocabulary
# index (see cv.vocabulary_ for which token each column stands for).
bow.toarray()

array([[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0]])

In [None]:
# Count vector of the first sentence only (row 0 of the matrix above).
bow[0].toarray()

array([[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]])

In [None]:
# Encode an unseen sentence with the already-fitted vocabulary.
# Only tokens learned during fit are counted: 'to' (column 10) and
# 'travel' (column 11). 'want' and 'world' are not in cv.vocabulary_
# (the corpus contains 'word', not 'world'), so they contribute nothing.
cv.transform(['I want to travel world']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]])

## N-grams (bi-grams and tri-grams)

In [None]:
# Re-display the toy corpus for reference before building n-gram features.
df

Unnamed: 0,text,output
0,Word is very beautiful,1
1,Everybody loves to travel,1
2,people write travel diaries,0
3,I love going out with friends,0


In [None]:
# CountVectorizer is imported again here; it is the same class already
# imported in the Bag of Words section.
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(2,2): build features from pairs of adjacent words only
# (bi-grams). This rebinds `cv`, replacing the unigram vectorizer above.
cv = CountVectorizer(ngram_range=(2,2))

In [None]:
# Fit the bi-gram vocabulary and encode the corpus. This overwrites the
# unigram `bow` from the previous section.
bow = cv.fit_transform(df['text'])
# Show the sparse-matrix summary (4 documents x 13 bi-grams).
bow

<4x13 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [None]:
# Mapping from each learned bi-gram to its column index.
cv.vocabulary_

{'word is': 11,
 'is very': 2,
 'very beautiful': 9,
 'everybody loves': 0,
 'loves to': 4,
 'to travel': 7,
 'people write': 6,
 'write travel': 12,
 'travel diaries': 8,
 'love going': 3,
 'going out': 1,
 'out with': 5,
 'with friends': 10}

In [None]:
# Dense bi-gram counts: one row per sentence, one column per bi-gram index.
bow.toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0]])

In [None]:
# Bi-gram count vector of the second sentence only (row 1).
bow[1].toarray()

array([[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]])

In [None]:
# Tri-grams: ngram_range=(3,3) keeps only sequences of three adjacent words.
# Both `cv` and `bow` are rebound here, replacing the bi-gram versions.
cv = CountVectorizer(ngram_range=(3,3))
bow = cv.fit_transform(df['text'])
# Show the sparse-matrix summary (4 documents x 9 tri-grams).
bow

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [None]:
# Mapping from each learned tri-gram to its column index.
cv.vocabulary_

{'word is very': 7,
 'is very beautiful': 2,
 'everybody loves to': 0,
 'loves to travel': 4,
 'people write travel': 6,
 'write travel diaries': 8,
 'love going out': 3,
 'going out with': 1,
 'out with friends': 5}

## TF-IDF

In [None]:
# Re-display the toy corpus for reference before computing TF-IDF weights.
df

Unnamed: 0,text,output
0,Word is very beautiful,1
1,Everybody loves to travel,1
2,people write travel diaries,0
3,I love going out with friends,0


In [None]:
# TF-IDF vectorizer with default settings (no arguments passed).
from sklearn.feature_extraction.text import TfidfVectorizer
tfid= TfidfVectorizer()

In [None]:
# Learn the vocabulary and IDF weights from the corpus and encode every
# sentence as a row of TF-IDF weights.
trf = tfid.fit_transform(df['text'])

In [None]:
# Dense TF-IDF matrix: one row per sentence, one column per term.
# In the output each row has unit Euclidean length (e.g. the first row is
# four entries of 0.5), and column 11 carries a smaller weight (0.414) than
# the other non-zero entries in rows 1 and 2.
trf.toarray()

array([[0.5       , 0.        , 0.        , 0.        , 0.        ,
        0.5       , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.5       , 0.        , 0.5       ,
        0.        ],
       [0.        , 0.        , 0.52547275, 0.        , 0.        ,
        0.        , 0.        , 0.52547275, 0.        , 0.        ,
        0.52547275, 0.41428875, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.52547275, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.52547275,
        0.        , 0.41428875, 0.        , 0.        , 0.        ,
        0.52547275],
       [0.        , 0.        , 0.        , 0.4472136 , 0.4472136 ,
        0.        , 0.4472136 , 0.        , 0.4472136 , 0.        ,
        0.        , 0.        , 0.        , 0.4472136 , 0.        ,
        0.        ]])

In [None]:
# Learned inverse-document-frequency weight per term. Every term has the
# same value except index 11, which is lower; that column is non-zero in
# two rows of `trf` (the two sentences containing 'travel'), while every
# other column is non-zero in only one row.
tfid.idf_

array([1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.91629073, 1.51082562, 1.91629073, 1.91629073, 1.91629073,
       1.91629073])

## Word2Vec (word embeddings)

In [None]:
# Word2Vec comes from gensim; tokenisation comes from NLTK.
# The bare `import gensim` is not referenced directly in the visible cells
# (only Word2Vec is used).
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
# Download the tokenizer data that word_tokenize relies on.
# NOTE(review): newer NLTK releases appear to load 'punkt_tab' rather than
# 'punkt'; both are fetched here. Confirm which one the installed version needs.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Lower-case the whole text column in one vectorised step, then split each
# sentence into a list of word tokens (one token list per sentence).
tokenized_sentences = list(map(word_tokenize, df['text'].str.lower()))


In [None]:
# Train a Word2Vec model on the tokenised toy corpus.
#   vector_size=100 : each word is embedded as a 100-dimensional vector
#   window=5        : context window of up to 5 words on either side
#   min_count=1     : keep every token (the corpus is tiny, so most words
#                     occur only once)
#   sg=0            : CBOW training (sg=1 would select skip-gram)
#   seed=1          : the library default, spelled out so the run is
#                     explicitly seeded
#   workers=1       : single-threaded training; with several worker threads
#                     the update order varies between runs, so the vectors
#                     would not be reproducible on Restart & Run All
model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
    seed=1,
    workers=1,
)


In [None]:
# Look up the learned 100-dimensional embedding for the token 'beautiful'.
model.wv['beautiful']

array([-0.00713902,  0.00124103, -0.00717672, -0.00224462,  0.0037193 ,
        0.00583312,  0.00119818,  0.00210273, -0.00411039,  0.00722533,
       -0.00630704,  0.00464722, -0.00821997,  0.00203647, -0.00497705,
       -0.00424769, -0.00310898,  0.00565521,  0.0057984 , -0.00497465,
        0.00077333, -0.00849578,  0.00780981,  0.00925729, -0.00274233,
        0.00080022,  0.00074665,  0.00547788, -0.00860608,  0.00058446,
        0.00686942,  0.00223159,  0.00112468, -0.00932216,  0.00848237,
       -0.00626413, -0.00299237,  0.00349379, -0.00077263,  0.00141129,
        0.00178199, -0.0068289 , -0.00972481,  0.00904058,  0.00619805,
       -0.00691293,  0.00340348,  0.00020606,  0.00475375, -0.00711994,
        0.00402695,  0.00434743,  0.00995737, -0.00447374, -0.00138926,
       -0.00731732, -0.00969783, -0.00908026, -0.00102275, -0.00650329,
        0.00484973, -0.00616403,  0.00251919,  0.00073944, -0.00339215,
       -0.00097922,  0.00997913,  0.00914589, -0.00446183,  0.00

In [None]:
# `model.wv` is the KeyedVectors object that holds the trained word vectors.
model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x7d89069f7f10>

In [None]:
# Shape of the normalised embedding matrix: (number of tokens, vector_size).
model.wv.get_normed_vectors().shape


(17, 100)

In [None]:
# List of vocabulary tokens, in the same order as the rows of the
# embedding matrix; used later as the labels for the scatter plot.
y = model.wv.index_to_key
# 17 tokens, one more than the 16 columns CountVectorizer produced.
# Presumably the extra token is 'i', which word_tokenize keeps but the
# CountVectorizer vocabulary above does not contain (verify).
len(y)

17

In [None]:
from sklearn.decomposition import PCA


In [None]:
# Reduce the 100-dimensional embeddings to 3 components so they can be
# drawn in a 3-D scatter plot.
pca = PCA(n_components=3)


In [None]:
# Fit PCA on the normalised word vectors and project them in one step.
# Input is (17, 100), so X has one row of 3 coordinates per token, in the
# same row order as the embedding matrix.
X = pca.fit_transform(model.wv.get_normed_vectors())


In [None]:
import plotly.express as px

# Plot every word in the vocabulary as a point in PCA space, coloured and
# labelled by its token. The vocabulary has only 17 entries, so the earlier
# X[200:300] / y[200:300] slices were empty and the figure showed no points.
# x=0, y=1, z=2 select the three PCA component columns of X.
fig = px.scatter_3d(X, x=0, y=1, z=2, color=y)
fig.show()