# One-Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Five colour labels arranged as a single-feature column, shape (5, 1).
data = np.array(['yellow', 'Black', 'Green', 'Black', 'yellow']).reshape(-1, 1)

# Ask for a dense NumPy array rather than a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)

# Learn the distinct categories and encode every row in one call; each row of
# the result carries a single 1 in the column of its category.
encoded_data = encoder.fit_transform(data)

In [None]:
# Rows follow `data`; the output below shows the columns ordered Black, Green, yellow.
encoded_data

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

# Bag of Words (BOW)

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Toy corpus: four short documents in "text" plus a binary label in "output".
df = pd.DataFrame(
    dict(
        text=[
            "I love Bangladesh",
            "Bangladesh love Bangladesh",
            "I belong here",
            "Bangladesh beautyful country",
        ],
        output=[1, 0, 0, 1],
    )
)

df

Unnamed: 0,text,output
0,I love Bangladesh,1
1,Bangladesh love Bangladesh,0
2,I belong here,0
3,Bangladesh beautyful country,1


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Default settings. The vocabulary printed further down shows what they produce
# on this corpus: lowercased single-word tokens, with the one-letter word "I" absent.
cv = CountVectorizer()

In [None]:
# Learn the vocabulary from the text column and count each term per document.
# The result is a sparse matrix (see its repr in the next cell).
bow = cv.fit_transform(df['text'])

In [None]:
# Show the sparse matrix summary: storage format, dtype, stored elements and shape.
bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 9 stored elements and shape (4, 6)>

In [None]:
# Densify to read the counts: one row per document, one column per vocabulary term.
bow.toarray()

array([[1, 0, 0, 0, 0, 1],
       [2, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0],
       [1, 1, 0, 1, 0, 0]])

In [None]:
# Vocabulary: maps each learned term to its column index in the count matrix.
print(cv.vocabulary_)

{'love': 5, 'bangladesh': 0, 'belong': 2, 'here': 4, 'beautyful': 1, 'country': 3}


# N-Grams

In [None]:
# Same toy corpus as in the Bag-of-Words section, rebuilt here so the N-gram
# cells start from a known frame.
df = pd.DataFrame(
    dict(
        text=[
            "I love Bangladesh",
            "Bangladesh love Bangladesh",
            "I belong here",
            "Bangladesh beautyful country",
        ],
        output=[1, 0, 0, 1],
    )
)

df

Unnamed: 0,text,output
0,I love Bangladesh,1
1,Bangladesh love Bangladesh,0
2,I belong here,0
3,Bangladesh beautyful country,1


In [None]:
# Bi-grams: with ngram_range=(2, 2) the vocabulary printed below contains
# two-word sequences only.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2,2))

In [None]:
# Refit on the same text column; `bow` now holds bi-gram counts and replaces
# the unigram matrix computed earlier.
bow = cv.fit_transform(df['text'])

In [None]:
# Dense view of the bi-gram counts: one row per document, one column per bi-gram.
bow.toarray()

array([[0, 0, 0, 0, 1],
       [0, 1, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [1, 0, 1, 0, 0]])

In [None]:
# Vocabulary: maps each learned bi-gram to its column index.
print(cv.vocabulary_)

{'love bangladesh': 4, 'bangladesh love': 1, 'belong here': 3, 'bangladesh beautyful': 0, 'beautyful country': 2}


In [None]:
# Tri-grams: with ngram_range=(3, 3) the vocabulary printed below contains
# three-word sequences only.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(3,3))

In [None]:
# Refit on the same text column; `bow` now holds tri-gram counts.
bow = cv.fit_transform(df['text'])

In [None]:
# Dense view of the tri-gram counts. In the output below, rows 0 and 2 are all
# zeros: those documents contribute no tri-gram to the vocabulary.
bow.toarray()

array([[0, 0],
       [0, 1],
       [0, 0],
       [1, 0]])

In [None]:
# Vocabulary: maps each learned tri-gram to its column index.
print(cv.vocabulary_)

{'bangladesh love bangladesh': 1, 'bangladesh beautyful country': 0}


# TF-IDF (Term Frequency–Inverse Document Frequency)

In [None]:
# Same toy corpus as in the earlier sections, rebuilt here so the TF-IDF cells
# start from a known frame.
df = pd.DataFrame(
    dict(
        text=[
            "I love Bangladesh",
            "Bangladesh love Bangladesh",
            "I belong here",
            "Bangladesh beautyful country",
        ],
        output=[1, 0, 0, 1],
    )
)

df

Unnamed: 0,text,output
0,I love Bangladesh,1
1,Bangladesh love Bangladesh,0
2,I belong here,0
3,Bangladesh beautyful country,1


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings.
tfid= TfidfVectorizer()

In [None]:
# Fit on the text column and convert the result straight to a dense array.
arr = tfid.fit_transform(df['text']).toarray()

In [None]:
# One row per document, one column per term. In the output below the squared
# weights of each row sum to 1, i.e. every row has unit length.
arr

array([[0.62922751, 0.        , 0.        , 0.        , 0.        ,
        0.77722116],
       [0.8508161 , 0.        , 0.        , 0.        , 0.        ,
        0.52546357],
       [0.        , 0.        , 0.70710678, 0.        , 0.70710678,
        0.        ],
       [0.41137791, 0.64450299, 0.        , 0.64450299, 0.        ,
        0.        ]])

# Word2Vec

In [None]:
!pip install --upgrade gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
import numpy as np
import pandas as pd
import gensim
import os

In [None]:
# NOTE(review): gensim is installed by an earlier cell and already imported in
# the cell above, so this second install (with --user) looks redundant —
# confirm and remove.
!pip install --upgrade gensim --user



In [51]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import nltk
# Fetch NLTK's Punkt data packages; presumably needed by sent_tokenize below —
# TODO confirm which of the two the installed NLTK version requires.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [54]:
# Build the training corpus: one list of lowercase tokens per sentence,
# gathered from every regular file in the local 'data' directory.
story = []
for filename in os.listdir('data'):
    path = os.path.join('data', filename)
    # Skip Jupyter's checkpoint folder and any other non-file entry: open() on
    # a directory raises. The previous check used `pass`, which skipped nothing
    # and fell through to open().
    if filename == '.ipynb_checkpoints' or not os.path.isfile(path):
        continue
    # `with` closes the file handle as soon as its text has been read.
    with open(path) as f:
        corpus = f.read()
    # Split into sentences first, then tokenise each sentence separately so
    # that Word2Vec context windows do not cross sentence boundaries.
    raw_sent = sent_tokenize(corpus)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [55]:
# Preview only the first few tokenised sentences; the full list holds tens of
# thousands of sentences and would flood the notebook output.
story[:5]

[['clash',
  'of',
  'kings',
  'book',
  'two',
  'of',
  'song',
  'of',
  'ice',
  'and',
  'fire',
  'by',
  'george',
  'martin',
  'prologue',
  'the',
  'comet',
  'tail',
  'spread',
  'across',
  'the',
  'dawn',
  'red',
  'slash',
  'that',
  'bled',
  'above',
  'the',
  'crags',
  'of',
  'dragonstone',
  'like',
  'wound',
  'in',
  'the',
  'pink',
  'and',
  'purple',
  'sky'],
 ['the',
  'maester',
  'stood',
  'on',
  'the',
  'windswept',
  'balcony',
  'outside',
  'his',
  'chambers'],
 ['it', 'was', 'here', 'the', 'ravens', 'came', 'after', 'long', 'flight'],
 ['their',
  'droppings',
  'speckled',
  'the',
  'gargoyles',
  'that',
  'rose',
  'twelve',
  'feet',
  'tall',
  'on',
  'either',
  'side',
  'of',
  'him',
  'hellhound',
  'and',
  'wyvern',
  'two',
  'of',
  'the',
  'thousand',
  'that',
  'brooded',
  'over',
  'the',
  'walls',
  'of',
  'the',
  'ancient',
  'fortress'],
 ['when',
  'first',
  'he',
  'came',
  'to',
  'dragonstone',
  'the',
  

In [56]:
# Number of tokenised sentences collected across all files.
len(story)

29892

In [57]:
# First sentence as a list of lowercase tokens.
story[0]

['clash',
 'of',
 'kings',
 'book',
 'two',
 'of',
 'song',
 'of',
 'ice',
 'and',
 'fire',
 'by',
 'george',
 'martin',
 'prologue',
 'the',
 'comet',
 'tail',
 'spread',
 'across',
 'the',
 'dawn',
 'red',
 'slash',
 'that',
 'bled',
 'above',
 'the',
 'crags',
 'of',
 'dragonstone',
 'like',
 'wound',
 'in',
 'the',
 'pink',
 'and',
 'purple',
 'sky']

In [58]:
# Untrained Word2Vec model. Only the context window size and the minimum word
# count are set here; every other hyper-parameter keeps gensim's default.
model = gensim.models.Word2Vec(min_count=2, window=10)

In [59]:
# Build the model's vocabulary from the tokenised sentences before training.
model.build_vocab(story)

In [60]:
# Train on the same sentences, reusing the corpus_count and epochs values
# already stored on the model.
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(1180426, 1571640)

In [61]:
# Normalised word vectors as a 2-D float32 array (see the output of the next
# cell); presumably one row per vocabulary word — TODO confirm.
vec = model.wv.get_normed_vectors()

In [62]:
# Display the vector matrix (NumPy abbreviates large arrays in its repr).
vec

array([[-0.11935954,  0.02328097,  0.03149038, ..., -0.05529338,
        -0.02647626,  0.06762391],
       [-0.20523356,  0.1473702 ,  0.06811254, ..., -0.09023628,
         0.03168486,  0.1468366 ],
       [-0.01519029,  0.15379545, -0.03380812, ..., -0.12345947,
         0.14173704, -0.11913688],
       ...,
       [-0.08630586,  0.03803461,  0.11513049, ..., -0.1083131 ,
         0.13052364,  0.09394439],
       [-0.0479293 ,  0.06422524, -0.00270144, ..., -0.04190529,
         0.01168695,  0.08724482],
       [-0.13328035,  0.11068989, -0.00173706, ..., -0.07380148,
         0.06556414, -0.00718545]], dtype=float32)

In [None]:
# Number of components in a single word vector.
len(vec[0])

100

In [65]:
# Closest words to 'daenerys', each with its similarity score.
# NOTE(review): every neighbour in the recorded output scores above 0.99 and
# none looks related to the query — the embeddings may be under-trained;
# consider more epochs or a larger corpus — TODO confirm.
model.wv.most_similar('daenerys')

[('serving', 0.996230959892273),
 ('beer', 0.9959031343460083),
 ('snorted', 0.9956720471382141),
 ('cook', 0.9955350160598755),
 ('grows', 0.9955131411552429),
 ('prayer', 0.9952750205993652),
 ('boar', 0.9951137900352478),
 ('calling', 0.9949648380279541),
 ('offered', 0.9949499368667603),
 ('soup', 0.9947935342788696)]

In [66]:
# Similarity score between the vectors for 'arya' and 'sansa'.
model.wv.similarity('arya','sansa')

np.float32(0.93681353)

In [67]:

# Vocabulary words as a list, used further down to colour the points of the
# 3-D plot; presumably in the same order as the rows of the vector matrix —
# TODO confirm.
y = model.wv.index_to_key

In [68]:
# NOTE(review): this prints the entire vocabulary list; consider showing a
# short slice (e.g. y[:20]) to keep the notebook output readable.
y

['the',
 'and',
 'to',
 'of',
 'he',
 'his',
 'was',
 'you',
 'it',
 'in',
 'her',
 'had',
 'she',
 'as',
 'that',
 'with',
 'him',
 'but',
 'not',
 'they',
 'for',
 'on',
 'my',
 'at',
 'have',
 'lord',
 'is',
 'be',
 'said',
 'no',
 'them',
 'me',
 'from',
 'would',
 'were',
 'when',
 'one',
 'ser',
 'your',
 'all',
 'if',
 'so',
 'will',
 'their',
 'could',
 'man',
 'we',
 'there',
 'are',
 'king',
 'out',
 'what',
 'up',
 'this',
 'men',
 'do',
 'did',
 'by',
 'been',
 'tyrion',
 'more',
 'back',
 'than',
 'well',
 'who',
 'or',
 'page',
 'like',
 'only',
 'down',
 'see',
 'never',
 'off',
 'll',
 'even',
 'into',
 'black',
 'now',
 'hand',
 'before',
 'father',
 'thought',
 'old',
 'an',
 'too',
 'jon',
 'made',
 'know',
 'through',
 'some',
 'arya',
 'told',
 'lady',
 'brother',
 'long',
 'how',
 'come',
 'bran',
 'time',
 'eyes',
 'theon',
 'way',
 'where',
 'over',
 'here',
 'has',
 'must',
 'can',
 'stannis',
 'red',
 'then',
 'us',
 'face',
 'boy',
 'head',
 'might',
 'great'

In [69]:
from sklearn.decomposition import PCA

In [70]:
# Reduce to three components so the word vectors can be drawn in a 3-D scatter plot.
pca = PCA(n_components=3)

In [71]:
# Fit PCA on the normalised word vectors and project them; each row of X has
# three components (checked with len(X[0]) below).
X = pca.fit_transform(model.wv.get_normed_vectors())

In [74]:
# Display the projected vectors (NumPy abbreviates large arrays in its repr).
X

array([[-0.41012332,  0.11348319,  0.3009935 ],
       [-0.4254816 , -0.06035334,  0.21923763],
       [ 0.4264154 ,  0.06451273,  0.04941046],
       ...,
       [-0.3153697 ,  0.10264343,  0.16017336],
       [-0.17144746, -0.09819275,  0.2846805 ],
       [ 0.03280326, -0.09030378, -0.02536732]], dtype=float32)

In [75]:
# Confirm that each projected vector has three components.
len(X[0])

3

In [76]:
import plotly.express as px

# Plot rows 200-299 of the projected vectors in 3-D, one axis per PCA
# component, with each point coloured by the word at the same position in `y`.
fig = px.scatter_3d(
    X[200:300],
    x=0,
    y=1,
    z=2,
    color=y[200:300],
)
fig.show()