# One Hot Encoding

In [7]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Sample categorical Data
data = np.array([['red'], ['blue'], ['green'], ['red'], ['blue']])

# Initializa the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)  # sparse_output=False returns an array

# Fit and transform the data
encoded_data = encoder.fit_transform(data)

In [8]:
encoded_data

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

# Bag of Words

In [9]:
import numpy as np
import pandas as pd

In [10]:
df = pd.DataFrame({"text":["audience follow celebrity",
                         "celebrity follow celebrity",
                         "audience share photo",
                          "celebrity share photo"],"output":[1,1,0,0]})
df

Unnamed: 0,text,output
0,audience follow celebrity,1
1,celebrity follow celebrity,1
2,audience share photo,0
3,celebrity share photo,0


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [13]:
bow = cv.fit_transform(df['text'])
bow.toarray()

array([[1, 1, 1, 0, 0],
       [0, 2, 1, 0, 0],
       [1, 0, 0, 1, 1],
       [0, 1, 0, 1, 1]])

In [14]:
#vocabulary
print(cv.vocabulary_)

{'audience': 0, 'follow': 2, 'celebrity': 1, 'share': 4, 'photo': 3}


# N-grams

In [15]:
df = pd.DataFrame({"text":["audience follow celebrity",
                         "celebrity follow celebrity",
                         "audience share photo",
                          "celebrity share photo"],"output":[1,1,0,0]})
df

Unnamed: 0,text,output
0,audience follow celebrity,1
1,celebrity follow celebrity,1
2,audience share photo,0
3,celebrity share photo,0


In [16]:
# BI grams
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2,2))

In [19]:
bow = cv.fit_transform(df['text'])
bow.toarray()

array([[1, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 1, 0],
       [0, 1, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 1]])

In [20]:
print(cv.vocabulary_)

{'audience follow': 0, 'follow celebrity': 4, 'celebrity follow': 2, 'audience share': 1, 'share photo': 5, 'celebrity share': 3}


# TF-IDF (Term Frequency- Inverse Document Frequency)

In [21]:
df = pd.DataFrame({"text":["audience follow celebrity",
                         "celebrity follow celebrity",
                         "audience share photo",
                          "celebrity share photo"],"output":[1,1,0,0]})
df

Unnamed: 0,text,output
0,audience follow celebrity,1
1,celebrity follow celebrity,1
2,audience share photo,0
3,celebrity share photo,0


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid= TfidfVectorizer()

In [24]:
arr = tfid.fit_transform(df['text']).toarray()
arr

array([[0.61366674, 0.49681612, 0.61366674, 0.        , 0.        ],
       [0.        , 0.8508161 , 0.52546357, 0.        , 0.        ],
       [0.57735027, 0.        , 0.        , 0.57735027, 0.57735027],
       [0.        , 0.49681612, 0.        , 0.61366674, 0.61366674]])

In [26]:
print(tfid.idf_)

[1.51082562 1.22314355 1.51082562 1.51082562 1.51082562]


# Word2Vec

In [29]:
!pip install --upgrade gensim --user

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m482.4 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━

In [12]:
import numpy as np
import pandas as pd
import gensim
import os

In [13]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [15]:
story = []

for filename in os.listdir('data'):
    filepath = os.path.join('data', filename)

    if os.path.isdir(filepath):
        continue  # Skip directories

    with open(filepath, 'r', encoding='utf-8') as f:
        corpus = f.read()

    raw_sent = sent_tokenize(corpus)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [16]:
story

[['game',
  'of',
  'thrones',
  'book',
  'one',
  'of',
  'song',
  'of',
  'ice',
  'and',
  'fire',
  'by',
  'george',
  'martin',
  'prologue',
  'we',
  'should',
  'start',
  'back',
  'gared',
  'urged',
  'as',
  'the',
  'woods',
  'began',
  'to',
  'grow',
  'dark',
  'around',
  'them'],
 ['the', 'wildlings', 'are', 'dead'],
 ['do', 'the', 'dead', 'frighten', 'you'],
 ['ser',
  'waymar',
  'royce',
  'asked',
  'with',
  'just',
  'the',
  'hint',
  'of',
  'smile'],
 ['gared', 'did', 'not', 'rise', 'to', 'the', 'bait'],
 ['he',
  'was',
  'an',
  'old',
  'man',
  'past',
  'fifty',
  'and',
  'he',
  'had',
  'seen',
  'the',
  'lordlings',
  'come',
  'and',
  'go'],
 ['dead', 'is', 'dead', 'he', 'said'],
 ['we', 'have', 'no', 'business', 'with', 'the', 'dead'],
 ['are', 'they', 'dead'],
 ['royce', 'asked', 'softly'],
 ['what', 'proof', 'have', 'we'],
 ['will', 'saw', 'them', 'gared', 'said'],
 ['if',
  'he',
  'says',
  'they',
  'are',
  'dead',
  'that',
  'proof',


In [17]:
len(story)

27244

In [18]:
story[0]

['game',
 'of',
 'thrones',
 'book',
 'one',
 'of',
 'song',
 'of',
 'ice',
 'and',
 'fire',
 'by',
 'george',
 'martin',
 'prologue',
 'we',
 'should',
 'start',
 'back',
 'gared',
 'urged',
 'as',
 'the',
 'woods',
 'began',
 'to',
 'grow',
 'dark',
 'around',
 'them']

In [19]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [20]:
model.build_vocab(story)

In [21]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(1059571, 1423500)

In [22]:
vec = model.wv.get_normed_vectors()

In [23]:
vec

array([[-0.08643934,  0.06994653, -0.01253685, ..., -0.19151807,
        -0.02535286,  0.08938872],
       [-0.06188913,  0.0939763 , -0.03459288, ..., -0.21601227,
         0.05195586,  0.02350505],
       [-0.17459165,  0.00029774,  0.07467104, ..., -0.13535239,
         0.08590326, -0.20146635],
       ...,
       [-0.09065288, -0.03642027,  0.07845945, ..., -0.15467349,
         0.06064761, -0.11165001],
       [-0.03120412,  0.03349888,  0.00942222, ..., -0.17547892,
         0.01565282, -0.05039285],
       [ 0.01494384,  0.10297679, -0.02562408, ..., -0.1784334 ,
         0.05675905, -0.03599986]], dtype=float32)

In [25]:
len(vec[0])

100

In [26]:
model.wv.most_similar('daenerys')

[('pie', 0.9965146780014038),
 ('twice', 0.9959302544593811),
 ('tarly', 0.9958934783935547),
 ('making', 0.9957940578460693),
 ('supper', 0.9956756234169006),
 ('samwell', 0.9956434965133667),
 ('butcher', 0.9954516291618347),
 ('moreo', 0.995277464389801),
 ('meal', 0.9952464699745178),
 ('marillion', 0.9952335357666016)]

In [28]:
model.wv.similarity('arya','sansa')

0.9764371

In [36]:
y = model.wv.index_to_key

Dimension Reduce

In [29]:
from sklearn.decomposition import PCA

In [30]:
# convert 3 Dimension
pca = PCA(n_components=3)

In [31]:
X = pca.fit_transform(model.wv.get_normed_vectors())

In [32]:
X

array([[ 0.38669533, -0.17390645,  0.3140823 ],
       [ 0.45622742, -0.03387862,  0.22372657],
       [-0.44555745, -0.1442858 ,  0.02286518],
       ...,
       [-0.15285607,  0.02245408, -0.07055733],
       [ 0.24794695,  0.0582065 ,  0.04359367],
       [ 0.38502485, -0.02907932,  0.00832066]], dtype=float32)

In [33]:
X[0]

array([ 0.38669533, -0.17390645,  0.3140823 ], dtype=float32)

In [34]:
len(X[0])

3

In [37]:
import plotly.express as px
fig = px.scatter_3d(X[200:300],x=0,y=1,z=2, color=y[200:300])
fig.show()