### One Hot Encoding

In [1]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# Sample categorical data
data = np.array([['Red'], ['Green'], ['Blue'], ['Red'], ['Blue']])
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(data)

In [2]:
encoded_data

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

# Bag of Words (BoW) Example

In [3]:
import pandas as pd

In [4]:
df = pd.DataFrame({
    "text":["people watch dswithbappy",
                         "dswithbappy watch dswithbappy",
                         "people write comment",
                          "dswithbappy write comment"],"output":[1,1,0,0]
})

In [5]:
df

Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
bow = cv.fit_transform(df['text'])

In [7]:
bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 11 stored elements and shape (4, 5)>

In [8]:
bow.toarray()

array([[0, 1, 1, 1, 0],
       [0, 2, 0, 1, 0],
       [1, 0, 1, 0, 1],
       [1, 1, 0, 0, 1]])

In [9]:
#vocabulary
print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'dswithbappy': 1, 'write': 4, 'comment': 0}


In [10]:
# Show feature names (the actual words in the vocabulary)
print("\nFeature Names (columns):")
print(cv.get_feature_names_out())


Feature Names (columns):
['comment' 'dswithbappy' 'people' 'watch' 'write']


# N grams

In [11]:
df = pd.DataFrame({"text":["people watch dswithbappy",
                         "dswithbappy watch dswithbappy",
                         "people write comment",
                          "dswithbappy write comment"],"output":[1,1,0,0]})
df

Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [12]:
# BI grams
cv = CountVectorizer(ngram_range=(2, 2))

In [13]:
bow = cv.fit_transform(df['text'])

In [14]:
bow.toarray()

array([[0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 0, 1]])

In [15]:
#vocabulary
print(cv.vocabulary_)

{'people watch': 2, 'watch dswithbappy': 4, 'dswithbappy watch': 0, 'people write': 3, 'write comment': 5, 'dswithbappy write': 1}


In [16]:
#Tri grams
cv = CountVectorizer(ngram_range=(3, 3))
bow = cv.fit_transform(df['text'])

In [17]:
bow.toarray()

array([[0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0]])

In [18]:
print(cv.vocabulary_)

{'people watch dswithbappy': 2, 'dswithbappy watch dswithbappy': 0, 'people write comment': 3, 'dswithbappy write comment': 1}


# TF-IDF (Term frequency- Inverse document frequency

In [19]:
f = pd.DataFrame({"text":["people watch dswithbappy",
                         "dswithbappy watch dswithbappy",
                         "people write comment",
                          "dswithbappy write comment"],"output":[1,1,0,0]})

df

Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()


In [21]:
arr = tfidf_vectorizer.fit_transform(df['text']).toarray()

In [22]:
arr

array([[0.        , 0.49681612, 0.61366674, 0.61366674, 0.        ],
       [0.        , 0.8508161 , 0.        , 0.52546357, 0.        ],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027],
       [0.61366674, 0.49681612, 0.        , 0.        , 0.61366674]])

# Word2vec 
### is a popular word embedding technique that represents words as dense vectors in a continuous vector space. It captures semantic relationships between words based on their co-occurrence in a large corpus of text. The two main architectures for Word2vec are Continuous Bag of Words (CBOW) and Skip-gram. CBOW predicts a target word based on its context, while Skip-gram predicts the context given a target word. Word2vec is widely used in natural language processing tasks to capture the meaning and relationships between words.

In [23]:
import pandas as pd 
import numpy as np
import gensim
import os

In [24]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/al-amen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/al-
[nltk_data]     amen/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [25]:
story = []
for filename in os.listdir('../Day-4'):
    if filename.endswith('.ipynb'):
        pass
    f = open(os.path.join('../Day-4', filename))
    corpus = f.read()
    raw_sent = sent_tokenize(corpus)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [26]:
story

[['cells',
  'cell_type',
  'markdown',
  'id',
  'metadata',
  'source',
  'one',
  'hot',
  'encoding',
  'cell_type',
  'code',
  'execution_count',
  'id',
  'cd',
  'metadata',
  'outputs',
  'source',
  'from',
  'sklearn',
  'preprocessing',
  'import',
  'onehotencoder',
  'import',
  'numpy',
  'as',
  'np',
  'sample',
  'categorical',
  'data',
  'data',
  'np',
  'array',
  'red',
  'green',
  'blue',
  'red',
  'blue',
  'encoder',
  'onehotencoder',
  'sparse_output',
  'false',
  'encoded_data',
  'encoder',
  'fit_transform',
  'data',
  'cell_type',
  'code',
  'execution_count',
  'id',
  'ba',
  'metadata',
  'outputs',
  'data',
  'text',
  'plain',
  'array'],
 [],
 [],
 [],
 [],
 ['execution_count',
  'metadata',
  'output_type',
  'execute_result',
  'source',
  'encoded_data',
  'cell_type',
  'markdown',
  'id',
  'metadata',
  'source',
  'bag',
  'of',
  'words',
  'bow',
  'example',
  'cell_type',
  'code',
  'execution_count',
  'id',
  'metadata',
  'outp

In [27]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [28]:
model.build_vocab(story)

In [29]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(155848, 219945)

In [30]:
vec = model.wv.get_normed_vectors()

In [31]:
vec

array([[-0.04260727, -0.05766252,  0.08734559, ..., -0.03335273,
         0.11317313,  0.12731355],
       [-0.04219827, -0.06017147,  0.08737712, ..., -0.03184279,
         0.11258925,  0.12683156],
       [-0.04103752, -0.06084938,  0.0843252 , ..., -0.03074553,
         0.11848979,  0.12248601],
       ...,
       [-0.02792632, -0.07426669,  0.09866313, ..., -0.04257097,
         0.12510543,  0.10861216],
       [-0.05139809, -0.06942057,  0.10862347, ..., -0.0436427 ,
         0.09833349,  0.11702304],
       [ 0.02690065,  0.00785755,  0.13945058, ..., -0.10432741,
        -0.05028754,  0.00875823]], shape=(2618, 100), dtype=float32)

In [32]:
len(vec[0])

100

In [33]:
model.wv.most_similar('daenerys')

[('suddenly', 0.9907044172286987),
 ('nltk', 0.9902955889701843),
 ('show', 0.9900597333908081),
 ('name', 0.9898911118507385),
 ('vectorizer', 0.9898048639297485),
 ('story', 0.9897287487983704),
 ('import', 0.9897080659866333),
 ('words', 0.9896770119667053),
 ('red', 0.9896169900894165),
 ('might', 0.9896060228347778)]

In [34]:
model.wv.similarity('arya','sansa')

np.float32(0.9997929)

In [35]:
model.wv.index_to_key

['the',
 'and',
 'to',
 'he',
 'of',
 'his',
 'was',
 'her',
 'you',
 'it',
 'in',
 'that',
 'she',
 'had',
 'him',
 'as',
 'with',
 'said',
 'not',
 'at',
 'for',
 'on',
 'is',
 'but',
 'they',
 'from',
 'my',
 'be',
 'lord',
 'have',
 'your',
 'no',
 'would',
 'all',
 'were',
 'arya',
 'so',
 'them',
 'me',
 'when',
 'there',
 'jon',
 'are',
 'bran',
 'could',
 'ned',
 'sansa',
 'will',
 'this',
 'ser',
 'what',
 'up',
 'now',
 'if',
 'man',
 'one',
 'king',
 'out',
 'like',
 'we',
 'hand',
 'eyes',
 'an',
 'tyrion',
 'here',
 'back',
 'been',
 'boy',
 'their',
 'do',
 'only',
 'page',
 'see',
 'down',
 'or',
 'old',
 'did',
 'even',
 'looked',
 'told',
 'lady',
 'than',
 'who',
 'men',
 'father',
 'know',
 'never',
 'before',
 'then',
 'can',
 'by',
 'too',
 'metadata',
 'into',
 'black',
 'where',
 'time',
 'lannister',
 'face',
 'off',
 'stark',
 'away',
 'littlefinger',
 'brother',
 'maester',
 'watch',
 'execution_count',
 'thought',
 'over',
 'night',
 'long',
 'dany',
 'sword'

In [36]:
from sklearn.decomposition import PCA


In [37]:
pca = PCA(n_components=3)


In [38]:
X = pca.fit_transform(model.wv.get_normed_vectors())

In [39]:
X

array([[-0.04292136,  0.02947721,  0.00150714],
       [-0.0443306 ,  0.01921606,  0.00251877],
       [-0.04452515,  0.02356403, -0.00077686],
       ...,
       [-0.03975189, -0.01367018, -0.00722751],
       [-0.03964704, -0.08055898, -0.01563994],
       [ 0.31942618, -0.303855  ,  0.00563314]],
      shape=(2618, 3), dtype=float32)

In [40]:
X

array([[-0.04292136,  0.02947721,  0.00150714],
       [-0.0443306 ,  0.01921606,  0.00251877],
       [-0.04452515,  0.02356403, -0.00077686],
       ...,
       [-0.03975189, -0.01367018, -0.00722751],
       [-0.03964704, -0.08055898, -0.01563994],
       [ 0.31942618, -0.303855  ,  0.00563314]],
      shape=(2618, 3), dtype=float32)

In [41]:
! pip install plotly



In [42]:
import plotly.express as px
fig = px.scatter_3d(x=X[:,0], y=X[:,1], z=X[:,2], text=model.wv.index_to_key)
fig.show()

In [43]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

# Download NLTK resource
nltk.download('punkt')
nltk.download('stopwords')

# Sample text (can be read from a file too)



[nltk_data] Downloading package punkt to /Users/al-amen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/al-
[nltk_data]     amen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
docs = [
    "John is working on Natural Language Processing.",
    "He was running and eating at the same time.",
    "His friends were working on deep learning models."
]

In [45]:
# Preprocessing function

def preprocess(text):
    # Lower case
    text = text.lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove punctuation and stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    # stemming 
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

# Preprocess the documents
cleaned_docs = [preprocess(doc) for doc in docs]

cleaned_docs


['john work natur languag process',
 'run eat time',
 'friend work deep learn model']

In [46]:
# Create Bag of words
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(cleaned_docs)

In [47]:
# Show vocabulary
print("Vocabulary:", vectorizer.vocabulary_)

Vocabulary: {'john': 3, 'work': 11, 'natur': 7, 'languag': 4, 'process': 8, 'run': 9, 'eat': 1, 'time': 10, 'friend': 2, 'deep': 0, 'learn': 5, 'model': 6}


In [48]:
# Show Bow matrix
print("\nBag of Words Matrix:\n", X.toarray())


Bag of Words Matrix:
 [[0 0 0 1 1 0 0 1 1 0 0 1]
 [0 1 0 0 0 0 0 0 0 1 1 0]
 [1 0 1 0 0 1 1 0 0 0 0 1]]
