# **Homework 11 - Word Embedding**
# KDD Tuesdays 12:30 PM - 2:45 PM
## Jake Brulato

In [1]:
import io
import re
import string
import tqdm
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

2024-04-09 11:18:31.119797: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# **The Sentence we will be embedding**

In [2]:
sentence = 'I love deep learning and text mining'
tokens = list(sentence.lower().split())
print(len(tokens))

7


# **Create Vocab Index**

In [3]:
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'i': 1, 'love': 2, 'deep': 3, 'learning': 4, 'and': 5, 'text': 6, 'mining': 7}


# **Inverse Vocab Index**

In [4]:
inverse_vocab = {index: token for token, index in vocab.items()}
print(inverse_vocab)

{0: '<pad>', 1: 'i', 2: 'love', 3: 'deep', 4: 'learning', 5: 'and', 6: 'text', 7: 'mining'}


# **Sequence of the sentence**

In [5]:
example_sequence = [vocab[word] for word in tokens]
print(example_sequence)

[1, 2, 3, 4, 5, 6, 7]


# **Download Glove**

In [10]:
import os

# Define the URL and file path
url = 'http://nlp.stanford.edu/data/glove.6B.zip'
file_path = 'glove.6B.zip'

# Download the file
os.system(f'wget {url} -O {file_path}')

# Unzip the file
os.system(f'unzip {file_path}')


sh: wget: command not found


Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


0

# **Create Word list from Glove.6B.50D**

In [11]:
word_list = []
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as file:
    for line in file:
        word = line.split()[0]
        word_list.append(word)


# **Define Dimentions and embedding for the word with Tensor**

In [12]:
embedding_index = {}
with open('glove.6B.50d.txt', encoding = 'utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# **Embeddings for the word text and deep**

In [13]:
embedding_index['deep']

array([ 0.31445 ,  1.2024  ,  0.066651, -0.20096 , -0.049636,  0.66882 ,
       -0.049386,  0.44174 ,  0.1799  , -0.10196 , -0.43674 ,  0.12076 ,
       -0.12495 ,  0.43378 , -0.87784 ,  0.010281,  0.54592 , -0.28928 ,
       -0.46115 , -0.32058 , -0.69094 ,  0.49733 ,  0.40657 , -0.90062 ,
        0.69699 , -1.1536  , -0.12229 ,  1.0657  ,  0.93207 ,  0.20439 ,
        3.3004  ,  0.14223 ,  0.46493 ,  0.075359, -0.56755 ,  0.30769 ,
       -1.1251  , -0.37871 ,  0.57479 , -0.12629 ,  0.13589 ,  0.10633 ,
        0.058432,  0.40321 ,  0.10243 ,  0.12004 ,  0.41383 ,  0.051987,
       -0.5835  , -1.1159  ], dtype=float32)

In [14]:
embedding_index['text']

array([ 0.32615  ,  0.36686  , -0.0074905, -0.37553  ,  0.66715  ,
        0.21646  , -0.19801  , -1.1001   , -0.42221  ,  0.10574  ,
       -0.31292  ,  0.50953  ,  0.55775  ,  0.12019  ,  0.31441  ,
       -0.25043  , -1.0637   , -1.3213   ,  0.87798  , -0.24627  ,
        0.27379  , -0.51092  ,  0.49324  ,  0.52243  ,  1.1636   ,
       -0.75323  , -0.48053  , -0.11259  , -0.54595  , -0.83921  ,
        2.9825   , -1.1916   , -0.51958  , -0.39365  , -0.1419   ,
       -0.026977 ,  0.66296  ,  0.16574  , -1.1681   ,  0.14443  ,
        1.6305   , -0.17216  , -0.17436  , -0.01049  , -0.17794  ,
        0.93076  ,  1.0381   ,  0.94266  , -0.14805  , -0.61109  ],
      dtype=float32)

# **Embedding Matrix**

In [15]:
embedding_matrix = np.zeros((vocab_size+1, 50))
for word, i in inverse_vocab.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# **Consolidated Embeddings for the word text and deep**

In [16]:
def load_glove_embeddings(file_path):
    """
    Load GloVe embeddings from a file into a dictionary.

    Args:
    - file_path: Path to the GloVe embeddings file.

    Returns:
    - A dictionary where keys are words and values are embeddings.
    """
    embeddings_dict = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = [float(val) for val in values[1:]]
            embeddings_dict[word] = vector
    return embeddings_dict

def get_embeddings_for_sentence(sentence, embeddings_dict):
    """
    Get embeddings for each word in a sentence.

    Args:
    - sentence: A string containing multiple words.
    - embeddings_dict: A dictionary of word embeddings.

    Returns:
    - A dictionary of embeddings for the sentence's words.
    """
    sentence_embeddings = {}
    for word in sentence.split():
        # Here, we convert words to lowercase to match the GloVe's casing
        word_embedding = embeddings_dict.get(word.lower())
        if word_embedding is not None:
            sentence_embeddings[word] = word_embedding
    return sentence_embeddings

# Path to the GloVe embeddings file (adjust as necessary)
glove_file_path = 'glove.6B.50d.txt'

# Load the GloVe embeddings
embeddings_dict = load_glove_embeddings(glove_file_path)

# Sentence for which to get embeddings
sentence = "I love deep learning and text mining"

# Get embeddings for the sentence
sentence_embeddings = get_embeddings_for_sentence(sentence, embeddings_dict)

# For demonstration, let's print the embeddings for "deep" and "text"
print("Embedding for 'deep':", sentence_embeddings.get("deep"))
print("# of Dimensions for 'deep':", len(sentence_embeddings.get("deep")))
print("Embedding for 'text':", sentence_embeddings.get("text"))
print("# of Dimensions for 'text':", len(sentence_embeddings.get("text")))


Embedding for 'deep': [0.31445, 1.2024, 0.066651, -0.20096, -0.049636, 0.66882, -0.049386, 0.44174, 0.1799, -0.10196, -0.43674, 0.12076, -0.12495, 0.43378, -0.87784, 0.010281, 0.54592, -0.28928, -0.46115, -0.32058, -0.69094, 0.49733, 0.40657, -0.90062, 0.69699, -1.1536, -0.12229, 1.0657, 0.93207, 0.20439, 3.3004, 0.14223, 0.46493, 0.075359, -0.56755, 0.30769, -1.1251, -0.37871, 0.57479, -0.12629, 0.13589, 0.10633, 0.058432, 0.40321, 0.10243, 0.12004, 0.41383, 0.051987, -0.5835, -1.1159]
# of Dimensions for 'deep': 50
Embedding for 'text': [0.32615, 0.36686, -0.0074905, -0.37553, 0.66715, 0.21646, -0.19801, -1.1001, -0.42221, 0.10574, -0.31292, 0.50953, 0.55775, 0.12019, 0.31441, -0.25043, -1.0637, -1.3213, 0.87798, -0.24627, 0.27379, -0.51092, 0.49324, 0.52243, 1.1636, -0.75323, -0.48053, -0.11259, -0.54595, -0.83921, 2.9825, -1.1916, -0.51958, -0.39365, -0.1419, -0.026977, 0.66296, 0.16574, -1.1681, 0.14443, 1.6305, -0.17216, -0.17436, -0.01049, -0.17794, 0.93076, 1.0381, 0.94266, -0.