In [3]:
import numpy as np
from gensim.models import keyedvectors

In [4]:
# Read the pre-trained emoji2vec model, which is stored in binary word2vec format.
e2v = keyedvectors.load_word2vec_format('Embeddings/emoji2vec.bin', binary=True)

# How many distinct emojis/words does the model hold?
print(f"Vocabulary size: {len(e2v.key_to_index)}")

# Peek at the first ten vocabulary entries (iterating the mapping yields its keys).
print("First 10 items in the vocabulary:", list(e2v.key_to_index)[:10])

# Report the dimensionality of the embedding vectors.
print(f"Embedding dimensions: {e2v.vector_size}")

# Look up the embedding of the '😊' emoji and show it together with its shape.
emoji_vector = e2v.get_vector('😊')
print("Vector for 😊:", emoji_vector)
print(f"Shape of '😊' vector: {emoji_vector.shape}")

# Nearest neighbours of '😊' in the embedding space, limited to the best five.
similar_emojis = e2v.most_similar(positive='😊', topn=5)
print("Top 5 most similar items to 😊:", similar_emojis)


Vocabulary size: 1661
First 10 items in the vocabulary: ['🇸🇰', '👔', '🌀', '🚾', '👹', '🚻', '👬', '🇫🇯', '🎧', '🐽']
Embedding dimensions: 300
Vector for 😊: [ 0.06378812 -0.00069821  0.0665964   0.05222901 -0.07232177 -0.00143755
  0.0404273  -0.07334013  0.07281879  0.07685771  0.05053402  0.02341435
 -0.06657571  0.04560458 -0.07893492  0.08308128 -0.01853473  0.08186319
 -0.0172352  -0.07339713  0.0265768   0.07022744  0.05925312 -0.06853098
 -0.00535259 -0.08039748 -0.06379853 -0.01937037  0.07713106 -0.07434632
 -0.02150174 -0.00147013  0.05513644  0.01257754  0.0731611  -0.06669813
  0.08677094  0.06195601  0.01116412  0.03852443  0.01236091 -0.07303166
  0.06670061  0.01267638  0.07811568  0.0065436  -0.05699242  0.07136391
 -0.06265272  0.07253446 -0.08713867  0.06548899  0.06195318  0.01070736
  0.06117714  0.01332144  0.06608845 -0.06562892 -0.0249273   0.01848643
 -0.07909297 -0.05629047 -0.07730547  0.0644474   0.04511802  0.08655617
 -0.07438033 -0.01146777 -0.07839846  0.08235561

In [5]:
def load_glove_embeddings(file_path, expected_dim=None):
    """Load GloVe-style text embeddings into a ``{token: vector}`` dictionary.

    Each non-blank line of the file is expected to hold a token followed by
    its coefficients. The token is separated from the coefficients by a
    single ASCII space, so a token that is itself a (Unicode) whitespace
    character is kept as the key instead of being swallowed by a generic
    whitespace split, which would silently shift the first coefficient into
    the key position.

    Args:
        file_path: Path of the UTF-8 encoded embeddings text file.
        expected_dim: Optional number of coefficients per line. When given,
            exactly that many space-separated values are taken from the right
            end of every line and everything before them is the token (so
            tokens containing spaces are supported), and a line that does not
            fit raises ``ValueError``. When ``None`` (the default) no
            dimensionality check is performed.

    Returns:
        dict mapping each token (``str``) to a 1-D ``float32`` numpy array.
        If a token occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If ``expected_dim`` is smaller than 1, if a line does not
            match ``expected_dim``, or if a coefficient cannot be converted
            to a float.
        OSError: If the file cannot be opened.
    """
    if expected_dim is not None and expected_dim < 1:
        raise ValueError(f"expected_dim must be a positive integer, got {expected_dim!r}")

    embeddings_index = {}

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                # Blank line (e.g. a stray newline at the end of the file):
                # there is no token to index, so skip it instead of failing.
                continue
            line = line.rstrip()

            if expected_dim is not None:
                # Peel the coefficients off the right-hand side; whatever is
                # left is the token, even if it contains spaces itself.
                fields = line.rsplit(' ', expected_dim)
                if len(fields) != expected_dim + 1:
                    raise ValueError(
                        f"{file_path}: line {line_number} does not contain a token "
                        f"followed by {expected_dim} space-separated values"
                    )
                word, coefficients = fields[0], fields[1:]
            else:
                word, separator, remainder = line.partition(' ')
                if separator:
                    coefficients = remainder.split()
                else:
                    # No ASCII space on this line (e.g. a tab-delimited file):
                    # fall back to generic whitespace splitting.
                    word, *coefficients = line.split()

            embeddings_index[word] = np.asarray(coefficients, dtype='float32')

    print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")
    return embeddings_index

# Relative path to the GloVe Twitter vectors, resolved against the kernel's
# working directory (the file name suggests the 200-dimensional release).
file_path = 'Embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt'
# Read the whole file into an in-memory {word: vector} dictionary.
glove_embeddings = load_glove_embeddings(file_path)


Loaded 1193514 word vectors from GloVe.
