# Word Embeddings

This notebook contains no exercises.
It's just for playing with a static embedding! 
We use 200-dimensional GloVe vectors trained on Wikipedia and Gigaword (the `glove-wiki-gigaword-200` model).

In [ ]:
!pip install gensim

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression

import gensim.downloader as api

In [None]:
# Loading GloVe Embedding
def load_embedding_model():
    """ Load GloVe Vectors

        Fetches the "glove-wiki-gigaword-200" model through gensim's
        downloader API and prints the size of its vocabulary.

        Return:
            wv_from_bin: All 400000 embeddings, each length 200
    """
    wv_from_bin = api.load("glove-wiki-gigaword-200")
    # index_to_key can be measured directly; copying the whole vocabulary
    # into a fresh list just to count it is unnecessary.
    print("Loaded vocab size %i" % len(wv_from_bin.index_to_key))
    return wv_from_bin

# Load the vectors once at notebook level; the cells below reuse `wv_from_bin`.
wv_from_bin = load_embedding_model()

In [None]:
# Ask the embedding for the words it considers most similar to "like";
# as the last expression of the cell, the result is displayed as its output.
wv_from_bin.most_similar("like")

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def plot_word_embeddings(words, embeddings):
    """Project word vectors to 2D with PCA and draw a labelled scatter plot.

    Words that are not contained in ``embeddings`` are skipped.

    Args:
        words: Iterable of word strings to plot.
        embeddings: Object supporting ``word in embeddings`` and
            ``embeddings[word]`` (e.g. the loaded GloVe vectors).

    Raises:
        ValueError: If fewer than two of the given words are in
            ``embeddings``; a 2-component PCA cannot be fitted then.
    """
    # Filter once, up front, so that row i of the PCA output always belongs
    # to known_words[i]. Indexing the PCA rows by the position in the
    # unfiltered list would attach labels to the wrong points (or run past
    # the last row) as soon as one word is out of vocabulary.
    known_words = [word for word in words if word in embeddings]
    if len(known_words) < 2:
        raise ValueError(
            "Need at least two in-vocabulary words to plot, got %d"
            % len(known_words)
        )

    word_vectors = [embeddings[word] for word in known_words]
    pca = PCA(n_components=2)
    word_vectors_2d = pca.fit_transform(word_vectors)

    plt.figure(figsize=(10, 10))
    for (x, y), word in zip(word_vectors_2d, known_words):
        plt.scatter(x, y)
        # Small offset so the label does not sit on top of the marker
        plt.text(x + 0.01, y + 0.01, word, fontsize=9)
    plt.show()

# Example usage
# Try more words such as baby, duchess, grape.
words = [
    "king", "queen",
    "man", "woman",
    "apple", "orange",
]
plot_word_embeddings(words=words, embeddings=wv_from_bin)

# Using Word Embeddings for Classification

In [None]:
# Access the pre-trained GloVe embeddings
def load_glove_embeddings():
    """Return the GloVe vectors stored in the notebook-level ``wv_from_bin``.

    Nothing is loaded here: the function only hands back the existing
    global, so ``wv_from_bin`` must already be defined when it is called.
    """
    return wv_from_bin

# Convert text to average embedding
def text_to_embedding(text, embeddings, embedding_dim=200):
    """Represent a text as the centroid of its word vectors.

    The text is split on whitespace as-is (no lowercasing or punctuation
    handling); tokens that are not in ``embeddings`` are ignored.

    Args:
        text: Input string.
        embeddings: Object supporting ``word in embeddings`` and
            ``embeddings[word]``.
        embedding_dim: Length of the zero vector returned when none of the
            tokens is found in ``embeddings``.

    Returns:
        The element-wise mean of the found word vectors, or
        ``np.zeros(embedding_dim)`` if no token was found.
    """
    word_vectors = [embeddings[word] for word in text.split() if word in embeddings]
    if not word_vectors:
        # No known token: fall back to a zero vector so callers still get
        # a fixed-length feature vector.
        return np.zeros(embedding_dim)
    return np.mean(word_vectors, axis=0)  # build the centroid

# Toy sentiment dataset: one label per sentence (1 = positive, 0 = negative)
texts = [
    "i love this movie",
    "this movie is terrible",
    "what a fantastic film",
    "i did not enjoy this movie",
]
labels = [1, 0, 1, 0]

# Fetch the GloVe vectors used to embed the sentences
embeddings = load_glove_embeddings()

# Feature matrix: one averaged word embedding per sentence
X = np.array([text_to_embedding(sentence, embeddings) for sentence in texts])

# Fit a classifier (here: logistic regression) on the sentence embeddings
clf = LogisticRegression().fit(X, labels)

# Classify a new sentence
new_text = "i love this movie"
new_embedding = text_to_embedding(new_text, embeddings)
prediction = clf.predict([new_embedding])
print("Predicted label:", prediction)