# Jupyter Notebook: Comparing Word Embeddings - CBOW, Skip-Gram, FastText

# 📘 Title
# Comparing Word Embedding Models: CBOW, Skip-Gram, and FastText

# In this notebook, we'll train and visualize three popular word embedding models using the Brown corpus:
# - Word2Vec with CBOW
# - Word2Vec with Skip-Gram
# - FastText

# We'll also use PCA to reduce the embeddings to 2D space for easy visualization.

# ------------------------------
# 📦 Import Dependencies and Load Data
# ------------------------------

import nltk
from nltk.corpus import brown
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from gensim.models import Word2Vec, FastText

# Make sure the Brown corpus is available: nltk.download() is asked for the
# 'brown' package before the corpus reader is used.
nltk.download('brown')
# Tokenized sentences of the Brown corpus; this object is passed unchanged to
# train_models() below as the training data for all three models.
# NOTE(review): presumably an iterable of lists of word strings with original
# casing preserved -- confirm against the NLTK corpus reader docs.
sentences = brown.sents()

# ------------------------------
# 🧠 Step 1: Train the Embedding Models
# ------------------------------

# We'll train:
# - A Word2Vec model using CBOW (sg=0)
# - A Word2Vec model using Skip-Gram (sg=1)
# - A FastText model, which includes subword information (sg is not passed, so it uses gensim's default CBOW-style training)

def train_models(sentences):
    """Train CBOW, Skip-Gram and FastText embeddings on the same sentences.

    All three models share the same hyperparameters (100-dimensional vectors,
    context window of 5, words seen fewer than 2 times are dropped); only the
    architecture differs.

    Args:
        sentences: Iterable of tokenized sentences, passed straight through
            to the gensim model constructors.

    Returns:
        A ``(cbow, skipgram, fasttext)`` tuple of trained models, in that
        order.
    """
    # One place for the settings every model has in common.
    shared_params = dict(vector_size=100, window=5, min_count=2)

    print("Training Word2Vec (CBOW)...")
    cbow_w2v = Word2Vec(sentences, sg=0, **shared_params)

    print("Training Word2Vec (Skip-Gram)...")
    skipgram_w2v = Word2Vec(sentences, sg=1, **shared_params)

    print("Training FastText...")
    subword_model = FastText(sentences, **shared_params)

    return cbow_w2v, skipgram_w2v, subword_model

# ------------------------------
# 📊 Step 2: Visualize Word Embeddings
# ------------------------------

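# We'll use PCA to reduce the 100-dimensional vectors to 2D, so we can plot them.
# This helps us get a sense of how the models learn relationships between words.
# Note: PCA is fit separately for each model, on the selected words only, so the
# axes are not comparable across plots -- compare relative positions within each plot.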

def visualize_embeddings(models, words, titles):
    """Plot a 2D PCA projection of selected word vectors, one panel per model.

    PCA is fit independently for every model, using only the vectors of the
    requested words, so coordinates are not comparable between panels.

    Args:
        models: Mapping of model name to an object that supports
            ``word in m`` and ``m[word]`` (e.g. the ``.wv`` of a gensim model).
        words: Words to plot. A word that a model does not contain is
            skipped for that model.
        titles: Sequence of panel titles, in the iteration order of
            ``models``. When it has fewer entries than ``models``, the
            model's name is used as the title instead.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    n_panels = len(models)
    # 5 inches per panel reproduces the original 15x5 figure for three models.
    plt.figure(figsize=(5 * max(n_panels, 1), 5))

    for i, (name, model) in enumerate(models.items()):
        # Keep only the words this model can actually look up.
        labels = [word for word in words if word in model]

        # Size the grid from the number of models instead of assuming three.
        plt.subplot(1, n_panels, i + 1)
        plt.title(titles[i] if i < len(titles) else name)
        plt.grid(True)

        # A 2-component PCA needs at least two samples; leave the panel empty
        # rather than letting scikit-learn raise a ValueError.
        if len(labels) < 2:
            print(f"Skipping {name}: fewer than two of the requested words "
                  "are in its vocabulary.")
            continue

        word_vectors = [model[word] for word in labels]
        reduced = PCA(n_components=2).fit_transform(word_vectors)
        xs, ys = reduced[:, 0], reduced[:, 1]

        plt.scatter(xs, ys, edgecolors='k')
        for label, x, y in zip(labels, xs, ys):
            plt.annotate(label, (x, y))

    plt.tight_layout()
    plt.show()

# ------------------------------
# 🚀 Step 3: Run Everything
# ------------------------------

# Now we’ll:
# - Train all three models
# - Pick some target words
# - Visualize how each model positions those words in vector space

# Words whose positions we want to compare across the three models
target_words = [
    'king', 'queen', 'man', 'woman',
    'dog', 'cat',
    'car', 'road', 'city', 'village',
]

# Plot titles, listed in the same order as the entries of `models` below
titles = ['Word2Vec CBOW', 'Word2Vec Skip-Gram', 'FastText']

# Train all three models on the Brown corpus sentences
cbow_model, skipgram_model, fasttext_model = train_models(sentences)

# Only the keyed vectors (.wv) are needed for lookup and plotting
models = {
    'CBOW': cbow_model.wv,
    'Skip-Gram': skipgram_model.wv,
    'FastText': fasttext_model.wv,
}

# Draw one PCA scatter plot per model
visualize_embeddings(models, target_words, titles)
