In [1]:
import nltk
from nltk.corpus import reuters
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
# Download the reuters dataset if not already present
nltk.download('reuters')
nltk.download('punkt')

[nltk_data] Downloading package reuters to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Prepare the corpus from the NLTK Reuters dataset
def preprocess_reuters():
    """
    Preprocess the Reuters dataset into tokenized sentences for training GloVe.
    Returns:
        sentences (list): A list of tokenized sentences.
    """
    sentences = []
    for file_id in reuters.fileids():
        words = nltk.word_tokenize(reuters.raw(file_id).lower())
        sentences.append(words)
    return sentences

In [4]:
# Function to train GloVe using Word2Vec (CBOW-based approximation)
def train_glove_model(sentences, vector_size=100, window_size=2, min_count=5, epochs=10):
    """
    Train a GloVe-like model using Word2Vec from Gensim.
    Parameters:
        sentences (list): Tokenized sentences from the corpus.
        vector_size (int): Dimensionality of the word embeddings.
        window_size (int): The context window size (default: 2).
        min_count (int): Minimum word frequency to include in the vocabulary.
        epochs (int): Number of training epochs.
    Returns:
        model (Word2Vec): The trained Word2Vec model.
        training_loss (list): List of losses after each epoch.
        training_time (float): Total training time in seconds.
    """
    model = Word2Vec(
        vector_size=vector_size,
        window=window_size,
        min_count=min_count,
        sg=0,  # CBOW model (default behavior of GloVe)
        compute_loss=True  # Enable loss computation
    )
    model.build_vocab(sentences)

    # Record the start time
    start_time = time.time()

    training_loss = []
    for epoch in range(epochs):
        print(f"Training epoch {epoch + 1}/{epochs}...")
        model.train(sentences, total_examples=model.corpus_count, epochs=1)
        loss = model.get_latest_training_loss()
        print(f"Epoch {epoch + 1}, Loss: {loss}")
        training_loss.append(loss)

    # Record the end time
    training_time = time.time() - start_time

    return model, training_loss, training_time

In [5]:
# Preprocess the Reuters dataset
sentences = preprocess_reuters()

In [6]:
# Train the GloVe-like model
glove_model, training_loss, training_time = train_glove_model(sentences, vector_size=100, window_size=2)

# Print training summary
print(f"Total Training Time: {training_time:.2f} seconds")
print(f"Training Loss Per Epoch: {training_loss}")

Training epoch 1/10...
Epoch 1, Loss: 0.0
Training epoch 2/10...
Epoch 2, Loss: 0.0
Training epoch 3/10...
Epoch 3, Loss: 0.0
Training epoch 4/10...
Epoch 4, Loss: 0.0
Training epoch 5/10...
Epoch 5, Loss: 0.0
Training epoch 6/10...
Epoch 6, Loss: 0.0
Training epoch 7/10...
Epoch 7, Loss: 0.0
Training epoch 8/10...
Epoch 8, Loss: 0.0
Training epoch 9/10...
Epoch 9, Loss: 0.0
Training epoch 10/10...
Epoch 10, Loss: 0.0
Total Training Time: 5.22 seconds
Training Loss Per Epoch: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [7]:

from nltk.tokenize import word_tokenize
from collections import Counter



# Extract and preprocess the corpus
def get_corpus():
    """
    Extract tokenized words from the Reuters corpus and preprocess them.
    Returns:
        tokenized_words (list): List of all words in the corpus.
    """
    tokenized_words = []
    for file_id in reuters.fileids():
        words = word_tokenize(reuters.raw(file_id))
        tokenized_words.extend(words)
    return tokenized_words

# Identify nouns in the corpus
def identify_nouns(words):
    """
    Identify proper nouns and common nouns from the word list.
    Parameters:
        words (list): List of words.
    Returns:
        nouns (list): List of potential nouns (words starting with uppercase).
    """
    nouns = [word for word in words if word.istitle()]  # Filter capitalized words
    return nouns

# Get the corpus
corpus_words = get_corpus()

# Count and print a sample of nouns
nouns = identify_nouns(corpus_words)
nouns_count = Counter(nouns)

# Print the top 50 most common nouns
print("Top 50 nouns in the corpus:")
print(nouns_count.most_common(50))


Top 50 nouns in the corpus:
[('The', 10967), ('U.S.', 4919), ('Net', 3378), ('Shr', 3201), ('Inc', 2663), ('Corp', 2336), ('Revs', 2283), ('It', 2059), ('April', 1902), ('March', 1817), ('Bank', 1704), ('He', 1613), ('February', 1574), ('Japan', 1555), ('January', 1549), ('In', 1474), ('Co', 1387), ('A', 1276), ('Ltd', 1179), ('But', 1157), ('Avg', 1062), ('Oper', 1048), ('May', 964), ('Japanese', 890), ('December', 881), ('Sales', 873), ('West', 858), ('United', 835), ('International', 822), ('New', 795), ('American', 792), ('I', 703), ('We', 688), ('June', 670), ('Group', 653), ('States', 640), ('European', 625), ('Commission', 625), ('Department', 622), ('Year', 620), ('This', 593), ('U.S', 579), ('They', 562), ('Federal', 555), ('National', 549), ('Canada', 546), ('Canadian', 537), ('Nine', 527), ('Minister', 525), ('U.K.', 522)]


In [8]:
# Example usage of the trained model
word_vectors = glove_model.wv # holds the trained word vectors (embeddings) learned by the Word2Vec model
print("Vector for 'Canada':", word_vectors['canada'])
print("Most similar to 'Minister':", word_vectors.most_similar('minister'))

Vector for 'Canada': [-0.03537919 -1.2833431  -0.66349804  0.05484341 -0.7680373   0.2868829
  0.67401016 -0.28792086 -0.04003252 -1.3796362  -0.6930536  -1.8006237
  0.43856108  0.23824672  0.55158156  0.5020647   0.04388599 -0.8035705
  0.01951286 -0.35358974  0.2619007  -1.4447633  -0.03323268  0.62672305
  1.0495601  -0.72986037 -1.3692279  -0.0326714   0.83780324 -1.0919042
  0.29879636 -0.64934504  0.553407    0.09065519 -0.17620015 -0.3332687
  0.23639782 -0.2513296  -0.22948718  0.49308717  0.6384657  -0.3996525
 -1.1350772   0.51865417  0.14118731  0.29117116  0.13113068  0.00982796
  0.98942065  1.4036468   0.3824025  -0.45662957  0.05803205 -1.208544
 -1.224156   -0.41742542  0.12344661  0.30824795  1.0621655   1.8378152
  0.30115926 -1.1696117   0.206364   -0.9058469  -0.00688341  1.4720991
 -1.3681623   0.16797489 -0.4153084  -0.44217718 -0.1435973  -0.90141994
  0.5362748  -0.6637763   0.8872234   0.6649782   0.47312355  0.5114587
 -0.9459733  -1.0157849  -0.96248007  1.3

In [14]:
# Use the word_vectors from  trained model
word_vectors = glove_model.wv  

from gensim.models import KeyedVectors
# Save the word vectors
word_vectors.save("word_vectors.kv")





In [10]:
import numpy as np
from gensim.models import Word2Vec


In [17]:
def compute_dot_product(query, model, top_n=10):
    """
    Compute the dot product between the input query and all words in the corpus
    and retrieve the top_n most similar contexts.
    """
    # Check if the input is a word or a vector
    if isinstance(query, str):
        # Convert word to vector
        query_vector = model[query]
    else:
        # If query is already a vector, use it directly
        query_vector = query

    # Compute dot product for all vectors in the corpus
    similar_words = []
    for word in model.index_to_key:  # Iterate over all words in the vocabulary
        word_vector = model[word]
        dot_product = np.dot(query_vector, word_vector)
        similar_words.append((word, dot_product))

    # Sort by dot product in descending order and return the top_n results
    similar_words = sorted(similar_words, key=lambda x: x[1], reverse=True)
    return similar_words[:top_n]


In [25]:
# Example Usage
# Load the saved word vectors
word_vectors = KeyedVectors.load("word_vectors.kv")

query_word = "secretary"  # Example query word

# Compute the top 10 most similar words to 'king'
top_similar_words = compute_dot_product(query_word, word_vectors, top_n=10)

# Print the results
print(f"Top 10 most similar words to '{query_word}':")
for word, score in top_similar_words:
    print(f"{word}: {score}")

Top 10 most similar words to 'secretary':
secretary: 120.36280822753906
minister: 95.57017517089844
chairman: 63.91004943847656
director: 61.21501922607422
house: 59.91638946533203
president: 56.692054748535156
james: 56.09077835083008
representative: 52.293128967285156
ministry: 51.10329055786133
department: 50.85832214355469
