In [10]:
import nltk
from nltk.corpus import reuters
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import time

In [11]:
# Download the reuters dataset if not already present
nltk.download('reuters')
nltk.download('punkt')

[nltk_data] Downloading package reuters to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Prepare the corpus from the NLTK Reuters dataset
def preprocess_reuters():
    """
    Preprocess the Reuters dataset into tokenized sentences for training GloVe.
    Returns:
        sentences (list): A list of tokenized sentences.
    """
    sentences = []
    for file_id in reuters.fileids():
        words = nltk.word_tokenize(reuters.raw(file_id).lower())
        sentences.append(words)
    return sentences

In [13]:
# Function to train GloVe using Word2Vec (CBOW-based approximation)
def train_glove_model(sentences, vector_size=100, window_size=2, min_count=5, epochs=10):
    """
    Train a GloVe-like model using Word2Vec from Gensim.
    Parameters:
        sentences (list): Tokenized sentences from the corpus.
        vector_size (int): Dimensionality of the word embeddings.
        window_size (int): The context window size (default: 2).
        min_count (int): Minimum word frequency to include in the vocabulary.
        epochs (int): Number of training epochs.
    Returns:
        model (Word2Vec): The trained Word2Vec model.
        training_loss (list): List of losses after each epoch.
        training_time (float): Total training time in seconds.
    """
    model = Word2Vec(
        vector_size=vector_size,
        window=window_size,
        min_count=min_count,
        sg=0,  # CBOW model (default behavior of GloVe)
        compute_loss=True  # Enable loss computation
    )
    model.build_vocab(sentences)

    # Record the start time
    start_time = time.time()

    training_loss = []
    for epoch in range(epochs):
        print(f"Training epoch {epoch + 1}/{epochs}...")
        model.train(sentences, total_examples=model.corpus_count, epochs=1)
        loss = model.get_latest_training_loss()
        print(f"Epoch {epoch + 1}, Loss: {loss}")
        training_loss.append(loss)

    # Record the end time
    training_time = time.time() - start_time

    return model, training_loss, training_time

In [14]:
# Preprocess the Reuters dataset
sentences = preprocess_reuters()

In [15]:
# Train the GloVe-like model
glove_model, training_loss, training_time = train_glove_model(sentences, vector_size=100, window_size=2)

# Print training summary
print(f"Total Training Time: {training_time:.2f} seconds")
print(f"Training Loss Per Epoch: {training_loss}")

Training epoch 1/10...
Epoch 1, Loss: 0.0
Training epoch 2/10...
Epoch 2, Loss: 0.0
Training epoch 3/10...
Epoch 3, Loss: 0.0
Training epoch 4/10...
Epoch 4, Loss: 0.0
Training epoch 5/10...
Epoch 5, Loss: 0.0
Training epoch 6/10...
Epoch 6, Loss: 0.0
Training epoch 7/10...
Epoch 7, Loss: 0.0
Training epoch 8/10...
Epoch 8, Loss: 0.0
Training epoch 9/10...
Epoch 9, Loss: 0.0
Training epoch 10/10...
Epoch 10, Loss: 0.0
Total Training Time: 5.85 seconds
Training Loss Per Epoch: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [None]:

from nltk.tokenize import word_tokenize
from collections import Counter



# Extract and preprocess the corpus
def get_corpus():
    """
    Extract tokenized words from the Reuters corpus and preprocess them.
    Returns:
        tokenized_words (list): List of all words in the corpus.
    """
    tokenized_words = []
    for file_id in reuters.fileids():
        words = word_tokenize(reuters.raw(file_id))
        tokenized_words.extend(words)
    return tokenized_words

# Identify nouns in the corpus
def identify_nouns(words):
    """
    Identify proper nouns and common nouns from the word list.
    Parameters:
        words (list): List of words.
    Returns:
        nouns (list): List of potential nouns (words starting with uppercase).
    """
    nouns = [word for word in words if word.istitle()]  # Filter capitalized words
    return nouns

# Get the corpus
corpus_words = get_corpus()

# Count and print a sample of nouns
nouns = identify_nouns(corpus_words)
nouns_count = Counter(nouns)

# Print the top 50 most common nouns
print("Top 50 nouns in the corpus:")
print(nouns_count.most_common(50))


[nltk_data] Downloading package reuters to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top 50 nouns in the corpus:
[('The', 10967), ('U.S.', 4919), ('Net', 3378), ('Shr', 3201), ('Inc', 2663), ('Corp', 2336), ('Revs', 2283), ('It', 2059), ('April', 1902), ('March', 1817), ('Bank', 1704), ('He', 1613), ('February', 1574), ('Japan', 1555), ('January', 1549), ('In', 1474), ('Co', 1387), ('A', 1276), ('Ltd', 1179), ('But', 1157), ('Avg', 1062), ('Oper', 1048), ('May', 964), ('Japanese', 890), ('December', 881), ('Sales', 873), ('West', 858), ('United', 835), ('International', 822), ('New', 795), ('American', 792), ('I', 703), ('We', 688), ('June', 670), ('Group', 653), ('States', 640), ('European', 625), ('Commission', 625), ('Department', 622), ('Year', 620), ('This', 593), ('U.S', 579), ('They', 562), ('Federal', 555), ('National', 549), ('Canada', 546), ('Canadian', 537), ('Nine', 527), ('Minister', 525), ('U.K.', 522)]


In [21]:
# Example usage of the trained model
word_vectors = glove_model.wv # holds the trained word vectors (embeddings) learned by the Word2Vec model
print("Vector for 'Canada':", word_vectors['canada'])
print("Most similar to 'Minister':", word_vectors.most_similar('minister'))

Vector for 'Canada': [ 0.4262926  -1.2423606  -0.61886513  0.23888387 -0.0798335   0.6545212
  1.1627374   0.25314656  0.21703158 -1.6847712  -0.8597218  -1.535619
  1.060118    0.26586792 -0.15536933  0.40483266  0.29103097  0.05220363
  0.3548324  -0.14567353  0.6206696  -1.0312492   0.04680159  0.98125803
  1.1642821  -0.13911815 -1.1806442  -0.22721769  0.9707035  -0.72080296
  0.00381474 -0.18983811  0.5630053   0.00452598 -0.18025659 -0.6592921
  0.52549076  0.20244564 -0.36795744  0.6444692   0.8270927   0.22629467
 -0.9919469  -0.02519848 -0.06340182  0.7273546  -0.10364688 -0.00286774
  0.49039426  0.58488     0.09937352 -1.5187933   0.38697448 -0.7598813
 -1.2156956  -0.14334778  0.11882898  0.36145663  1.4550567   1.4228486
 -0.01983459 -0.98798203  0.07294873 -0.9175459  -0.5089837   1.469861
 -0.86075413 -0.34533402 -1.4331465  -0.31300193  0.05041151 -0.6796521
 -0.19805384 -1.0174718   1.0101024  -0.23672527  0.39137456 -0.02432116
 -0.8364139  -0.45259434 -0.89181364  1