In [30]:
import nltk
from nltk.corpus import reuters
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import time

In [31]:
# Download the reuters dataset if not already present
nltk.download('reuters')
nltk.download('punkt')

[nltk_data] Downloading package reuters to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [32]:
# Prepare the corpus from the NLTK Reuters dataset
def preprocess_reuters():
    """
    Preprocess the Reuters dataset into tokenized sentences for training GloVe.
    Returns:
        sentences (list): A list of tokenized sentences.
    """
    sentences = []
    for file_id in reuters.fileids():
        words = nltk.word_tokenize(reuters.raw(file_id).lower())
        sentences.append(words)
    return sentences

In [33]:
# Function to train GloVe using Word2Vec (CBOW-based approximation)
def train_glove_model(sentences, vector_size=100, window_size=2, min_count=5, epochs=10):
    """
    Train a GloVe-like model using Word2Vec from Gensim.
    Parameters:
        sentences (list): Tokenized sentences from the corpus.
        vector_size (int): Dimensionality of the word embeddings.
        window_size (int): The context window size (default: 2).
        min_count (int): Minimum word frequency to include in the vocabulary.
        epochs (int): Number of training epochs.
    Returns:
        model (Word2Vec): The trained Word2Vec model.
        training_loss (list): List of losses after each epoch.
        training_time (float): Total training time in seconds.
    """
    model = Word2Vec(
        vector_size=vector_size,
        window=window_size,
        min_count=min_count,
        sg=0,  # CBOW model (default behavior of GloVe)
        compute_loss=True  # Enable loss computation
    )
    model.build_vocab(sentences)

    # Record the start time
    start_time = time.time()

    training_loss = []
    for epoch in range(epochs):
        print(f"Training epoch {epoch + 1}/{epochs}...")
        model.train(sentences, total_examples=model.corpus_count, epochs=1)
        loss = model.get_latest_training_loss()
        print(f"Epoch {epoch + 1}, Loss: {loss}")
        training_loss.append(loss)

    # Record the end time
    training_time = time.time() - start_time

    return model, training_loss, training_time

In [34]:
# Preprocess the Reuters dataset
sentences = preprocess_reuters()

In [35]:
# Train the GloVe-like model
glove_model, training_loss, training_time = train_glove_model(sentences, vector_size=100, window_size=2)

# Print training summary
print(f"Total Training Time: {training_time:.2f} seconds")
print(f"Training Loss Per Epoch: {training_loss}")

Training epoch 1/10...
Epoch 1, Loss: 0.0
Training epoch 2/10...
Epoch 2, Loss: 0.0
Training epoch 3/10...
Epoch 3, Loss: 0.0
Training epoch 4/10...
Epoch 4, Loss: 0.0
Training epoch 5/10...
Epoch 5, Loss: 0.0
Training epoch 6/10...
Epoch 6, Loss: 0.0
Training epoch 7/10...
Epoch 7, Loss: 0.0
Training epoch 8/10...
Epoch 8, Loss: 0.0
Training epoch 9/10...
Epoch 9, Loss: 0.0
Training epoch 10/10...
Epoch 10, Loss: 0.0
Total Training Time: 4.98 seconds
Training Loss Per Epoch: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [36]:

from nltk.tokenize import word_tokenize
from collections import Counter



# Extract and preprocess the corpus
def get_corpus():
    """
    Extract tokenized words from the Reuters corpus and preprocess them.
    Returns:
        tokenized_words (list): List of all words in the corpus.
    """
    tokenized_words = []
    for file_id in reuters.fileids():
        words = word_tokenize(reuters.raw(file_id))
        tokenized_words.extend(words)
    return tokenized_words

# Identify nouns in the corpus
def identify_nouns(words):
    """
    Identify proper nouns and common nouns from the word list.
    Parameters:
        words (list): List of words.
    Returns:
        nouns (list): List of potential nouns (words starting with uppercase).
    """
    nouns = [word for word in words if word.istitle()]  # Filter capitalized words
    return nouns

# Get the corpus
corpus_words = get_corpus()

# Count and print a sample of nouns
nouns = identify_nouns(corpus_words)
nouns_count = Counter(nouns)

# Print the top 50 most common nouns
print("Top 50 nouns in the corpus:")
print(nouns_count.most_common(50))


Top 50 nouns in the corpus:
[('The', 10967), ('U.S.', 4919), ('Net', 3378), ('Shr', 3201), ('Inc', 2663), ('Corp', 2336), ('Revs', 2283), ('It', 2059), ('April', 1902), ('March', 1817), ('Bank', 1704), ('He', 1613), ('February', 1574), ('Japan', 1555), ('January', 1549), ('In', 1474), ('Co', 1387), ('A', 1276), ('Ltd', 1179), ('But', 1157), ('Avg', 1062), ('Oper', 1048), ('May', 964), ('Japanese', 890), ('December', 881), ('Sales', 873), ('West', 858), ('United', 835), ('International', 822), ('New', 795), ('American', 792), ('I', 703), ('We', 688), ('June', 670), ('Group', 653), ('States', 640), ('European', 625), ('Commission', 625), ('Department', 622), ('Year', 620), ('This', 593), ('U.S', 579), ('They', 562), ('Federal', 555), ('National', 549), ('Canada', 546), ('Canadian', 537), ('Nine', 527), ('Minister', 525), ('U.K.', 522)]


In [37]:
# Example usage of the trained model
word_vectors = glove_model.wv # holds the trained word vectors (embeddings) learned by the Word2Vec model
print("Vector for 'Canada':", word_vectors['canada'])
print("Most similar to 'Minister':", word_vectors.most_similar('minister'))

Vector for 'Canada': [-0.50394714 -0.907235   -0.32813582  0.6879264  -0.7342884   0.75093186
  0.61239594  0.34096304 -0.31239104 -1.5388498  -0.32393542 -1.669975
  0.8280759  -0.11881369  0.00401857  0.7257953  -0.11328927 -0.0295985
 -0.27693912 -0.5178133   0.64596087 -1.528738   -0.46207717  0.719274
  1.6206697  -0.56574494 -1.6323107  -0.45002523  0.50140965 -0.7647967
 -0.5006929  -0.5909827   0.28333822 -0.5740105  -0.26148933 -0.58785313
 -0.14032222 -0.26196045 -0.13800836  0.6306168   1.0127885  -0.34454647
 -0.02447197  0.11158471 -0.9486377   0.7733709  -0.20020643  0.8107985
  0.9236379   0.563618   -0.25160363 -0.24732193  0.35184932 -1.4933707
 -0.91247404  0.18238029  0.59087384  0.20345269  1.2826673   1.4506941
  0.60126007 -0.81656635 -0.36236653 -0.4306596   0.24310616  1.3210568
 -0.62460846  0.07269536 -0.95956945 -0.5705752  -0.39182952 -0.6005903
  0.56646276 -0.6529269   1.2410403  -0.15492062 -0.08251475 -0.58457595
 -0.47248563 -0.36001197 -0.9090241   0.7

In [38]:
# Use the word_vectors from  trained model
word_vectors = glove_model.wv  


In [39]:
import numpy as np
from scipy.stats import spearmanr
import requests
import csv


In [40]:
# Step 1: Load the dataset
def load_wordsim353(filepath):
    word_pairs = []
    human_scores = []
    with open(filepath, "r") as file:
        reader = csv.reader(file, delimiter="\t")
        for row in reader:
            word1, word2, score = row
            word_pairs.append((word1, word2))
            human_scores.append(float(score))
    return word_pairs, human_scores

In [41]:

# Step 2: Calculate model similarity scores using dot product
def calculate_model_scores(word_pairs, model):
    model_scores = []
    for word1, word2 in word_pairs:
        if word1 in model.key_to_index and word2 in model.key_to_index:
            vector1 = model[word1]
            vector2 = model[word2]
            dot_product = np.dot(vector1, vector2)
            model_scores.append(dot_product)
        else:
            # Assign a low score if one or both words are missing
            model_scores.append(0.0)
    return model_scores



In [44]:
# Step 3: Evaluate Spearman correlation
def evaluate_correlation(human_scores, model_scores):
    correlation, _ = spearmanr(human_scores, model_scores)
    return correlation

# Main Code

word_pairs, human_scores = load_wordsim353("wordsim_similarity_goldstandard.txt")

# Use the  trained  GloVe model
word_vectors = glove_model.wv 
model_scores = calculate_model_scores(word_pairs, word_vectors)
correlation = evaluate_correlation(human_scores, model_scores)


print(f"Spearman Correlation: {correlation:.4f}")


Spearman Correlation: 0.0066
