In [1]:
import gensim.downloader as api
import numpy as np
import re
from numpy.linalg import norm

In [2]:
# Load the pretrained "glove-wiki-gigaword-100" embeddings through gensim's
# downloader API; the cells below use the returned object as `model`.
model = api.load("glove-wiki-gigaword-100")  # ~128MB


In [5]:
def sentence_vector(sentence, model):
    """
    Computes the average word-embedding vector for a tokenised sentence.

    Tokens that are not in the model's vocabulary are skipped.

    Parameters:
    - sentence: iterable of words (tokens), e.g. the output of a tokeniser
    - model: gensim KeyedVectors, or a full gensim model (e.g. Word2Vec)
      that exposes its KeyedVectors through a `.wv` attribute

    Returns:
    - np.array of length `vector_size`: the element-wise mean of the vectors
      of all in-vocabulary tokens, or an all-zero vector when no token is
      in the vocabulary (including an empty sentence)
    """
    # Full gensim models keep their vectors in `.wv`; only KeyedVectors
    # supports `key_to_index` and `[]` lookup, so unwrap when needed.
    keyed_vectors = getattr(model, "wv", model)
    vocabulary = keyed_vectors.key_to_index
    vectors = [keyed_vectors[word] for word in sentence if word in vocabulary]
    if not vectors:
        # Nothing to average: fall back to a zero vector of the right size.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(vectors, axis=0)

In [7]:
def preprocess(text):
    """
    Lowercases `text` and splits it into word tokens.

    Parameters:
    - text: input string

    Returns:
    - list of str: every maximal run of word characters, in order of
      appearance; punctuation and whitespace are dropped
    """
    lowered = text.lower()
    return re.findall(r'\b\w+\b', lowered)

In [11]:
sentence1 = "The queen reigns over the nation."
sentence2 = "Her Majesty leads the nation."
tokens1 = preprocess(sentence1)
vec1 = sentence_vector(tokens1, model)
tokens2 = preprocess(sentence2)
vec2 = sentence_vector(tokens2, model)

# Cosine similarity between the two sentence vectors.
# sentence_vector returns an all-zero vector when no token of a sentence is
# in the vocabulary; dividing by a zero norm would give NaN, so report 0.0
# for that case instead.
norm_product = norm(vec1) * norm(vec2)
cos_sim = np.dot(vec1, vec2) / norm_product if norm_product else 0.0
print(f"Cosine similarity between the two sentences: {cos_sim:.4f}")


Cosine similarity between the two sentences: 0.9280


In [4]:
# Print the entries the embedding model ranks as most similar to 'king'
# (a list of (word, similarity score) pairs, as shown in the output below).
print(model.most_similar('king'))

[('prince', 0.7682329416275024), ('queen', 0.7507689595222473), ('son', 0.7020888328552246), ('brother', 0.6985775828361511), ('monarch', 0.6977890729904175), ('throne', 0.691999077796936), ('kingdom', 0.6811410188674927), ('father', 0.6802029013633728), ('emperor', 0.6712858080863953), ('ii', 0.6676074266433716)]


In [5]:
# Analogy test: king - man + woman = ?
# 'king' and 'woman' are passed as positive terms and 'man' as a negative
# term; most_similar returns the candidates as (word, score) pairs.
result = model.most_similar(positive=['king', 'woman'], negative=['man'])

# Print the first entry of the returned list (the top result)
print("king - man + woman =", result[0])


king - man + woman = ('queen', 0.7698541283607483)
