In [23]:
import numpy as np
import random


In [24]:
from sentence_transformers import SentenceTransformer
import numpy as np





In [25]:
model = SentenceTransformer('all-MiniLM-L6-v2')


In [26]:
embedding_dim = 100  # matches GloVe file
dataset_path = r"C:\Users\abiav\embedding vector\male-female.txt"
glove_file = r"C:\Users\abiav\embedding vector\glove.6B.100d.txt"


In [27]:
def load_glove(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0].lower()  # lowercase for consistency
            vector = np.array(parts[1:], dtype=np.float32)
            embeddings[word] = vector
    return embeddings

print("Loading GloVe embeddings...")
embeddings = load_glove(glove_file)
print(f"Total GloVe words: {len(embeddings)}")

Loading GloVe embeddings...
Total GloVe words: 400000


In [28]:
analogy_pairs = []
with open(dataset_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t')
        if len(parts) >= 2:
            analogy_pairs.append((parts[0].lower(), parts[1].lower()))

print(f"Total word pairs in dataset: {len(analogy_pairs)}")


all_words = set([w for pair in analogy_pairs for w in pair if w in embeddings])
word_to_idx = {word: idx for idx, word in enumerate(all_words)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

embedding_matrix = np.array([embeddings[word] for word in all_words])
embedding_matrix = embedding_matrix / np.linalg.norm(embedding_matrix, axis=1, keepdims=True)

Total word pairs in dataset: 450


In [29]:
def solve_analogy(word_a, word_b, word_c, top_n=1):
    """Solve analogy: word_a is to word_b as word_c is to ?"""
    for w in [word_a, word_b, word_c]:
        if w not in word_to_idx:
            return None 
            
    vec_a = embedding_matrix[word_to_idx[word_a]]
    vec_b = embedding_matrix[word_to_idx[word_b]]
    vec_c = embedding_matrix[word_to_idx[word_c]]

    target_vec = vec_b - vec_a + vec_c
    target_vec /= np.linalg.norm(target_vec)

    similarities = embedding_matrix @ target_vec
    # exclude input words
    for w in [word_a, word_b, word_c]:
        similarities[word_to_idx[w]] = -np.inf

    top_idx = np.argmax(similarities)
    return idx_to_word[top_idx]


In [32]:
for w1, w2 in analogy_pairs:
    if w1 not in embeddings:
        embeddings[w1] = np.random.uniform(-1, 1, embedding_dim)
    if w2 not in embeddings:
        embeddings[w2] = np.random.uniform(-1, 1, embedding_dim)

    vec1 = embeddings[w1]
    vec2 = embeddings[w2]
    cos_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    print(f"{w1}-{w2} cosine similarity: {cos_sim:.3f}")


actor-actress cosine similarity: 0.858
batman-batwoman cosine similarity: 0.441
boar-sow cosine similarity: 0.274
boy-girl cosine similarity: 0.918
brother-sister cosine similarity: 0.743
buck-doe cosine similarity: 0.249
bull-cow cosine similarity: 0.525
businessman-businesswoman cosine similarity: 0.666
chairman-chairwoman cosine similarity: 0.679
dad-mom/mum cosine similarity: 0.063
daddy-mommy/mother/mom cosine similarity: 0.078
duke-duchess cosine similarity: 0.675
emperor-empress cosine similarity: 0.735
father-mother cosine similarity: 0.866
fisherman-fisherwoman cosine similarity: 0.283
fox-vixen cosine similarity: 0.126
gentleman-lady/gentlewoman/madam cosine similarity: -0.035
god-goddess cosine similarity: 0.619
grandfather-grandmother cosine similarity: 0.803
grandpa-grandma cosine similarity: 0.810
grandson-granddaughter cosine similarity: 0.797
groom-bride cosine similarity: 0.792
headmaster-headmistress cosine similarity: 0.737
heir-heiress cosine similarity: 0.653
hero-

In [33]:
embedding_dim = 100
dataset_path = r"C:\Users\abiav\embedding vector\E10 [male - female].txt"
glove_file = r"C:\Users\abiav\embedding vector\glove.6B.100d.txt"


In [34]:
def load_glove(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0].lower()
            vector = np.array(parts[1:], dtype=np.float32)
            embeddings[word] = vector
    return embeddings

embeddings = load_glove(glove_file)

In [35]:
analogy_pairs = []
with open(dataset_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t')
        if len(parts) >= 2:
            analogy_pairs.append((parts[0].lower(), parts[1].lower()))


In [36]:
female_words = [w2 for _, w2 in analogy_pairs if w2 in embeddings]

In [37]:
def get_vector(word):
    parts = word.split('/')  
    vectors = [embeddings[p] for p in parts if p in embeddings]
    if vectors:
        vec = np.mean(vectors, axis=0)
        return vec / np.linalg.norm(vec)
    return None

In [38]:
correct = 0
total = 0

for w1, w2 in analogy_pairs:
    vec1 = get_vector(w1)
    vec2 = get_vector(w2)
    if vec1 is None or vec2 is None:
        continue

    # Compute cosine similarity with all female words
    sims = []
    for f in female_words:
        f_vec = get_vector(f)
        if f_vec is not None:
            sims.append((f, np.dot(vec1, f_vec)))
    if not sims:
        continue

    # Pick female word with max similarity
    pred_word, _ = max(sims, key=lambda x: x[1])

    if pred_word == w2:
        correct += 1
    total += 1


In [39]:
accuracy = correct / total if total > 0 else 0
print(f"Male-Female Analogy Accuracy: {accuracy:.2f}")

Male-Female Analogy Accuracy: 0.59
