In [6]:
import numpy as np

def load_glove_embeddings(glove_file):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]  
            vector = np.asarray(values[1:], dtype='float32') 
            embeddings[word] = vector
    return embeddings

glove_path = 'glove.6B/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_path)
print(glove_embeddings.get('king'))  # Replace 'king' with any word to test


[ 0.0033901 -0.34614    0.28144    0.48382    0.59469    0.012965
  0.53982    0.48233    0.21463   -1.0249    -0.34788   -0.79001
 -0.15084    0.61374    0.042811   0.19323    0.25462    0.32528
  0.05698    0.063253  -0.49439    0.47337   -0.16761    0.045594
  0.30451   -0.35416   -0.34583   -0.20118    0.25511    0.091111
  0.014651  -0.017541  -0.23854    0.48215   -0.9145    -0.36235
  0.34736    0.028639  -0.027065  -0.036481  -0.067391  -0.23452
 -0.13772    0.33951    0.13415   -0.1342     0.47856   -0.1842
  0.10705   -0.45834   -0.36085   -0.22595    0.32881   -0.13643
  0.23128    0.34269    0.42344    0.47057    0.479      0.074639
  0.3344     0.10714   -0.13289    0.58734    0.38616   -0.52238
 -0.22028   -0.072322   0.32269    0.44226   -0.037382   0.18324
  0.058082   0.26938    0.36202    0.13983    0.016815  -0.34426
  0.4827     0.2108     0.75618   -0.13092   -0.025741   0.43391
  0.33893   -0.16438    0.26817    0.68774    0.311     -0.2509
  0.0027749 -0.39809   

In [9]:
import csv

def load_simlex(filepath):
    simlex_data = []
    with open(filepath, 'r') as file:
        reader = csv.reader(file, delimiter='\t')
        next(reader)  # Skip header
        for row in reader:
            word1, word2, similarity = row[0], row[1], float(row[3])
            simlex_data.append((word1, word2, similarity))
    return simlex_data
simlex_path = 'SimLex-999/SimLex-999/SimLex-999.txt'  # Replace with your file path
simlex_data = load_simlex(simlex_path)


In [13]:
import numpy as np

def cosine_similarity(vec1, vec2):
    # Convert lists to numpy arrays for easier vector operations
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    
    # Compute the dot product of the two vectors
    dot_product = np.dot(vec1, vec2)
    
    # Compute the magnitude (norm) of each vector
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    # Compute and return the cosine similarity
    if norm_vec1 > 0 and norm_vec2 > 0:
        return dot_product / (norm_vec1 * norm_vec2)
    else:
        return None  # Avoid division by zero


In [17]:
def spearman_rank_correlation(x, y):
    # Step 1: Rank the values
    rank_x = {val: rank for rank, val in enumerate(sorted(x), start=1)}
    rank_y = {val: rank for rank, val in enumerate(sorted(y), start=1)}
    
    # Step 2: Compute the rank differences (d_i) and d_i^2
    rank_diff = [(rank_x[val1] - rank_y[val2])**2 for val1, val2 in zip(x, y)]
    
    # Step 3: Calculate the Spearman's rank correlation coefficient
    n = len(x)
    rho = 1 - (6 * sum(rank_diff)) / (n * (n**2 - 1))
    return rho

def evaluate_embeddings(simlex_data, embeddings):
    predicted_similarities = []
    human_similarities = []
    similarity_scores = []
    word_pairs = []
    
    for word1, word2, human_score in simlex_data:
        vec1 = embeddings.get(word1)
        vec2 = embeddings.get(word2)
        similarity = cosine_similarity(vec1, vec2)
        
        if similarity is not None:  
            predicted_similarities.append(similarity)
            human_similarities.append(human_score)
            similarity_scores.append(similarity)
            word_pairs.append((word1, word2))
        top_5_pairs = sorted(zip(similarity_scores, word_pairs), reverse=True)[:5]
    
    print("Top 5 word pairs with highest similarity:")
    for similarity, (word1, word2) in top_5_pairs:
        print(f"{word1} - {word2}: {similarity:.4f}")
    
    spearman_corr = spearman_rank_correlation(predicted_similarities, human_similarities)
    return spearman_corr
spearman_corr = evaluate_embeddings(simlex_data, glove_embeddings)
print(f"Spearman Correlation: {spearman_corr}")


Top 5 word pairs with highest similarity:
wife - husband: 0.8646
movie - film: 0.8589
son - father: 0.8563
brother - son: 0.8323
father - brother: 0.8206
Spearman Correlation: 0.3707401409425457
