### Prepare Data

In [10]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt
/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.50d.txt
/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt


In [11]:
with open("../input/glove-global-vectors-for-word-representation/glove.6B.50d.txt") as file:
    data = file.readlines()

In [12]:
len(data)

400000

In [13]:
for i in range(len(data)):
    data[i] = data[i][:-1]

In [14]:
data_dict = dict()

for i in range(len(data)):
    split_data = data[i].split()
    data_dict[split_data[0]] = np.array(split_data[1:]).astype('float64')

In [15]:
data_dict["the"]

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01])

### Cosine Similarity Example

In [16]:
def cosine_similarity(a, b):
    nominator = np.dot(a, b)
    
    a_norm = np.sqrt(np.sum(a**2))
    b_norm = np.sqrt(np.sum(b**2))
    
    denominator = a_norm * b_norm
    
    cosine_similarity = nominator / denominator
    
    return cosine_similarity

In [17]:
table = data_dict["table"]
desk = data_dict["desk"]
football = data_dict["football"]
baseball = data_dict["baseball"]
water = data_dict["water"]
fire = data_dict["fire"]
computer = data_dict["computer"]
calculator = data_dict["calculator"]
number = data_dict["number"]
math = data_dict["math"]
boy = data_dict["boy"]
girl = data_dict["girl"]
sad = data_dict["sad"]
happy = data_dict["happy"]
good = data_dict["good"]
bad = data_dict["bad"]
turkey = data_dict["turkey"]
television = data_dict["television"]
awesome = data_dict["awesome"]
great = data_dict["great"]
coffee = data_dict["coffee"]
giraffe = data_dict["giraffe"]
cat = data_dict["cat"]
barcelona = data_dict["barcelona"]
school = data_dict["school"]
disaster = data_dict["disaster"]

print(f"Cosine similarity for pair (table, desk) = {cosine_similarity(table, desk)}")
print(f"Cosine similarity for pair (football, baseball) = {cosine_similarity(football, baseball)}")
print(f"Cosine similarity for pair (water, fire) = {cosine_similarity(water, fire)}")
print(f"Cosine similarity for pair (computer, calculator) = {cosine_similarity(computer, calculator)}")
print(f"Cosine similarity for pair (number, math) = {cosine_similarity(number, math)}")
print(f"Cosine similarity for pair (boy, girl) = {cosine_similarity(boy, girl)}")
print(f"Cosine similarity for pair (sad, happy) = {cosine_similarity(sad, happy)}")
print(f"Cosine similarity for pair (good, bad) = {cosine_similarity(good, bad)}")
print(f"Cosine similarity for pair (turkey, television) = {cosine_similarity(turkey, television)}")
print(f"Cosine similarity for pair (awesome, great) = {cosine_similarity(awesome, great)}")
print(f"Cosine similarity for pair (coffee, giraffe) = {cosine_similarity(coffee, giraffe)}")
print(f"Cosine similarity for pair (cat, barcelona) = {cosine_similarity(cat, barcelona)}")
print(f"Cosine similarity for pair (school, disaster) = {cosine_similarity(school, disaster)}")

Cosine similarity for pair (table, desk) = 0.56312532465622
Cosine similarity for pair (football, baseball) = 0.7990507471765448
Cosine similarity for pair (water, fire) = 0.6159761736263326
Cosine similarity for pair (computer, calculator) = 0.5805204352195886
Cosine similarity for pair (number, math) = 0.3923536921031839
Cosine similarity for pair (boy, girl) = 0.9327198629646993
Cosine similarity for pair (sad, happy) = 0.689063223084822
Cosine similarity for pair (good, bad) = 0.7964893661716318
Cosine similarity for pair (turkey, television) = 0.3478390727581068
Cosine similarity for pair (awesome, great) = 0.54452994054594
Cosine similarity for pair (coffee, giraffe) = 0.039573626896088
Cosine similarity for pair (cat, barcelona) = 0.02882096607257644
Cosine similarity for pair (school, disaster) = 0.2852025050456493


In [18]:
def euclidean_distance(a, b):
    """
    Calculates the Euclidean distance between two vectors.
    Formula: ||A - B||
    """
    a = np.array(a, dtype=float)
    b = np.array(b, dtype=float)
    
    return np.linalg.norm(a - b)

print(f"Euclidean Distances:")
print(f"(table, desk) = {euclidean_distance(table, desk)}")
print(f"(football, baseball) = {euclidean_distance(football, baseball)}")
print(f"(water, fire) = {euclidean_distance(water, fire)}")
print(f"(computer, calculator) = {euclidean_distance(computer, calculator)}")
print(f"(number, math) = {euclidean_distance(number, math)}")
print(f"(boy, girl) = {euclidean_distance(boy, girl)}")
print(f"(sad, happy) = {euclidean_distance(sad, happy)}")
print(f"(good, bad) = {euclidean_distance(good, bad)}")
print(f"(turkey, television) = {euclidean_distance(turkey, television)}")
print(f"(awesome, great) = {euclidean_distance(awesome, great)}")
print(f"(coffee, giraffe) = {euclidean_distance(coffee, giraffe)}")
print(f"(cat, barcelona) = {euclidean_distance(cat, barcelona)}")
print(f"(school, disaster) = {euclidean_distance(school, disaster)}")

Euclidean Distances:
(table, desk) = 4.704135012081877
(football, baseball) = 3.718578193888761
(water, fire) = 4.917517613751779
(computer, calculator) = 5.005315848012591
(number, math) = 6.120560145939014
(boy, girl) = 2.0426333096686737
(sad, happy) = 3.8399498989360525
(good, bad) = 3.3188904070491088
(turkey, television) = 6.488383561488301
(awesome, great) = 4.578660135608451
(coffee, giraffe) = 6.440001424040119
(cat, barcelona) = 6.833869604598847
(school, disaster) = 6.54555871572576


### Three Pairs with highest Cosine Similarity
(football,baseball) =0.799\
(boy,girl) = 0.932\
(good,bad) = 0.796

Yes, Football and Baseball shares same semantics as sports equipment and both are ball. Boy ang girl share same age group, semnatically. Good and Bad are outcomes so these make sense.

### Look at the pair (good, bad). They are opposites, yet they have a high cosine similarity score (approx 0.8). Why do you think vector embeddings place them close together?

They represent output or adjective thats why they are close together, also if we go with logic for calculating such vectors, whenever we use good we can replace with bad hence surrounding words remain similar and hence high similarity score.

### Word Analogies

In [19]:
def find_word(a, b, c, data_dict):
    a, b, c = a.lower(), b.lower(), c.lower()
    a_vector, b_vector, c_vector = data_dict[a], data_dict[b], data_dict[c]
    
    all_words = data_dict.keys()
    max_cosine_similarity = -1000
    best_match_word = None
    
    for word in all_words:
        if word in [a, b, c]:
            continue
            
        cos_sim = cosine_similarity(np.subtract(b_vector, a_vector), np.subtract(data_dict[word], c_vector))
        
        if cos_sim > max_cosine_similarity:
            max_cosine_similarity = cos_sim
            best_match_word = word
            
    return best_match_word, cos_sim

In [20]:
def find_word_euclidean(a, b, c, data_dict):
    a, b, c = a.lower(), b.lower(), c.lower()
    a_vector, b_vector, c_vector = data_dict[a], data_dict[b], data_dict[c]
    
    all_words = data_dict.keys()
    
    min_distance = float('inf') 
    best_match_word = None
    
    # Calculate target vector once (b - a)
    target_relationship = np.subtract(b_vector, a_vector)
    
    for word in all_words:
        if word in [a, b, c]:
            continue
        
        # Calculate candidate relationship (d - c)
        candidate_relationship = np.subtract(data_dict[word], c_vector)
        
        # Calculate distance between the two relationship vectors
        dist = euclidean_distance(target_relationship, candidate_relationship)
        
        # FIX 2: Look for the SMALLEST distance (minimize error)
        if dist < min_distance:
            min_distance = dist
            best_match_word = word
            
    return best_match_word, min_distance

In [21]:
words_bag = [
    ('boy', 'girl', 'man'),
    ('bat', 'baseball', 'ball'),
    ('book', 'library', 'coffee'),
    ('orange', 'juice', 'apple'),
    ('turkey', 'turkish', 'colombia')
]

for words in words_bag:
    d, cos_sim = find_word(*words, data_dict)
    print("({}, {}) ----> ({}, {}) with {} difference".format(*words, d, cos_sim))

    d, cos_sim = find_word_euclidean(*words, data_dict)
    print("({}, {}) ----> ({}, {}) with {} difference".format(*words, d, cos_sim))

(boy, girl) ----> (man, woman) with -0.0340757677824383 difference
(boy, girl) ----> (man, woman) with 1.956399832275647 difference
(bat, baseball) ----> (ball, basketball) with 0.09564220586831747 difference
(bat, baseball) ----> (ball, basketball) with 4.251404585612963 difference
(book, library) ----> (coffee, heliospheric) with 0.10581179044448522 difference
(book, library) ----> (coffee, warehouse) with 4.838743250989605 difference
(orange, juice) ----> (apple, juices) with -0.2351728294582547 difference
(orange, juice) ----> (apple, processor) with 5.36831789214182 difference
(turkey, turkish) ----> (colombia, colombian) with 0.1700399408595448 difference
(turkey, turkish) ----> (colombia, colombian) with 2.3834937989013105 difference


In [22]:
def get_nearest_neighbors(target_word, data_dict, top_n=5):
    """
    Finds the top_n words with the highest cosine similarity to the target_word.
    """
    target_word = target_word.lower()
    
    # Check if word exists in the dictionary
    if target_word not in data_dict:
        return f"'{target_word}' not found in the vocabulary."
    
    target_vector = data_dict[target_word]
    similarities = []
    
    # Calculate cosine similarity with every other word
    for word, vector in data_dict.items():
        if word == target_word:
            continue
            
        sim = cosine_similarity(target_vector, vector)
        similarities.append((word, sim))
    
    # Sort by similarity in descending order (highest first)
    # x[1] refers to the similarity score
    similarities.sort(key=lambda x: x[1], reverse=True)
    
    return similarities[:top_n]

target = "computer"
neighbors = get_nearest_neighbors(target, data_dict, top_n=6)

print(f"Top 5 Nearest Neighbors for '{target}':")
for word, score in neighbors:
    print(f"{word}: {score:.4f}")

Top 5 Nearest Neighbors for 'computer':
computers: 0.9165
software: 0.8815
technology: 0.8526
electronic: 0.8126
internet: 0.8060
computing: 0.8026


### Lexical Decision Activity

In [23]:
experiment_pairs = [
    ("butter", "bread"),      # Related
    ("doctor", "nurse"),      # Related/Unrelated (Distractor)
    ("tree", "doctor"),       # Unrelated
    ("bread", "doctor")       # Unrelated
]

print("Cosine Similarity for Experiment Pairs")
for w1, w2 in experiment_pairs:
    try:
        # Load words from data_dict
        word1_vec = data_dict[w1]
        word2_vec = data_dict[w2]
        
        # Calculate Cosine Similarity
        cos_sim = cosine_similarity(word1_vec, word2_vec)
        print(f"Cosine similarity for pair ({w1}, {w2}) = {cos_sim:.4f}")
        
    except KeyError as e:
        print(f"Word not found in dictionary: {e}")

print("\nEuclidean Distance for Experiment Pairs")
for w1, w2 in experiment_pairs:
    try:
        # Load words from data_dict
        word1_vec = data_dict[w1]
        word2_vec = data_dict[w2]
        
        # Calculate Euclidean Distance
        euc_dist = euclidean_distance(word1_vec, word2_vec)
        print(f"Euclidean distance for ({w1}, {w2}) = {euc_dist:.4f}")
        
    except KeyError as e:
        print(f"Word not found in dictionary: {e}")

Cosine Similarity for Experiment Pairs
Cosine similarity for pair (butter, bread) = 0.8402
Cosine similarity for pair (doctor, nurse) = 0.7977
Cosine similarity for pair (tree, doctor) = 0.1691
Cosine similarity for pair (bread, doctor) = 0.1844

Euclidean Distance for Experiment Pairs
Euclidean distance for (butter, bread) = 3.3089
Euclidean distance for (doctor, nurse) = 3.1275
Euclidean distance for (tree, doctor) = 6.5470
Euclidean distance for (bread, doctor) = 6.8686
