# Eindopdracht 1: Vector Space Models & Principal Component Analysis

Naam: Sietse Neve
Studentnummer: 1810364

## Deel I: Vector Space Models

### 1.

In [2]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt

In [19]:
# capitals.txt is read without a header row; its whitespace-separated tokens
# are mapped onto four explicitly named columns.
df = pd.read_csv(
    'capitals.txt',
    header=None,
    names=['city1', 'country1', 'city2', 'country2'],
    sep=r'\s+',
    engine='python',
)
df.head()


Unnamed: 0,city1,country1,city2,country2
0,Athens,Greece,Baghdad,Iraq
1,Athens,Greece,Bangkok,Thailand
2,Athens,Greece,Beijing,China
3,Athens,Greece,Berlin,Germany
4,Athens,Greece,Bern,Switzerland


In [20]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so this file should only be opened when it comes from a trusted source.
with open('word_embeddings_subset.p', 'rb') as f:
    embeddings = pickle.load(f)

print("Eerste paar rijen uit capitals.txt:")
print(df.head(), "\n")

# Use the first key as a representative entry to read off the vector length.
sample_word = [*embeddings][0]
num_embeddings, embedding_dim = len(embeddings), len(embeddings[sample_word])

for summary_line in (
    f"Aantal embeddings: {num_embeddings}",
    f"Voorbeeldwoord: {sample_word}",
    f"Dimensie van embedding: {embedding_dim}",
):
    print(summary_line)

# Fail fast when the loaded data does not have the expected size.
assert num_embeddings == 243, "Aantal embeddings klopt niet (verwacht: 243)"
assert embedding_dim == 300, "Dimensie van embedding klopt niet (verwacht: 300)"
print("\n Embeddings hebben de verwachte vorm!")


Eerste paar rijen uit capitals.txt:
    city1 country1    city2     country2
0  Athens   Greece  Baghdad         Iraq
1  Athens   Greece  Bangkok     Thailand
2  Athens   Greece  Beijing        China
3  Athens   Greece   Berlin      Germany
4  Athens   Greece     Bern  Switzerland 

Aantal embeddings: 243
Voorbeeldwoord: country
Dimensie van embedding: 300

 Embeddings hebben de verwachte vorm!


In [21]:

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Return the cosine similarity of two word vectors.

    Parameters:
    a (np.ndarray): first vector
    b (np.ndarray): second vector

    Returns:
    float: dot product of a and b divided by the product of their norms

    Raises:
    ValueError: if either vector has norm 0
    """
    numerator = np.dot(a, b)
    magnitudes = (np.linalg.norm(a), np.linalg.norm(b))

    # A vector of length zero has no direction, so the ratio is undefined.
    if any(magnitude == 0 for magnitude in magnitudes):
        raise ValueError("Een van de vectors heeft norm 0.")

    return numerator / (magnitudes[0] * magnitudes[1])


In [22]:
# Hand-made vectors: vec2 equals vec1 and vec3 points the opposite way.
vec1 = np.array([1, 2, 3])
vec2 = vec1.copy()
vec3 = -vec1

# Expected output: ~1.0 for the identical pair, ~-1.0 for the opposite pair.
for label, other in (("vec1 vs vec2:", vec2), ("vec1 vs vec3:", vec3)):
    print(label, cosine_similarity(vec1, other))


vec1 vs vec2: 1.0
vec1 vs vec3: -1.0


In [23]:
def euclidean_distance(a: np.ndarray, b: np.ndarray) -> float:
    """
    Return the Euclidean distance between two word vectors.

    Parameters:
    a (np.ndarray): first vector
    b (np.ndarray): second vector

    Returns:
    float: norm of the difference a - b (>= 0)

    Raises:
    ValueError: if a and b do not have the same shape
    """
    if a.shape == b.shape:
        return np.linalg.norm(a - b)

    raise ValueError("De vectoren hebben niet dezelfde dimensie.")


In [24]:
# Hand-made vectors: vec2 equals vec1 and vec3 is vec1 shifted by 3 per axis.
vec1 = np.array([1, 2, 3])
vec2 = vec1.copy()
vec3 = vec1 + 3

# Expected output: 0.0 for the identical pair, a positive value otherwise.
for label, other in (("vec1 vs vec2:", vec2), ("vec1 vs vec3:", vec3)):
    print(label, euclidean_distance(vec1, other))


vec1 vs vec2: 0.0
vec1 vs vec3: 5.196152422706632


In [25]:
# Compare the 'king' and 'queen' embeddings with both measures defined above.
vec_king, vec_queen = embeddings['king'], embeddings['queen']

cos_sim = cosine_similarity(vec_king, vec_queen)
euc_dist = euclidean_distance(vec_king, vec_queen)

print(
    f"Cosine similarity (king, queen): {cos_sim:.7f}",
    f"Euclidean distance (king, queen): {euc_dist:.7f}",
    sep="\n",
)


Cosine similarity (king, queen): 0.6510956
Euclidean distance (king, queen): 2.4796925


In [26]:
def find_country_for_city(city1: str, country1: str, city2: str, embeddings: dict):
    """
    Find the word that best completes the analogy city1 : country1 = city2 : ?.

    city1: first word of the known pair
    country1: second word of the known pair
    city2: word whose counterpart is searched for
    embeddings: dictionary {word: vector}

    Returns:
    tuple: (best_word, similarity_score); (None, -inf) when embeddings holds
    no word other than the three inputs

    Raises:
    ValueError: if one of the three input words is missing from embeddings
    """
    query_words = (city1, country1, city2)

    # Report the first missing input word, checked in argument order.
    unknown = [word for word in query_words if word not in embeddings]
    if unknown:
        raise ValueError(f"'{unknown[0]}' niet gevonden in embeddings.")

    # Keep this exact operation order: it fixes the floating-point result.
    target_vector = embeddings[city2] - embeddings[city1] + embeddings[country1]

    excluded = set(query_words)
    winner, top_score = None, -float('inf')

    for candidate in embeddings:
        if candidate in excluded:
            continue  # the input words themselves never count as an answer
        score = cosine_similarity(target_vector, embeddings[candidate])
        if score > top_score:
            winner, top_score = candidate, score

    return winner, top_score


In [27]:
# Reference answer for the analogy Athens : Greece = Cairo : ?
expected_country = "Egypt"
expected_similarity = 0.7626822

result = find_country_for_city("Athens", "Greece", "Cairo", embeddings)
print(f"Resultaat: {result}")

assert result[0] == expected_country, f"Fout: verwacht {expected_country}, kreeg {result[0]}"

# The score is compared with an absolute tolerance of 1e-7.
similarity_error = abs(result[1] - expected_similarity)
assert similarity_error < 1e-7, f"Fout: verwacht {expected_similarity}, kreeg {result[1]}"

print("\n Test geslaagd!")


Resultaat: ('Egypt', np.float32(0.7626822))

 Test geslaagd!


In [31]:
def find_country_for_city_candidates(city1, country1, city2, embeddings, country_candidates):
    """
    Like find_country_for_city, but only words in country_candidates can win.

    NOTE(review): this helper was called below but not defined anywhere in the
    notebook, so the cell failed with NameError on a fresh kernel. This
    definition is a reconstruction; confirm that excluding the three input
    words matches the behaviour of the lost original.

    city1, country1: known (city, country) pair
    city2: city whose country is searched for
    embeddings: dictionary {word: vector}
    country_candidates: iterable of words that are allowed as an answer

    Returns:
    tuple: (best_word, similarity_score); (None, -inf) if no candidate is left
    """
    target_vector = embeddings[city2] - embeddings[city1] + embeddings[country1]
    excluded = {city1, country1, city2}

    best_word, best_similarity = None, -float('inf')
    # Sorted iteration makes tie-breaking independent of set ordering.
    for candidate in sorted(country_candidates):
        if candidate in excluded or candidate not in embeddings:
            continue
        sim = cosine_similarity(target_vector, embeddings[candidate])
        if sim > best_similarity:
            best_word, best_similarity = candidate, sim

    return best_word, best_similarity


# Only countries that occur in the data set and have an embedding can be predicted.
country_candidates = {
    country
    for country in set(df['country1']) | set(df['country2'])
    if country in embeddings
}

used = 0
skipped = 0
correct = 0
predictions = []

for row in df.itertuples(index=False):
    city1, country1, city2, gold_country = row.city1, row.country1, row.city2, row.country2

    # Rows with a missing token (NaN) or a word without an embedding cannot be
    # evaluated; count them instead of letting the lookup raise.
    if any(word not in embeddings for word in (city1, country1, city2)):
        skipped += 1
        continue

    pred_country, sim = find_country_for_city_candidates(
        city1, country1, city2, embeddings, country_candidates
    )
    predictions.append((gold_country, pred_country, sim, (city1, country1, city2)))
    correct += int(pred_country == gold_country)
    used += 1

# Guard against division by zero when every row was skipped.
accuracy = correct / used if used else 0.0
print(f"Accuracy: {accuracy:.4f}  (verwacht ~0.92)")
print(f"Gebruikte rijen: {used}, overgeslagen: {skipped}")

Accuracy: 0.9251  (verwacht ~0.92)


city1       Athens_Greece_Baghdad_Iraq
country1                           nan
city2                              nan
country2                           nan
Name: 0, dtype: object
Athens_Greece_Baghdad_Iraq in embeddings? False
nan in embeddings? False
nan in embeddings? False
nan in embeddings? False
