# Eindopdracht 2: Machine Translation & Document Search

Naam: Sietse Neve

Studentnummer: 1810364

In [None]:
import pickle
import numpy as np

# -----------------------
# 1. Load the data
# -----------------------

def _load_pairs(path):
    """Read whitespace-separated (English, French) word pairs from a text file.

    Blank lines (for example a trailing empty line) are skipped instead of
    triggering an unpacking error.

    Parameters
    ----------
    path : str
        Path of a UTF-8 text file with one word pair per line.

    Returns
    -------
    list of tuple
        The (English, French) pairs in file order.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly two tokens.
    """
    pairs = []
    with open(path, "r", encoding="utf-8") as handle:
        for line_no, line in enumerate(handle, start=1):
            tokens = line.split()
            if not tokens:
                continue  # blank line: nothing to parse
            if len(tokens) != 2:
                raise ValueError(
                    f"{path}:{line_no}: expected 2 whitespace-separated "
                    f"words, got {len(tokens)}"
                )
            pairs.append((tokens[0], tokens[1]))
    return pairs


# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load embedding files that come from a trusted source.

# English embeddings (indexed by word further down in this script)
with open("en_embeddings.p", "rb") as f:
    en_embeddings = pickle.load(f)

# French embeddings (indexed by word further down in this script)
with open("fr_embeddings.p", "rb") as f:
    fr_embeddings = pickle.load(f)

# Train and test sets of (English, French) word pairs
train_pairs = _load_pairs("en-fr.train.txt")
test_pairs = _load_pairs("en-fr.test.txt")

# -----------------------
# 2. Lossfunctie & gradient
# -----------------------

def loss_and_gradient(W, X, Y):
    """
    Compute the mean squared alignment loss and its gradient with respect to W.

    The loss is the squared Frobenius norm of ``X @ W - Y`` divided by the
    number of rows of X.

    Parameters
    ----------
    W : np.ndarray
        Current transformation matrix (n x n)
    X : np.ndarray
        Source vectors (m x n)
    Y : np.ndarray
        Target vectors (m x n)

    Returns
    -------
    loss : float
        Sum of squared residuals divided by m
    gradient : np.ndarray
        Gradient of the loss with respect to W
    """
    n_rows = X.shape[0]
    residual = X @ W - Y

    # Scale X.T before the matrix product, the same evaluation order as
    # (2 / m) * X.T @ residual, so results stay bit-identical.
    scaled_xt = (2 / n_rows) * X.T
    gradient = scaled_xt @ residual

    loss = np.sum(residual ** 2) / n_rows
    return loss, gradient

# -----------------------
# 3. Train/test filteren en matrices bouwen
# -----------------------
def filter_pairs(pairs, en_emb, fr_emb):
    """Return only the pairs whose words both have an embedding.

    A pair (en, fr) is kept when ``en`` is in ``en_emb`` and ``fr`` is in
    ``fr_emb``; the original order is preserved.
    """
    return [
        (src, tgt)
        for src, tgt in pairs
        if src in en_emb and tgt in fr_emb
    ]

# Drop every pair for which one of the two words has no embedding.
train_pairs_f = filter_pairs(train_pairs, en_embeddings, fr_embeddings)
test_pairs_f = filter_pairs(test_pairs, en_embeddings, fr_embeddings)

# Report how many pairs survived the filtering in each split.
print(f"Train: {len(train_pairs_f)} kept")
print(f"Test : {len(test_pairs_f)} kept")

# One row per training pair: English vectors in X_train, the matching
# French vectors in Y_train.
X_train = np.array([en_embeddings[src] for src, _ in train_pairs_f])  # (m, n)
Y_train = np.array([fr_embeddings[tgt] for _, tgt in train_pairs_f])  # (m, n)

# The transformation starts out as the identity matrix.
n_dim = X_train.shape[1]
W = np.eye(n_dim)

# -----------------------
# 4. Gradient descent (400 stappen, lr=0.8)
# -----------------------
learning_rate = 0.8
steps = 400

for step in range(steps):
    # The loss is evaluated with the W from before this step's update.
    loss, grad = loss_and_gradient(W, X_train, Y_train)
    W -= learning_rate * grad  # in-place gradient step

    # Log every 50th step and also the very last one.
    if step == steps - 1 or step % 50 == 0:
        print(f"Stap {step}: loss = {loss:.4f}")

# -----------------------
# 5. Cosine similarity (helper)
# -----------------------
def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Parameters
    ----------
    a, b : array_like
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        ``dot(a, b) / (|a| * |b|)``. When either vector has zero length the
        cosine is undefined; 0.0 is returned instead of NaN.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid the 0/0 division that would yield NaN and a RuntimeWarning.
        return 0.0
    return np.dot(a, b) / denominator

# -----------------------
# 6. KNN (k=1) met vectorisatie over alle Franse woorden
# -----------------------
# Build a matrix holding every French embedding (one per row) together with
# the word list that maps a row index back to its word.
fr_words = list(fr_embeddings)
F = np.array([fr_embeddings[w] for w in fr_words])          # (N_fr, n)

# Scale each row to unit length so a dot product with a unit vector is the
# cosine similarity. An all-zero row would otherwise be divided by zero and
# turn into NaN, which can then win the argmax in translate(); such rows are
# left as zeros (similarity 0) instead.
_fr_row_norms = np.linalg.norm(F, axis=1, keepdims=True)
F_norm = F / np.where(_fr_row_norms == 0, 1.0, _fr_row_norms)

def translate(word, W, en_embeddings, fr_words, F_norm):
    """Translate an English word to its nearest French word (k = 1).

    The English embedding of ``word`` is multiplied by ``W``, scaled to unit
    length, and compared against every row of ``F_norm`` with a dot product.
    The entry of ``fr_words`` at the index of the highest score is returned.
    """
    projected = en_embeddings[word] @ W                      # (n,)
    unit = projected / np.linalg.norm(projected)             # unit vector
    scores = F_norm @ unit                                   # (N_fr,)
    best = int(np.argmax(scores))
    return fr_words[best]

# -----------------------
# 7. Accuracy op gefilterde testset
# -----------------------
# Collect the prediction for every test pair that was translated wrongly.
incorrect = []
for en, fr in test_pairs_f:
    pred = translate(en, W, en_embeddings, fr_words, F_norm)
    if pred != fr:
        incorrect.append(pred)

# Every pair that is not in the wrong list was translated correctly.
correct = len(test_pairs_f) - len(incorrect)

# Guard against an empty test set to avoid dividing by zero.
if test_pairs_f:
    accuracy = correct / len(test_pairs_f)
else:
    accuracy = 0.0
print(f"Accuracy: {accuracy:.4f}")

