### Setup: packages en model laden (voer bij een nieuwe omgeving eerst de `pip install`-cel uit, daarna de imports)

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import umap
import matplotlib.pyplot as plt


In [None]:
!pip install sentence-transformers torch pandas scikit-learn umap-learn matplotlib


###  Laad het model

In [None]:
# Load the Jina v3 embedding model. trust_remote_code=True allows executing the
# custom modelling code shipped with the model repository. The retrieval task
# is not fixed here; it is passed per encode() call.
model = SentenceTransformer(
    "jinaai/jina-embeddings-v3",
    trust_remote_code=True,
)

###  Laad de dataset (csv)

In [None]:
# Read the article dataset from disk
df = pd.read_csv('your_dataset.csv')  # <-- put the "articles_cleaned" CSV file here

# Build the text to embed for each article: title, description and content,
# with missing values replaced by empty strings and the parts joined by ". "
text_parts = [df[column].fillna('') for column in ('title', 'description', 'content')]
df['combined_text'] = text_parts[0] + ". " + text_parts[1] + ". " + text_parts[2]


### Genereer embeddings voor ALLE artikelen

In [None]:
import numpy as np

# Embed the combined text of every article
texts = df['combined_text'].tolist()

# Generate passage-side embeddings. Per the jina-embeddings-v3 model card,
# Sentence-Transformers usage passes both `task` (selects the task adapter) and
# `prompt_name` (prepends the matching instruction prefix); without
# `prompt_name` the passage prefix is not applied.
embeddings = model.encode(
    texts,
    task='retrieval.passage',
    prompt_name='retrieval.passage',
    batch_size=64,
    show_progress_bar=True,
)

# Persist the embeddings as a NumPy array so they can be reloaded with np.load
np.save('article_embeddings.npy', embeddings)


#### EV-query + Similarities berekenen

In [None]:
# EV-related search queries
queries = [
    "elektrische auto's",
    "laadpalen en accutechnologie",
    "elektrisch rijden subsidies en beleid",
]

# Embed the queries on the query side of the retrieval task. Per the
# jina-embeddings-v3 model card, Sentence-Transformers usage passes both `task`
# (selects the task adapter) and `prompt_name` (prepends the matching
# instruction prefix); without `prompt_name` the query prefix is not applied.
query_embeddings = model.encode(
    queries,
    task='retrieval.query',
    prompt_name='retrieval.query',
)

# Collapse the individual queries into a single vector by averaging them
query_embedding = query_embeddings.mean(axis=0)

# Cosine similarity between the averaged query vector and every article
# embedding; [0] selects the single row that belongs to the one query vector
similarities = cosine_similarity([query_embedding], embeddings)[0]

# Attach the score to each article
df['similarity'] = similarities

# Optional: show the ten most similar articles
df_sorted = df.sort_values('similarity', ascending=False)
print(df_sorted[['title', 'similarity']].head(10))


#### (Optioneel) UMAP + Scatterplot

In [None]:
# Project the article embeddings to 2D with UMAP (cosine metric, fixed seed)
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)
embedding_2d = reducer.fit_transform(embeddings)

# Store the 2D coordinates next to each article
df['x'], df['y'] = embedding_2d[:, 0], embedding_2d[:, 1]

# Scatter plot of all articles, coloured by their similarity to the EV query
plt.figure(figsize=(10, 8))
scatter = plt.scatter(df['x'], df['y'], c=df['similarity'], cmap="coolwarm", s=30, alpha=0.6)
plt.colorbar(scatter, label="Similarity to EV query")
plt.title("UMAP van Artikelen (Jina Embeddings)", fontsize=14)
plt.show()
