In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from qdrant_client.http import models
from myAgent.qdrant import qdrant
from myAgent.ollama import embed
import uuid

In [None]:
# Language code -> numeric language id used to filter the CSV tables below.
all_lang = {"fr": 5, "en": 9}

# Raw tables: species names and species flavor texts, all languages mixed.
all_names = pd.read_csv("pokemon_species_names.csv")
all_flavors = pd.read_csv("pokemon_species_flavor_text.csv")

# Cosine similarity above which two description vectors are considered redundant.
threshold = 0.9

In [None]:
def _non_redundant_mask(vectors, max_similarity):
    """Return a keep-mask that drops vectors too similar to an earlier kept one.

    Vectors are scanned in order. A vector is kept unless its cosine
    similarity with a vector that was itself *kept* exceeds ``max_similarity``.
    Comparing only against kept vectors avoids discarding a description just
    because it resembles one that has already been thrown away.

    Parameters
    ----------
    vectors : sequence of equal-length numeric sequences
    max_similarity : float
        Similarity strictly above this value marks a vector as redundant.

    Returns
    -------
    list[bool]
        One flag per input vector; the first vector is always kept.
    """
    if len(vectors) < 2:
        return [True] * len(vectors)

    similarity = cosine_similarity(np.asarray(vectors, dtype=float))
    keep = []
    for i in range(len(vectors)):
        is_redundant = any(
            keep[j] and similarity[i, j] > max_similarity for j in range(i)
        )
        keep.append(not is_redundant)
    return keep


# The embedding size does not depend on the language: probe the model once.
embedding_size = len(embed("lorem ipsum"))

for lang in ["fr", "en"]:

    names = all_names[all_names["local_language_id"] == all_lang[lang]]
    flavors = all_flavors[all_flavors["language_id"] == all_lang[lang]]
    collection_descriptions = f"description_{lang}"
    collection_names = f"name_{lang}"

    # Drop and recreate both collections so every run starts from empty ones.
    qdrant.delete_collection(collection_name=collection_descriptions)
    qdrant.delete_collection(collection_name=collection_names)
    qdrant.create_collection(
        collection_name=collection_descriptions,
        vectors_config=models.VectorParams(
            size=embedding_size,
            distance=models.Distance.COSINE,
        ),
    )
    # The name collection stores a constant 1-d vector per point
    # (presumably it is only used for its payload -- TODO confirm).
    qdrant.create_collection(
        collection_name=collection_names,
        vectors_config=models.VectorParams(
            size=1,
            distance=models.Distance.COSINE,
        ),
    )

    # Group the flavor texts once instead of re-filtering the frame per species.
    flavors_by_species = {
        key: texts.tolist()
        for key, texts in flavors.groupby("species_id")["flavor_text"]
    }

    # One row per species (first occurrence), in order of first appearance.
    species_rows = names.drop_duplicates(subset="pokemon_species_id")[
        ["pokemon_species_id", "name", "genus"]
    ]

    # `species_id` rather than `id`: avoids shadowing the builtin in the kernel.
    for species_id, name, genus in species_rows.itertuples(index=False, name=None):

        # Collect the descriptions, flattening line breaks; skip non-text cells
        # (e.g. NaN from an empty CSV field) that would break `.replace`.
        flavor_list = [
            text.replace("\r", "").replace("\n", " ")
            for text in flavors_by_species.get(species_id, [])
            if isinstance(text, str)
        ]
        # Exact duplicates would be removed by the similarity filter anyway
        # (assuming `embed` is deterministic), so drop them before paying for
        # an embedding call. Order of first occurrence is preserved.
        flavor_list = list(dict.fromkeys(flavor_list))

        if not flavor_list:
            continue

        # embeddings
        vector_list = [embed(text) for text in flavor_list]

        # remove redundant descriptions
        keep = _non_redundant_mask(vector_list, threshold)
        kept = [
            (text, vector)
            for text, vector, flag in zip(flavor_list, vector_list, keep)
            if flag
        ]

        # prepare upsert: one point per retained description
        points = [
            models.PointStruct(
                # random 64-bit integer id (upper half of a UUID4)
                id=uuid.uuid4().int >> 64,
                vector=vector,
                payload={
                    "id": int(species_id),
                    "name": name,
                    "type": genus,
                    "description": text,
                },
            )
            for text, vector in kept
        ]

        # upsert
        qdrant.upsert(
            collection_name=collection_descriptions,
            points=points,
        )

        # also keep the pokemon name, keyed by its species id
        qdrant.upsert(
            collection_name=collection_names,
            points=[
                models.PointStruct(
                    id=int(species_id),
                    vector=[0.0],
                    payload={"name": name},
                )
            ],
        )

In [None]:
# Project the French description embeddings onto their first two principal
# components and save the labelled scatter plot to "pca.png".

# The notebook creates "description_<lang>" / "name_<lang>" collections; the
# embeddings live in the description collection. Only the first page
# (at most 300 points) returned by scroll is plotted.
all_points = qdrant.scroll(
    collection_name="description_fr",
    limit=300,
    with_payload=True,
    with_vectors=True,
)[0]  # scroll returns (points, next_page_offset)

if len(all_points) < 2:
    raise ValueError(
        "Need at least 2 points in 'description_fr' to compute a 2-component PCA."
    )

# Label each point with the pokemon name, falling back to the point id.
ids = [point.payload.get("name", point.id) for point in all_points]
embeddings = np.array([point.vector for point in all_points])

pca = PCA(n_components=2)
pca_result = pca.fit_transform(embeddings)
explained_variance = pca.explained_variance_ratio_ * 100  # as percentages

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.7)
for (x, y), name in zip(pca_result, ids):
    ax.annotate(name, (x, y), fontsize=8, alpha=0.8)
ax.set_xlabel(f"PC1 ({explained_variance[0]:.2f}%)")
ax.set_ylabel(f"PC2 ({explained_variance[1]:.2f}%)")
ax.set_title("PCA Projection of Pokémon Embeddings")
ax.grid(True)
fig.savefig("pca.png", dpi=150, bbox_inches="tight")
# Close the figure: the result is the saved file, not an inline display.
plt.close(fig)