In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from agent.qdrant import qdrant, fill

In [None]:
# Language code -> numeric language id, as matched against the
# `local_language_id` / `language_id` columns of the CSV tables loaded below.
# Built from ordered pairs so the gaps in the id sequence stay visible.
all_lang = dict(
    [
        ("ja", 1),
        # id 2 is incomplete
        ("ko", 3),
        ("zh-Hant", 4),
        ("fr", 5),
        ("de", 6),
        ("es", 7),
        ("it", 8),
        ("en", 9),
        # id 10 is missing
        # id 11 is a duplicate
        ("zh-Hans", 12),
    ]
)

# Raw tables, one row per (species, language); paths are relative to the
# notebook's working directory.
all_names = pd.read_csv(filepath_or_buffer="pokemon_species_names.csv")
all_flavors = pd.read_csv(filepath_or_buffer="pokemon_species_flavor_text.csv")

In [None]:
# For each listed language, keep only that language's rows from both tables
# and pass them to `fill` along with the language code.
# NOTE(review): `fill` comes from agent.qdrant; presumably it populates the
# per-language collection (e.g. "pokemon_fr") — confirm against that module.
for lang in ("fr", "de", "es", "it", "en"):
    lang_id = all_lang[lang]
    names = all_names.loc[all_names["local_language_id"] == lang_id]
    flavors = all_flavors.loc[all_flavors["language_id"] == lang_id]
    fill(names, flavors, lang)

In [None]:
# Pull one page of points (payload + vector) from the French collection.
# Only element 0 of the scroll result (the list of points) is kept.
# NOTE(review): with limit=300 and no follow-up page request, any points
# beyond the first 300 are not plotted — confirm that is intended.
all_points = qdrant.scroll(
    collection_name="pokemon_fr",
    limit=300,
    with_payload=True,
    with_vectors=True,
)[0]

# A 2-component PCA cannot be fitted on fewer than two samples; fail with a
# readable message instead of an opaque shape error from scikit-learn.
if len(all_points) < 2:
    raise ValueError(
        "Need at least 2 points from 'pokemon_fr' to fit a 2-component PCA, "
        f"got {len(all_points)}"
    )

# One row per point; the label is the payload's "name", falling back to the
# point id when the payload has no such key.
embeddings = np.array([point.vector for point in all_points])
ids = [point.payload.get("name", point.id) for point in all_points]

# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(embeddings)
explained_variance = pca.explained_variance_ratio_ * 100  # as percentages

# Explicit figure/axes so this cell does not depend on (or disturb) whatever
# figure happens to be "current" in the pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.7)
for (x, y), name in zip(pca_result, ids):
    ax.annotate(name, (x, y), fontsize=8, alpha=0.8)
ax.set_xlabel(f"PC1 ({explained_variance[0]:.2f}%)")
ax.set_ylabel(f"PC2 ({explained_variance[1]:.2f}%)")
# Title previously contained a mis-decoded "é" ("Pok√©mon").
ax.set_title("PCA Projection of Pokémon Embeddings")
ax.grid(True)

# Written to disk rather than shown inline; close the figure to free memory.
fig.savefig("pca.png", dpi=150, bbox_inches="tight")
plt.close(fig)