In [None]:
# Sample French passage used as the input text for the tokenization and
# embedding cells of this notebook. The literal starts with a newline, so
# character-based slices such as text[:750] include that leading newline.
text = """
Un jour, il y eut une petite fille qui vivait dans une forêt dense et mystérieuse.
Elle avait des cheveux noirs que la lumière du soleil ne pouvait pas éclaircir, et
ses yeux étaient un bleu plus profond qu'un lac au fond d'une grotte. Elle était
solitaire et aimait les oiseaux, qui volaient dans les arbres de sa forêt, et les
animaux, qui se balançaient à travers la végétation. Un jour, elle trouva un chat
sauvage au bord du chemin. C'était le plus beau chat qu'elle avait jamais vu, avec
des yeux vifs et des oreilles pointues. Elle l'adopta immédiatement, sans se soucier
de ce qu'en disent d'autres personnes. Les années ont passé et la petite fille est
devenue une jeune femme, mais le chat était toujours là avec elle, gardant sa
compagnie et protégeant son cœur. Elle était heureuse et ne voulait jamais quitter
sa forêt mystérieuse. Cependant, un jour, elle reçut une lettre qui lui ordonnait
de partir pour une autre ville loin de la forêt.
"""

In [None]:
from sentence_transformers import SentenceTransformer

# Load the MiniLM sentence encoder, caching its files under /project/models/.
model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    cache_folder="/project/models/",
)

# Tokenize progressively shorter character prefixes of `text` and print how
# many input ids each one produces. After the loop `tokens` holds the result
# for the last (750-character) prefix, which the following cells inspect.
for prefix_length in (1000, 900, 750):
    tokens = model.tokenize([text[:prefix_length]])
    print(len(tokens["input_ids"][0]))

In [None]:
# Show the 750-character prefix of `text` (the same slice that is tokenized
# and encoded in the neighbouring cells).
print(text[:750])

In [None]:
# List the fields present in the tokenizer output currently held in `tokens`.
tokens.keys()

In [None]:
# Display the "input_ids" field of the tokenizer output.
tokens["input_ids"]

In [None]:
# Display the "token_type_ids" field of the tokenizer output.
tokens["token_type_ids"]

In [None]:
# Display the "attention_mask" field of the tokenizer output.
tokens["attention_mask"]

In [None]:
# Encode the 750-character prefix of `text` with the currently loaded model
# and display the first 10 entries of the result.
embedding = model.encode(text[:750])
embedding[:10]

In [None]:
# Short passages keyed by a descriptive label. The labels are reused later in
# the notebook as the DataFrame index / plot annotations and as Chroma
# metadata. The first passage is French; the others are English.
sentences = {
    "les_chiens_sont_fideles": "Les chiens, compagnons fidèles et dévoués, ont depuis toujours été les plus loyaux amis de l’homme, tissant avec lui un lien indéfectible fondé sur l’amour inconditionnel, la protection et une complicité inégalée. Que ce soit en tant que chiens de travail, de garde ou simplement de compagnie, ils se distinguent par leur intelligence, leur empathie et leur capacité à ressentir les émotions humaines, souvent mieux que bien des êtres humains eux-mêmes. Leur joie communicative, leur enthousiasme débordant à chaque retrouvaille et leur affection sans réserve font d’eux des alliés incomparables dans la vie quotidienne. Ils sont là dans les moments de bonheur, partageant notre excitation, et encore plus présents dans les instants difficiles, apportant réconfort et chaleur par leur simple présence. ",
    "dogs_are_loyal": "Dogs, faithful and devoted companions, have always been the most loyal friends of man, weaving an unbreakable bond with him based on unconditional love, protection, and unparalleled companionship. Whether as working dogs, guard dogs, or simply pets, they stand out for their intelligence, empathy, and ability to sense human emotions—often better than many humans themselves. Their contagious joy, boundless enthusiasm at every reunion, and unwavering affection make them incomparable allies in daily life. They are there in moments of happiness, sharing our excitement, and even more present in difficult times, offering comfort and warmth through their mere presence.",
    "dogs_aint_just_pets": "Dogs? Man, they ain't just pets—they’re the realest ride-or-die homies you’ll ever have. No fakin’, no switchin’ sides, just straight-up loyalty, love, and protection. Whether they’re out here working, guarding the spot, or just chillin’ at home, these four-legged legends got instincts sharper than most people. They can read your vibe, feel your moods, and they’ll stick by you no matter what. Pull up after a long day? They’re hyped like you won the lottery. Feeling down? They’re right there, no questions asked, giving you that quiet, unshakable support. Dogs don’t judge, don’t lie, and never leave you hanging—they’re the real OGs of companionship.",
    "wolves_hunting": "A wolf pack, usually composed of an alpha pair, their offspring, and a few other adult members, operates like a well-coordinated unit. Their first step in a hunt is selection. They scan the landscape, looking for signs of weak or vulnerable prey—an injured deer, an old elk, or a young bison lagging behind the herd. Once identified, the wolves move into position, spreading out to surround their target while remaining undetected.",
    "wolf_taming": "I had been following a set of tracks near the river, my breath forming white clouds in the crisp winter air. The storm from the night before had left the forest hushed, its usual sounds muffled beneath thick layers of white. That’s when I saw him—a lone wolf, barely more than a pup, his fur matted with ice, his ribs showing through his silver coat.",
    "unlucky_shopping": "I only needed a few things—bread, milk, and a bottle of wine for dinner—but, of course, the express checkout was packed with people who clearly had more than ten items. I picked the shortest line, hoping for the best, but the person in front of me decided to argue about a discount that didn’t apply. Minutes dragged on, my patience wearing thin.",
    "lucky_shopping": """The store smelled of freshly baked bread, and the distant hum of chatter filled the air. As I wandered down the aisles, I noticed a bright red sign that caught my attention: "LIMITED-TIME PROMOTION – 90% OFF SELECT ITEMS!" My heart skipped a beat. I wasn't expecting to find any deals today, but this was something else. Curious, I hurried toward the section where a small crowd had already gathered. There, stacked neatly on the shelves, were premium chocolates, imported coffee, and high-end olive oil—all marked down to ridiculous prices. A woman next to me grabbed two bottles of extra virgin olive oil and smiled. "This must be a mistake," I muttered under my breath" """,
}
# Encode all passages in a single call, in dict order, and display the result.
vectors = model.encode(list(sentences.values()))
vectors

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# One row per passage, labelled with its key from `sentences`.
df = pd.DataFrame(vectors, index=list(sentences))

# Standardize every embedding dimension, then project the passages onto the
# first two principal components.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
pca = PCA(n_components=2)
principal_components = pca.fit_transform(df_scaled)
df_pca = pd.DataFrame(principal_components, index=df.index, columns=['PC1', 'PC2'])

# Axis labels report the percentage of variance captured by each component.
explained_var = 100 * pca.explained_variance_ratio_
xlabel = f'PC 1, explained variance: {explained_var[0]:.2f}%'
ylabel = f'PC 2, explained variance: {explained_var[1]:.2f}%'

# Scatter plot of the projected passages.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df_pca['PC1'], df_pca['PC2'])

# Write each passage's label next to its point.
for point in df_pca.itertuples():
    ax.annotate(point.Index, (point.PC1, point.PC2), fontsize=12, ha='right', va='bottom')

ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_title('PCA of the embedding space\nencoder: all-MiniLM-L6-v2')
plt.show()


In [None]:
import uuid

import chromadb
from sentence_transformers import SentenceTransformer

# Open an HTTP client against the ChromaDB server at chroma:8000.
chroma_client = chromadb.HttpClient(
    host="chroma",
    port=8000,
)

# Fetch the "tests" collection, creating it first when it does not exist yet.
collection = chroma_client.get_or_create_collection(
    name="tests",
)

In [None]:
# Store one record per passage in the "tests" collection.
# - ids: fresh random identifiers (two uuid4 hex strings concatenated), so
#   re-running this cell adds new records instead of replacing earlier ones.
# - embeddings: the vectors produced by the most recent `model.encode` call.
# - documents: the passage text itself, so that dumping the collection's
#   "documents" field shows the text instead of missing values.
# - metadatas: the passage label, kept under the existing "text" key.
collection.add(
    ids=[uuid.uuid4().hex + uuid.uuid4().hex for _ in sentences],
    embeddings=vectors,
    documents=list(sentences.values()),
    metadatas=[{"text": name} for name in sentences],
)

In [None]:
import chromadb

# Reconnect to the ChromaDB server and look up the existing "tests" collection.
chroma_client = chromadb.HttpClient(host="chroma", port=8000)
collection = chroma_client.get_collection("tests")

# Fetch the stored records, requesting embeddings, documents and metadata
# alongside the ids.
all_items = collection.get(include=["embeddings", "documents", "metadatas"])

# Print each field of the result under its own heading.
for heading, field in (
    ("IDs:", "ids"),
    ("Embeddings:", "embeddings"),
    ("Metadata:", "metadatas"),
    ("Documents:", "documents"),
):
    print(heading, all_items[field])

In [None]:
# Load the mpnet sentence encoder and re-encode the same passages.
# NOTE: this rebinds `model` and `vectors` for any cell executed afterwards.
model = SentenceTransformer(
    'sentence-transformers/all-mpnet-base-v2',
    cache_folder="/project/models/",
)
vectors = model.encode(list(sentences.values()))

# One row per passage, labelled with its key from `sentences`.
df = pd.DataFrame(vectors, index=list(sentences))

# Standardize every embedding dimension, then project the passages onto the
# first two principal components.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
pca = PCA(n_components=2)
principal_components = pca.fit_transform(df_scaled)
df_pca = pd.DataFrame(principal_components, index=df.index, columns=['PC1', 'PC2'])

# Explained variance ratio expressed as a percentage for the axis labels.
explained_var = 100 * pca.explained_variance_ratio_
xlabel = f'PC 1, explained variance: {explained_var[0]:.2f}%'
ylabel = f'PC 2, explained variance: {explained_var[1]:.2f}%'

# Scatter plot of the projected passages.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df_pca['PC1'], df_pca['PC2'])

# Write each passage's label next to its point.
for point in df_pca.itertuples():
    ax.annotate(point.Index, (point.PC1, point.PC2), fontsize=12, ha='right', va='bottom')

ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_title('PCA of the embedding space\nencoder: all-mpnet-base-v2')
plt.show()


In [None]:
import chromadb

# Connect to the ChromaDB server and print the collections present before cleanup.
client = chromadb.HttpClient(host = "chroma", port = 8000)
print(client.list_collections())

# Delete the "tests" collection, then print the collection list again so the
# removal can be checked by eye.
client.delete_collection("tests")
print(client.list_collections())