In [20]:
from aleph_alpha_client import Client, SemanticEmbeddingRequest, SemanticEmbeddingResponse, SemanticRepresentation, Prompt, TextControl
from scipy import spatial
import numpy as np
import os
from dotenv import load_dotenv

from langchain.llms import AlephAlpha
from langchain.embeddings import AlephAlphaSymmetricSemanticEmbedding, AlephAlphaAsymmetricSemanticEmbedding

from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Read the Aleph Alpha API token and fail fast with a clear message when it is
# missing, instead of silently constructing a client with token=None.
aa_token = os.getenv("AA_TOKEN")
if not aa_token:
    raise RuntimeError(
        "AA_TOKEN is not set. Export it or add it to a .env file before running this notebook."
    )

client = Client(token=aa_token)

## Let's use Luminous embeddings as a classifier

In [3]:
# Example sentences for the two classes we want to tell apart.

# Class 1: requests about a lost ID that needs to be replaced.
class_1 = [
    "I want to get a new ID card, because I lost mine.",
    "Where can I request a new ID?",
    "My personal Identification document is lost, I need a new one.",
    "Please help me, my ID has gone missing.",
]

# Class 2: requests about registering (as a new citizen or at a new address).
class_2 = [
    "I am required to register as a new citizen.",
    "I just moved to this country, how do I register?",
    "Could you please help me register as a new citizen?",
    "I have a new address, how do I register it?",
]

### Next, we generate embeddings for each class

In [60]:
# Embed the example sentences of both classes and the new query sentence
# with the Aleph Alpha client, using one shared helper instead of three
# copies of the same request-building expression.
EMBEDDING_MODEL = "luminous-base"


def embed_symmetric(text, model=EMBEDDING_MODEL):
    """Return the symmetric semantic embedding of ``text``.

    Args:
        text: The sentence to embed.
        model: Model name forwarded to ``client.semantic_embed``.

    Returns:
        The ``embedding`` attribute of the response returned by
        ``client.semantic_embed``.
    """
    request = SemanticEmbeddingRequest(
        prompt=Prompt.from_text(text),
        representation=SemanticRepresentation.Symmetric,
    )
    return client.semantic_embed(request, model=model).embedding


embeddings_class_1 = [embed_symmetric(text) for text in class_1]
embeddings_class_2 = [embed_symmetric(text) for text in class_2]

# The query sentence is German ("I lost my ID, I need a new one."), while the
# class examples above are English.
new_sentence = "Ich habe meinen Ausweis verloren, ich brauche einen neuen."
sentence_embedding = embed_symmetric(new_sentence)

In [61]:
def _similarity_to_sentence(embedding):
    """Return the cosine similarity between ``sentence_embedding`` and ``embedding``."""
    # scipy returns the cosine *distance*; similarity is its complement.
    return 1 - spatial.distance.cosine(sentence_embedding, embedding)


# Cosine similarity of the new sentence to every example of each class.
similarities_class_1 = list(map(_similarity_to_sentence, embeddings_class_1))
similarities_class_2 = list(map(_similarity_to_sentence, embeddings_class_2))

# Average the per-example similarities to get one score per class.
avg_similarity_class_1 = np.mean(similarities_class_1)
avg_similarity_class_2 = np.mean(similarities_class_2)

print("Similarity to class 1: ", avg_similarity_class_1)
print("Similarity to class 2: ", avg_similarity_class_2)

Similarity to class 1:  0.8308171888848571
Similarity to class 2:  0.49238100823376585


### Let's actually train a classifier on these embeddings

In [62]:
# Training data: every class embedding as a feature vector, labelled
# 0 for class_1 examples and 1 for class_2 examples (same order as X).
training_embeddings = embeddings_class_1 + embeddings_class_2
X = [np.array(vector) for vector in training_embeddings]
labels = [0] * len(embeddings_class_1) + [1] * len(embeddings_class_2)
y = np.array(labels)

# k-nearest-neighbours classifier configured with n_neighbors=3.
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X, y)

# Classify the new sentence; predict expects a list of samples.
predicted_class = clf.predict([sentence_embedding])
print("Predicted class: ", predicted_class)

Predicted class:  [0]
