In [None]:
# When using a Colab notebook, uncomment the lines below to fetch the data and install the dependencies.
# Note: the raw.githubusercontent.com URLs are used because github.com/.../blob/... pages return HTML, not the file itself.
#!wget https://raw.githubusercontent.com/Aleph-Alpha/examples/main/bootcamp/data.md
#!wget https://raw.githubusercontent.com/Aleph-Alpha/examples/main/requirements.txt
#!pip install -r requirements.txt

In [None]:
from aleph_alpha_client import Client, SemanticEmbeddingRequest, SemanticEmbeddingResponse, SemanticRepresentation, Prompt, TextControl
from scipy import spatial
import numpy as np
import os
from dotenv import load_dotenv

from langchain.llms import AlephAlpha
from langchain.embeddings import AlephAlphaSymmetricSemanticEmbedding, AlephAlphaAsymmetricSemanticEmbedding

import plotly.express as px
import plotly.graph_objects as go

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [None]:
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Read the API token from the environment rather than hardcoding it, and fail
# early with an actionable message instead of creating a client without a token.
aa_token = os.getenv("AA_TOKEN")
if not aa_token:
    raise RuntimeError(
        "AA_TOKEN is not set. Add it to a .env file or export it as an "
        "environment variable before running this notebook."
    )

client = Client(token=aa_token)

## Let's use Luminous embeddings as a classifier


### Step 1:
Define three classes that we want to classify.
If you can't think of any, you can use the following
three classes of incoming emails:
- `class_1` = IT support
- `class_2` = HR
- `class_3` = Sales

Each class should have at least 3 examples.

In [None]:
# Here we define the three classes that we want to classify.
# Each class is a list of example sentences (strings); add at least 3 examples to each class.
class_1 = []

class_2 = []

class_3 = []

### Next we generate embeddings for each class

In [None]:
# TODO: Use the AlephAlpha client to embed the sentences in the three classes.
# Each embeddings_class_N should hold one embedding vector per sentence of the matching class.
# Hint: SemanticEmbeddingRequest / SemanticRepresentation (imported above) are one way to
# request embeddings -- check the aleph_alpha_client documentation for the exact call.
embeddings_class_1 = # TODO create embeddings for class 1
embeddings_class_2 = # TODO create embeddings for class 2
embeddings_class_3 = # TODO create embeddings for class 3


# The sentence we want to classify. Its embedding should be a single vector
# (not a list of vectors): later cells wrap it as [sentence_embedding].
new_sentence = "Hey, my stupid Internet isn't working. Can you help me?"
sentence_embedding = # TODO create embedding for new sentence

Great, now that we have the classes and embeddings, let's see how we can use them to classify a new sentence.
If everything went well, the new sentence should have the highest cosine similarity with the class it belongs to.

In [None]:
# TODO: get the average similarity of the new sentence to each of the three classes

# Each similarities_class_N should contain one cosine similarity per example in that class.
# Hint: scipy's spatial.distance.cosine(a, b) returns the cosine *distance*,
# so the similarity is 1 - spatial.distance.cosine(a, b).
similarities_class_1 = # TODO get the similarities of the new sentence to the embeddings of class 1
similarities_class_2 = # TODO get the similarities of the new sentence to the embeddings of class 2
similarities_class_3 = # TODO get the similarities of the new sentence to the embeddings of class 3

# Average the per-example similarities to get one score per class
avg_similarity_class_1 = np.mean(similarities_class_1)
avg_similarity_class_2 = np.mean(similarities_class_2)
avg_similarity_class_3 = np.mean(similarities_class_3)

# The class with the highest average similarity is the predicted class
print("Similarity to class 1: ", avg_similarity_class_1)
print("Similarity to class 2: ", avg_similarity_class_2)
print("Similarity to class 3: ", avg_similarity_class_3)

### Visualize the data

To better understand what Luminous is doing, let's visualize the data. 
You don't need to understand the code below, but you can see that the data is displayed as a scatter plot.

As you can see, the features `Luminous` extracts can be used to separate the data into several classes.

In [None]:
# Let's use PCA to reduce the dimensionality of the embeddings to 2D

# np.vstack stacks the per-class embeddings row-wise and works for both lists of
# vectors and 2D NumPy arrays (using `+` would add NumPy arrays element-wise
# instead of concatenating them).
pca = PCA(n_components=2)
pca.fit(np.vstack([embeddings_class_1, embeddings_class_2, embeddings_class_3]))
pca_embeddings_class_1 = pca.transform(embeddings_class_1)
pca_embeddings_class_2 = pca.transform(embeddings_class_2)
pca_embeddings_class_3 = pca.transform(embeddings_class_3)
pca_embeddings_new_sentence = pca.transform([sentence_embedding])

# Now let's plot the embeddings from all three classes plus the new sentence.
# Each entry bundles what a single trace needs:
# (legend name, marker color, 2D points, hover texts, original embeddings).
# Keeping the texts per trace ensures every point is labelled with its own sentence.
plot_groups = [
    ("Class 1", "red", pca_embeddings_class_1, class_1, embeddings_class_1),
    ("Class 2", "green", pca_embeddings_class_2, class_2, embeddings_class_2),
    ("Class 3", "blue", pca_embeddings_class_3, class_3, embeddings_class_3),
    ("New sentence", "yellow", pca_embeddings_new_sentence, [new_sentence], [sentence_embedding]),
]

fig = go.Figure()

for group_name, group_color, points_2d, sentences, original_embeddings in plot_groups:
    # Cosine similarity (1 - cosine distance) of every point to the new sentence,
    # computed on the original embeddings rather than on the lossy 2D projection.
    similarities_to_new = [
        1 - spatial.distance.cosine(embedding, sentence_embedding)
        for embedding in original_embeddings
    ]

    fig.add_trace(go.Scatter(
        x=points_2d[:, 0],
        y=points_2d[:, 1],
        mode="markers",
        name=group_name,
        marker=dict(
            size=12,
            color=group_color,
        ),
        text=sentences,
        # customdata carries the numeric similarity so the hover label can format it
        customdata=similarities_to_new,
        hovertemplate=
        "<b>%{text}</b><br><br>" +
        "<i>Similarity to new sentence:</i><br>" +
        "%{customdata:.2f}<br>" +
        "<extra></extra>"
    ))

fig.update_layout(
    title="Embeddings projected to 2D with PCA",
    xaxis_title="Principal component 1",
    yaxis_title="Principal component 2",
)

fig.show()

### Let's actually train a classifier on these embeddings

You don't have to rely only on cosine similarity. You can also train a classifier on these embeddings and use it to predict the class of a new sentence.

In [None]:
# Define a classifier
clf = # TODO create a Nearest Neighbors classifier with 3 neighbors (link to documentation: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
# use the embeddings and the class labels to train a classifier
# X and y must have the same length: one class label in y for every embedding in X
X = # TODO create a list of embeddings
y = # TODO create a list of class labels

# fit the classifier
clf.fit(X, y)

# predict() expects a batch of samples, so the single embedding is wrapped in a list
print("Predicted class: ", clf.predict([sentence_embedding]))

In [None]:
# Let's try a different classifier (reuses X and y from the previous cell)
# NOTE: SVC only supports predict_proba (used at the end of this cell) when it is
# constructed with probability=True.
svm = # TODO create a Support Vector Machine classifier (link to documentation: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)
svm.fit(X, y)

# predict() expects a batch of samples, so the single embedding is wrapped in a list
print("Predicted class: ", svm.predict([sentence_embedding]))

# get the probabilities for each class
print("Probabilities: ", svm.predict_proba([sentence_embedding]))