In [None]:
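# When running in Google Colab, uncomment the lines below to clone the examples repo, install its requirements, and copy the sample data: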
#!git clone https://github.com/Aleph-Alpha/examples.git
#!pip install -r examples/requirements.txt
#!cp examples/bootcamp/data.md data.md

In [None]:
from aleph_alpha_client import Client, SemanticEmbeddingRequest, CompletionRequest, SemanticEmbeddingResponse, SemanticRepresentation, Prompt, TextControl
from scipy import spatial
import numpy as np
import os
from dotenv import load_dotenv

from langchain.llms import AlephAlpha
from langchain.embeddings import AlephAlphaSymmetricSemanticEmbedding, AlephAlphaAsymmetricSemanticEmbedding

In [None]:
# Pick up variables from a local .env file before reading the token.
load_dotenv()

# Fail fast with a clear message instead of constructing a client with
# token=None, which would presumably only fail later on the first API request.
aa_token = os.getenv("AA_TOKEN")
if not aa_token:
    raise RuntimeError(
        "AA_TOKEN is not set. Export it or add it to a .env file before running this notebook."
    )

# Shared Aleph Alpha API client used by the cells below.
client = Client(token=aa_token)

#### Step 1: Use the API to create an embedding of a text
Use the "Semantic Search" API to create an embedding of a text. The API returns a JSON object with the embedding of the text.
You can read more about the semantic search API here: https://docs.aleph-alpha.com/docs/tasks/semantic_embed

#### Step 2: Use the API to find out how similar these three texts are
Use SciPy's cosine distance function `spatial.distance.cosine` to find out how similar the three texts below are. Note that this function returns a *distance*, not a similarity: 0 means the two embeddings point in the same direction, and larger values mean the texts are less alike.
To get the cosine similarity, subtract the distance from 1. A similarity of 1 then means the embeddings point in the same direction, and values near 0 mean the texts are unrelated.

In [None]:
# Example sentences whose pairwise similarity is compared in the next cells.
texts = [
    "The sun is shining",
    "It's pretty sunny today",
    "Her smile shines brightly down upon the south african people",
]

In [None]:
# Embed every text with the Symmetric representation, compressed to size 128.
text_embeddings = []
for text in texts:
    embedding_request = SemanticEmbeddingRequest(
        prompt=Prompt.from_text(text),
        representation=SemanticRepresentation.Symmetric,
        compress_to_size=128,
    )
    embedding_response = client.semantic_embed(embedding_request, model="luminous-base")
    text_embeddings.append(embedding_response.embedding)

# SciPy's `cosine` returns a distance, so similarity = 1 - distance.
similarity_1_2 = 1 - spatial.distance.cosine(text_embeddings[0], text_embeddings[1])
similarity_1_3 = 1 - spatial.distance.cosine(text_embeddings[0], text_embeddings[2])
similarity_2_3 = 1 - spatial.distance.cosine(text_embeddings[1], text_embeddings[2])

# Same label format for every pair, matching the LangChain comparison further
# down so the two outputs line up. Only the first 10 characters of each text
# are shown to keep the lines short.
print(f"Similarity between {texts[0][:10]} and {texts[1][:10]}", similarity_1_2)
print(f"Similarity between {texts[0][:10]} and {texts[2][:10]}", similarity_1_3)
print(f"Similarity between {texts[1][:10]} and {texts[2][:10]}", similarity_2_3)

In [None]:
# The same pairwise comparison, this time through LangChain's symmetric embedding wrapper.
embeddings = AlephAlphaSymmetricSemanticEmbedding()
text_embeddings = embeddings.embed_documents(texts)

# Visit every unordered pair (i < j) exactly once.
n_texts = len(texts)
for i in range(n_texts):
    for j in range(i + 1, n_texts):
        cosine_distance = spatial.distance.cosine(text_embeddings[i], text_embeddings[j])
        similarity = 1 - cosine_distance
        print(f"Similarity between {texts[i][:10]} and {texts[j][:10]}", similarity)

#### Step 3: Use the API on an asymmetric embedding case to find the answer to the question
Asymmetric embeddings are useful when you want to find the answer to a question. For example, to answer the question "What is the capital of France?", you embed the question as a query and each candidate text as a document. You then compute the cosine similarity between the question embedding and every document embedding. The document with the highest similarity is the one most likely to contain the answer.

We will try this on parts of the manual.

You can find the documentation on the asymmetric embedding here: https://docs.aleph-alpha.com/docs/tasks/semantic_embed/#code-example

In [None]:
# Read the whole markdown file into memory.
with open("data.md", "r", encoding="utf-8") as f:
    data = f.read()

# Split on "#" to get one chunk per heading marker. Everything before the
# first "#" is dropped, and so are empty or whitespace-only chunks (which
# multi-hash headings such as "##" produce), so that no blank text is later
# sent for embedding or stored as a search candidate.
texts = [chunk for chunk in data.split("#")[1:] if chunk.strip()]

print(f"data: {data[:100]}")


In [None]:
# The question to be answered from the sections loaded from data.md.
# NOTE(review): "macr" looks like a typo for "macro" — confirm before changing the query string.
question = "What are macr trends?"

In [None]:
# Document side: one embedding per context, using the Document representation
# and compressed to size 128.
embedded_contexts = [
    client.semantic_embed(
        SemanticEmbeddingRequest(
            prompt=Prompt.from_text(context),
            representation=SemanticRepresentation.Document,
            compress_to_size=128,
        ),
        model="luminous-base",
    ).embedding
    for context in texts
]

# Query side: the question uses the Query representation with the same
# compression size, so it can be compared against the document embeddings.
question_request = SemanticEmbeddingRequest(
    prompt=Prompt.from_text(question),
    representation=SemanticRepresentation.Query,
    compress_to_size=128,
)
question_response = client.semantic_embed(question_request, model="luminous-base")
embedded_question = question_response.embedding



In [None]:
# LangChain alternative to the raw client calls above: the asymmetric wrapper
# exposes one method for documents and one for queries.
embeddings = AlephAlphaAsymmetricSemanticEmbedding()

# The question goes through the query method, the contexts through the
# document method; the two calls are independent of each other.
embedded_question = embeddings.embed_query(question)
embedded_contexts = embeddings.embed_documents(texts)

In [None]:
# Score every context against the question: cosine similarity = 1 - cosine distance.
similarities = [
    1 - spatial.distance.cosine(context_vector, embedded_question)
    for context_vector in embedded_contexts
]

print("Similarities: ", similarities)

# The context with the highest similarity is selected as the best match.
best_index = np.argmax(similarities)
print("\n\nSelected Context: \n" + texts[best_index])

In [None]:
# Set up a Qdrant collection that will hold the embedded texts.
from qdrant_client import QdrantClient
from qdrant_client.http.models import Batch, Distance, VectorParams

# NOTE(review): with `path=...` qdrant-client presumably runs in its embedded
# local mode and stores data under "db" rather than connecting to a separate
# server — confirm against the qdrant-client documentation.
q_client = QdrantClient(path="db")

# Vector size 128 must match the compress_to_size used when embedding;
# cosine distance mirrors the similarity measure used earlier in the notebook.
collection_vectors = VectorParams(size=128, distance=Distance.COSINE)
q_client.recreate_collection(
    collection_name="test_collection",
    vectors_config=collection_vectors,
)



In [None]:
# Embed every text as a Document vector (compressed to size 128, the vector
# size configured for "test_collection").
embeddings = [
    client.semantic_embed(
        SemanticEmbeddingRequest(
            prompt=Prompt.from_text(text),
            representation=SemanticRepresentation.Document,
            compress_to_size=128,
        ),
        model="luminous-base",
    ).embedding
    for text in texts
]

# Point i carries the i-th text as its payload and the i-th embedding as its vector.
ids = list(range(len(texts)))
payloads = [{"text": text} for text in texts]
points = Batch(ids=ids, payloads=payloads, vectors=embeddings)

# Write all points to the collection in a single batch.
q_client.upsert(collection_name="test_collection", points=points)

In [None]:

def search_and_answer(question, min_score=0.3):
    """Answer a question from the best-matching text stored in Qdrant.

    The question is embedded as a Query vector, the closest entry of
    "test_collection" is retrieved, and the completion model is asked to
    answer the question using that entry's text as input.

    Args:
        question: The question to answer.
        min_score: Minimum search score the best hit must reach to be used
            as context. Defaults to 0.3.

    Returns:
        The completion text, or "no answer found" when the search returns no
        hit or the best hit scores below `min_score`.
    """
    # Embed the question with the same model and size as the stored documents.
    question_request = SemanticEmbeddingRequest(
        prompt=Prompt.from_text(question),
        representation=SemanticRepresentation.Query,
        compress_to_size=128,
    )
    embedded_question = client.semantic_embed(question_request, model="luminous-base").embedding

    # Retrieve only the single best match. `limit` is the keyword
    # QdrantClient.search uses for the number of hits (`top`/`filter` are not
    # parameters of that method); no filter is applied.
    search_result = q_client.search(
        collection_name="test_collection",
        query_vector=embedded_question,
        limit=1,
    )

    print(search_result)

    # Bail out when nothing was found (e.g. empty collection) or the best hit
    # is not similar enough to be trusted as context.
    if not search_result or search_result[0].score < min_score:
        return "no answer found"

    # The original text was stored in the point's payload at upsert time.
    text = search_result[0].payload["text"]

    # NOTE(review): the indentation inside this triple-quoted string is part
    # of the prompt sent to the model — confirm that it is intended.
    prompt = f"""### Instructions: Answer the question briefly based on the provided Input.
    
    ### Input: {text}
    
    ### Question: {question}
    
    ### Response:"""

    # Stop at the next "###" so the model does not start a new prompt section.
    request = CompletionRequest(prompt=Prompt.from_text(prompt), maximum_tokens=64, stop_sequences=["###"])

    response = client.complete(request, model="luminous-extended-control")

    return response.completions[0].completion

In [None]:
# Try the retrieval + completion pipeline on a sample question.
search_and_answer("When was the cooperation agreement signed?")