In [1]:
from aleph_alpha_client import Client, SemanticEmbeddingRequest, SemanticEmbeddingResponse, SemanticRepresentation, Prompt, TextControl
from scipy import spatial
import numpy as np
import os
from dotenv import load_dotenv

from langchain.llms import AlephAlpha
from langchain.embeddings import AlephAlphaSymmetricSemanticEmbedding, AlephAlphaAsymmetricSemanticEmbedding

In [2]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast with an actionable message: os.getenv returns None when AA_TOKEN is
# missing, and that None would otherwise be handed to Client unchecked.
aa_token = os.getenv("AA_TOKEN")
if not aa_token:
    raise RuntimeError(
        "AA_TOKEN is not set. Add it to your .env file or export it before running this notebook."
    )

client = Client(token=aa_token)

#### Step 1: Use the API to create an embedding of a text
Use the "Semantic Search" API to create an embedding of a text. The API returns a JSON object with the embedding of the text.
You can read more about the semantic search API here: https://docs.aleph-alpha.com/docs/tasks/semantic_embed

#### Step 2: Use the API to find out how similar these three texts are
Use SciPy's cosine distance function, `spatial.distance.cosine`, to find out how similar these three texts are. Note that this function returns a distance, not a similarity, so you need to subtract the distance from 1 to get the cosine similarity.
The resulting similarity lies between -1 and 1: values close to 1 mean the texts are semantically very similar (1 means the embeddings point in exactly the same direction), while values close to 0 mean the texts are unrelated.

In [4]:
# Example sentences to compare: the first two are paraphrases of each other,
# the third shares some vocabulary but is about something else.
texts = [
    "The sun is shining",
    "It's pretty sunny today",
    "Her smile shines brightly down upon the south african people",
]

In [5]:
def _cosine_similarity(vec_a, vec_b):
    """Return 1 minus SciPy's cosine distance between the two vectors."""
    return 1 - spatial.distance.cosine(vec_a, vec_b)


# Embed every sentence with SemanticRepresentation.Symmetric (compress_to_size=128)
# using the luminous-base model, keeping only the embedding of each response.
text_embeddings = [
    client.semantic_embed(
        SemanticEmbeddingRequest(
            prompt=Prompt.from_text(sentence),
            representation=SemanticRepresentation.Symmetric,
            compress_to_size=128,
        ),
        model="luminous-base",
    ).embedding
    for sentence in texts
]

# Pairwise cosine similarities between the three sentence embeddings.
similarity_1_2 = _cosine_similarity(text_embeddings[0], text_embeddings[1])
similarity_1_3 = _cosine_similarity(text_embeddings[0], text_embeddings[2])
similarity_2_3 = _cosine_similarity(text_embeddings[1], text_embeddings[2])

# Each sentence is abbreviated to its first 10 characters in the report.
print(f"Similarity between {texts[0][:10]} and {texts[1][:10]}", similarity_1_2)
print(f"Similarity between text {texts[0][:10]} and {texts[2][:10]}: ", similarity_1_3)
print(f"Similarity between text {texts[1][:10]} and {texts[2][:10]}: ", similarity_2_3)

Similarity between The sun is and It's prett 0.822170248199528
Similarity between text The sun is and Her smile :  0.3074451358909682
Similarity between text It's prett and Her smile :  0.1735522296901626


In [6]:
# Same comparison, this time through LangChain's symmetric embedding wrapper.
# NOTE(review): the wrapper is constructed without arguments, so it does not reuse
# `client`; presumably it picks up its API key from the environment — confirm.
embeddings = AlephAlphaSymmetricSemanticEmbedding()
text_embeddings = embeddings.embed_documents(texts)

# Visit every unordered pair (i < j) exactly once, in ascending order.
index_pairs = [(i, j) for i in range(len(texts)) for j in range(i + 1, len(texts))]
for i, j in index_pairs:
    # SciPy returns a cosine distance; 1 - distance is the similarity.
    similarity = 1 - spatial.distance.cosine(text_embeddings[i], text_embeddings[j])
    print(f"Similarity between {texts[i][:10]} and {texts[j][:10]}", similarity)

Similarity between The sun is and It's prett 0.8222216781668
Similarity between The sun is and Her smile  0.30750528534059873
Similarity between It's prett and Her smile  0.17384478378490975


#### Step 3: Use the API on an asymmetric embedding case to find the answer to the question
Asymmetric embeddings are useful when you want to find the answer to a question. For example, to answer the question "What is the capital of France?", you use the API to create an embedding of the question (Query representation) and an embedding of each candidate answer text (Document representation). You then compute the cosine similarity between the question embedding and each candidate embedding. The answer is the candidate with the highest similarity.

We will try this on parts of the manual.

You can find the documentation on the asymmetric embedding here: https://docs.aleph-alpha.com/docs/tasks/semantic_embed/#code-example

In [7]:
# Read the data in the data.md file. The encoding is given explicitly because open()
# otherwise falls back to a platform-dependent default, which can garble non-ASCII
# characters. NOTE(review): assumes data.md is UTF-8 encoded — confirm.
with open("data.md", "r", encoding="utf-8") as f:
    data = f.read()

# Split the data into a list of texts at every "#####" marker. Chunks that contain
# only whitespace (e.g. when the file starts with the marker) are dropped so that no
# empty prompt is embedded later; the kept chunks are left exactly as they were.
texts = [chunk for chunk in data.split("#####") if chunk.strip()]

In [9]:
# The question to answer; it is embedded as the query and compared against the text chunks
question = "What countries have social elements in their guidelines?"

In [10]:
def _embed_asymmetric(text, representation):
    """Embed `text` with luminous-base using `representation` and return the embedding.

    The request is built with compress_to_size=128, for contexts and question alike.
    """
    request = SemanticEmbeddingRequest(
        prompt=Prompt.from_text(text),
        representation=representation,
        compress_to_size=128,
    )
    return client.semantic_embed(request, model="luminous-base").embedding


# Every context chunk is embedded with the Document representation ...
embedded_contexts = [
    _embed_asymmetric(chunk, SemanticRepresentation.Document) for chunk in texts
]

# ... while the question is embedded with the Query representation.
embedded_question = _embed_asymmetric(question, SemanticRepresentation.Query)



In [11]:
# Create the same kind of embeddings (contexts and question) with LangChain instead of the raw client.
# NOTE: this cell rebinds `embedded_contexts` and `embedded_question`, replacing the
# values computed with the raw client; later cells see whichever cell ran last.

# Load the embedding model
embeddings = AlephAlphaAsymmetricSemanticEmbedding()

# Embed the contexts via embed_documents and the question via embed_query
embedded_contexts = embeddings.embed_documents(texts)
embedded_question = embeddings.embed_query(question)

In [12]:
# Score every context against the question: SciPy returns a cosine distance,
# so 1 - distance gives the cosine similarity.
similarities = [
    1 - spatial.distance.cosine(context_vector, embedded_question)
    for context_vector in embedded_contexts
]

# The context with the highest similarity is taken as the answer passage.
best_index = np.argmax(similarities)

print("Similarities: ", similarities)
print("\n\nSelected Context: \n" + texts[best_index])

Similarities:  [0.48128268240327854, 0.4867165345669583, 0.41384206688658354, 0.46313856409885634, 0.5188248431308402, 0.425083909884707, 0.4523307108843164, 0.37041205042762737, 0.4814904306643404, 0.4575652064311403, 0.4894331790145512, 0.41141384387662094]


Selected Context: 
 POLICY INTEGRATION

POLICY INTEGRATION – **15**

Sustainable development has been defined a variety of ways, but in practice it has come to mean development that achieves a balance among economic, environmental and social objectives for both present and future generations. The integration of the three dimensions of sustainable development is one of the most difficult balances to achieve in formulating a national strategy. In practice, most national strategies have a greater focus on environmental issues with some attempts to incorporate economic aspects. The social pillar has been the most neglected. As a result, few national strategies develop abilities for considering and making trade-offs among the three a