In [None]:
from aleph_alpha_client import Client, SemanticEmbeddingRequest, SemanticEmbeddingResponse, SemanticRepresentation, Prompt
from scipy import spatial
import numpy as np
import os
from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with a clear message: os.getenv returns None for a missing variable,
# which would otherwise only surface later as an obscure authentication error.
aa_token = os.getenv("AA_TOKEN")
if not aa_token:
    raise RuntimeError(
        "AA_TOKEN is not set. Add it to your .env file or environment before running this notebook."
    )

# Shared API client used by every cell below.
client = Client(token=aa_token)

#### Step 1: Use the API to create an embedding of a text
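Use the "Semantic Search" API (the `semantic_embed` endpoint) to create an embedding of a text. The API returns a JSON object with the embedding of the text; in the Python client it is available as the `embedding` attribute of the response.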
You can read more about the semantic search API here: https://docs.aleph-alpha.com/docs/tasks/semantic_embed

In [None]:
# Three sample sentences: an English sentence, its German counterpart,
# and a sentence about a different topic.
text_to_embed_1, text_to_embed_2, text_to_embed_3 = (
    "The sun is shining",
    "Die Sonne scheint",
    "Aleph Alpha develops and delivers technology solutions that help customers around the world use AI technology.",
)

# TODO: create an embedding request (Symmetric), get the response, and extract the embedding
# Replace each placeholder below with the embedding of the matching text.
embedded_text_1 = None
embedded_text_2 = None
embedded_text_3 = None

##### Solution

    ```python
    # Each text is embedded separately; the prompt is wrapped with Prompt.from_text and the model is named explicitly.
    embedded_text_1 = client.semantic_embed(SemanticEmbeddingRequest(prompt=Prompt.from_text(text_to_embed_1), representation=SemanticRepresentation.Symmetric, compress_to_size=128), model="luminous-base").embedding
    embedded_text_2 = client.semantic_embed(SemanticEmbeddingRequest(prompt=Prompt.from_text(text_to_embed_2), representation=SemanticRepresentation.Symmetric, compress_to_size=128), model="luminous-base").embedding
    embedded_text_3 = client.semantic_embed(SemanticEmbeddingRequest(prompt=Prompt.from_text(text_to_embed_3), representation=SemanticRepresentation.Symmetric, compress_to_size=128), model="luminous-base").embedding
    ```

#### Step 2: Use the API to find out how similar these three texts are
Use SciPy's cosine distance function `spatial.distance.cosine` to find out how similar these three texts are. The function returns a *distance*, not a similarity: it is 0 when the two embeddings point in the same direction and grows (up to 2) the more they differ.
To get the cosine similarity, subtract the distance from 1. The similarity lies between -1 and 1: a value close to 1 means the texts have almost the same meaning, while a value close to 0 means they are unrelated.

In [None]:
# TODO: Calculate the cosine similarity between the embeddings
# Replace each placeholder with the similarity of the corresponding pair.
similarity_1_2 = None
similarity_1_3 = None
similarity_2_3 = None

# Report every pairwise score, one line per pair.
pairwise_report = [
    ("Similarity between text 1 and 2: ", similarity_1_2),
    ("Similarity between text 1 and 3: ", similarity_1_3),
    ("Similarity between text 2 and 3: ", similarity_2_3),
]
for message, score in pairwise_report:
    print(message, score)

##### Solution

    ```python
    # spatial.distance.cosine returns a distance, so 1 - distance is the cosine similarity.
    similarity_1_2 = 1 - spatial.distance.cosine(embedded_text_1, embedded_text_2)
    similarity_1_3 = 1 - spatial.distance.cosine(embedded_text_1, embedded_text_3)
    similarity_2_3 = 1 - spatial.distance.cosine(embedded_text_2, embedded_text_3)
    print("Similarity between text 1 and 2: ", similarity_1_2)
    print("Similarity between text 1 and 3: ", similarity_1_3)
    print("Similarity between text 2 and 3: ", similarity_2_3)
    ```

#### Step 3: Use the API on an asymmetric embedding case to find the answer to the question
Asymmetric embeddings are useful when you want to find the answer to a question. For example, to answer the question "What is the capital of France?", you create an embedding of the question (using the `Query` representation) and an embedding of each candidate text (using the `Document` representation). Then you use the cosine similarity to find out how similar the question is to each candidate. The candidate with the highest similarity is the one most likely to contain the answer.

We will try this on a few short encyclopedia-style texts about countries and cities.

You can find the documentation on the asymmetric embedding here: https://docs.aleph-alpha.com/docs/tasks/semantic_embed/#code-example

In [None]:
# Candidate texts to search, defined inline: four encyclopedia-style passages
# about Germany, Bristol, Heidelberg and France (each prefixed with "<title> : ").
contexts = ["Germany : Germany (, ), officially the Federal Republic of Germany, is a country in Central Europe. It is the second most populous country in Europe after Russia, and the most populous member state of the European Union. Germany is situated between the Baltic and North seas to the north, and the Alps to the south; it covers an area of , with a population of over 83 million within its 16 constituent states. Germany borders Denmark to the north, Poland and the Czech Republic to the east, Austria and Switzerland to the south, and France, Luxembourg, Belgium, and the Netherlands to the west. The nation's capital and largest city is Berlin, and its financial centre is Frankfurt; the largest urban area is the Ruhr.",
            "Bristol : Bristol () is a city, ceremonial county and unitary authority in England. Situated on the River Avon, it is bordered by the ceremonial counties of Gloucestershire, to the north; and Somerset, to the south. Bristol is the most populous city in South West England.\nThe wider Bristol Built-up Area is the eleventh most populous urban area in the United Kingdom.",
            "Heidelberg : Heidelberg () is a university town in the German state of Baden-W\u00fcrttemberg, situated on the river Neckar in south-west Germany. In the 2016 census, its population was 159,914, of which roughly a quarter consisted of students.",
            "France : France (), officially the French Republic (), is a transcontinental country spanning Western Europe and overseas regions and territories in the Americas and the Atlantic, Pacific and Indian Oceans. Its metropolitan area extends from the Rhine to the Atlantic Ocean and from the Mediterranean Sea to the English Channel and the North Sea; overseas territories include French Guiana in South America, Saint Pierre and Miquelon in the North Atlantic, the French West Indies, and many islands in Oceania and the Indian Ocean. Due to its several coastal territories, France has the largest exclusive economic zone in the world. France borders Belgium, Luxembourg, Germany, Switzerland, Monaco, Italy, Andorra and Spain in Europe, as well as the Netherlands, Suriname and Brazil in the Americas. Its eighteen integral regions (five of which are overseas) span a combined area of  and over 67 million people (). France is a unitary semi-presidential republic with its capital in Paris, the country's largest city and main cultural and commercial centre; other major urban areas include Marseille, Lyon, Toulouse, Lille, Bordeaux, and Nice."
            ]
        
# The question whose answer should be found in one of the contexts above.
question = "What city is located at the river Neckar?"

In [None]:
# TODO: create embeddings (Document) for the contexts and the question (Query)
embedded_contexts = []
for context in contexts:
    pass
embedded_question = None

#### Solution

```
# One Document embedding per context, in the same order as `contexts`.
embedded_contexts = [
    client.semantic_embed(
        SemanticEmbeddingRequest(
            prompt=Prompt.from_text(context),
            representation=SemanticRepresentation.Document,
            compress_to_size=128,
        ),
        model="luminous-base",
    ).embedding
    for context in contexts
]

# The question is embedded with the Query representation.
question_request = SemanticEmbeddingRequest(
    prompt=Prompt.from_text(question),
    representation=SemanticRepresentation.Query,
    compress_to_size=128,
)
embedded_question = client.semantic_embed(question_request, model="luminous-base").embedding
```

In [None]:
# TODO: Calculate the cosine similarity between the embeddings
similarities = []
for embedded_context in embedded_contexts:
    pass
    
print("Similarities: ", similarities)

#### Step 4: Let's try to split the third text into sections and find the answer to the question
When a text is long, it helps to split it into sections and compare each section with the question separately. Here we do this for the third text (`contexts[2]`).
The splitting is already done for you: the text is cut at every `###` marker and sections of 30 characters or fewer are dropped. You just need to create an embedding of each section and find the section with the highest similarity to the question.

In [None]:
# Cut the third context into sections at every "###" marker.
splitted_context = contexts[2].split("###")
# remove small splits (30 characters or fewer)
splits = [split for split in splitted_context if len(split) > 30]

# TODO: create an embedding (Document) for every split
embedded_splits = []
for split in splits:
    pass

# TODO: Calculate the cosine similarity between the question embedding and every split embedding
similarities = []
for embedded_split in embedded_splits:
    pass

# print out the text of the most similar context
# np.argmax raises a ValueError on an empty sequence, so the lookup only runs
# once the TODOs above have filled `similarities`; until then None is printed.
if len(similarities) > 0:
    print("Most similar context: ", splits[np.argmax(similarities)])
else:
    print("Most similar context: ", None)

#### Solution


```
# Cut the third context at every "###" marker and keep only sections longer than 30 characters.
splitted_context = contexts[2].split("###")
splits = [section for section in splitted_context if len(section) > 30]

# One Document embedding per remaining section, in the same order as `splits`.
embedded_splits = []
for section in splits:
    section_request = SemanticEmbeddingRequest(
        prompt=Prompt.from_text(section),
        representation=SemanticRepresentation.Document,
        compress_to_size=128,
    )
    embedded_splits.append(client.semantic_embed(section_request, model="luminous-base").embedding)

# Cosine similarity (1 - cosine distance) between the question and every section.
similarities = [
    1 - spatial.distance.cosine(embedded_question, embedded_split)
    for embedded_split in embedded_splits
]

# print out the text of the most similar context
best_index = np.argmax(similarities)
print("Most similar context: ", splits[best_index])
```