In [None]:
#@title Setup
!pip install -q cohere umap-learn altair annoy datasets tqdm

[K     |████████████████████████████████| 88 kB 2.2 MB/s 
[K     |████████████████████████████████| 647 kB 9.4 MB/s 
[K     |████████████████████████████████| 451 kB 52.5 MB/s 
[K     |████████████████████████████████| 1.1 MB 26.2 MB/s 
[K     |████████████████████████████████| 212 kB 19.4 MB/s 
[K     |████████████████████████████████| 132 kB 21.4 MB/s 
[K     |████████████████████████████████| 182 kB 52.2 MB/s 
[K     |████████████████████████████████| 127 kB 51.6 MB/s 
[?25h  Building wheel for cohere (setup.py) ... [?25l[?25hdone
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Building wheel for annoy (setup.py) ... [?25l[?25hdone


In [None]:
#@title Imports

import os
import json
from time import time

import umap
import torch
import cohere
import warnings
import numpy as np
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import torch.nn.functional as F

from typing import List, Union, Dict, Any


# Notebook-wide display settings.
# Show full cell contents instead of truncating long text columns.
pd.options.display.max_colwidth = None
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
#@title Insert your Cohere API Key

#@markdown If you don't have an API key yet, please generate one here: https://os.cohere.ai/

COHERE_API_KEY = None  #@param {type:"raw"}
# Avoid pasting a secret into the notebook: when the form field is left as
# None, fall back to the COHERE_API_KEY environment variable (if it is set).
if COHERE_API_KEY is None:
    COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
co = cohere.Client(COHERE_API_KEY)

In [None]:
#@title Get Embeddings helper function

model_name = 'multilingual-22-12' #@param ["multilingual-22-12", "small", "large"]
def get_embeddings(co: cohere.Client, model_name: str, texts: List[str], truncate: str = "RIGHT"):
    """Embed `texts` with a Cohere model and return the resulting vectors.

    Args:
        co: Client used to issue the `embed` request.
        model_name: Name of the embedding model, forwarded as `model`.
        texts: Strings to embed.
        truncate: Truncation setting forwarded unchanged to `co.embed`.

    Returns:
        The `embeddings` attribute of the response returned by `co.embed`.
    """
    response = co.embed(texts=texts, model=model_name, truncate=truncate)
    return response.embeddings

In [None]:
#@title Get UMAP helper function
# UMAP is a general-purpose manifold learning and dimension reduction algorithm.
# The slider starts at 2 because UMAP rejects n_neighbors values below 2.
n_neighbors = 15 #@param {type:"slider", min:2, max:100, step:1}
def get_umap(embeddings: Union[List, np.ndarray],
             n_neighbors: int = 15,
             random_state: Union[int, None] = None):
    """Project embeddings to a low-dimensional layout with UMAP.

    Args:
        embeddings: Embedding vectors, one per row, as a list or NumPy array.
        n_neighbors: Neighborhood size forwarded to `umap.UMAP`.
        random_state: Optional seed forwarded to `umap.UMAP`. The default
            (None) keeps the original unseeded behavior; pass an integer to
            get the same layout on every run.

    Returns:
        The array produced by `fit_transform`; callers in this notebook read
        columns 0 and 1 as the x/y coordinates.
    """
    reducer = umap.UMAP(n_neighbors=n_neighbors, random_state=random_state)
    return reducer.fit_transform(embeddings)

In [None]:
#@title Generating embeddings space chart function

def generate_chart(df: pd.DataFrame,
                   *,
                   xcol: str,
                   ycol: str,
                   lbl: str = 'on',
                   color: str = 'basic',
                   title: str = '',
                   tooltip: List[str] = ['']) -> alt.Chart:
    """Build an interactive scatter plot of points labelled with their text.

    Args:
        df: Data to plot; the text labels are read from its `texts` column.
        xcol: Column used for the x position.
        ycol: Column used for the y position.
        lbl: Accepted for compatibility; not used by this function.
        color: `'basic'` paints every point `#333293`; any other value is
            passed to Altair as the color encoding (e.g. a column name).
        title: Chart title.
        tooltip: Fields shown in the hover tooltip.

    Returns:
        The circle layer plus the text layer, configured and made interactive.
    """
    # Enable the default data transformer with max_rows=None.
    alt.data_transformers.enable('default', max_rows=None)

    # Resolve the color encoding: a fixed value in 'basic' mode, otherwise
    # whatever the caller supplied.
    if color == 'basic':
        point_color = alt.value('#333293')
    else:
        point_color = color

    # Both axes keep their scale but hide labels, ticks and the domain line.
    x_encoding = alt.X(xcol,
                       scale=alt.Scale(zero=False),
                       axis=alt.Axis(labels=False, ticks=False, domain=False))
    y_encoding = alt.Y(ycol,
                       scale=alt.Scale(zero=False),
                       axis=alt.Axis(labels=False, ticks=False, domain=False))

    points = alt.Chart(df).mark_circle(size=250).encode(
        x=x_encoding,
        y=y_encoding,
        color=point_color,
        tooltip=tooltip,
        text='texts',
    )

    # Text layer derived from the point layer, nudged right of each circle.
    labels = points.mark_text(align='left', baseline='middle', dx=7).encode(text='texts')

    layered = points + labels
    layered = layered.configure(background="#FDF7F0")
    layered = layered.properties(width=1200, height=800, title=title)
    layered = layered.configure_legend(orient='bottom', titleFontSize=18, labelFontSize=18)
    return layered.interactive()

In [None]:
#@title Your dataset

# The same greeting in ten languages.
greetings = [
    'Hello from Cohere!',
    'مرحبًا من كوهير!',
    'Hallo von Cohere!',
    'Bonjour de Cohere!',
    '¡Hola desde Cohere!',
    'Olá do Cohere!',
    'Ciao da Cohere!',
    '您好，来自 Cohere！',
    '안녕하세요 코히어입니다!',
    'कोहेरे से नमस्ते!',
]

# The same product statement in ten languages.
statements = [
    'Cohere provides the best multilingual models in the world',
    'كوهير توفر أفضل النماذج متعددة اللغات في العالم',
    'Cohere bietet die besten mehrsprachigen Modelle der Welt',
    'Cohere fournit les meilleurs modèles multilingues au monde',
    'Cohere fornisce i migliori modelli multilingue del mondo',
    'Cohere ofrece los mejores modelos multilingües del mundo',
    'Cohere fornece os melhores modelos multilíngues do mundo',
    'Cohere는 세계 최고의 다국어 모델을 제공합니다.',
    'Cohere 提供世界上最好的多语言模型',
    'Cohere दुनिया में सर्वश्रेष्ठ बहुभाषी मॉडल प्रदान करता है',
]

# One row per sentence; every row starts out tagged as part of the base set.
df = pd.DataFrame({"texts": greetings + statements})
df['Source'] = 'Existing'
df

Unnamed: 0,texts,Source
0,Hello from Cohere!,Existing
1,مرحبًا من كوهير!,Existing
2,Hallo von Cohere!,Existing
3,Bonjour de Cohere!,Existing
4,¡Hola desde Cohere!,Existing
5,Olá do Cohere!,Existing
6,Ciao da Cohere!,Existing
7,您好，来自 Cohere！,Existing
8,안녕하세요 코히어입니다!,Existing
9,कोहेरे से नमस्ते!,Existing


In [None]:
# Embed every sentence of the dataset in a single request.
embeddings = get_embeddings(
    co=co,
    model_name=model_name,
    texts=df['texts'].tolist(),
)

In [None]:
# Project the embeddings to 2-D. Despite the variable name, these coordinates
# come from UMAP, not PCA. The `n_neighbors` slider value is passed explicitly
# so that changing the slider actually affects the projection.
embeddings_pca = get_umap(embeddings=embeddings, n_neighbors=n_neighbors)
df['x'] = embeddings_pca[:, 0]
df['y'] = embeddings_pca[:, 1]

In [None]:
# Scatter plot of the projected dataset, colored by the `Source` column.
alt_chart = generate_chart(
    df=df, xcol='x', ycol='y',
    color='Source',
    tooltip=['texts', 'x', 'y'],
    title=f"Cohere's `{model_name}` model Embeddings",
)
alt_chart

In [None]:
#@title New Text
# Japanese
df2 = df.copy()
new_text = 'コヒーレからこんにちは' #@param {type:"string"}
# Build the new row from the frame's actual columns instead of a hard-coded
# 4-element list, so this cell works whether or not the projection cell has
# already added the x/y columns to `df`.
df2.loc[len(df2.index)] = [new_text if column == 'texts' else None for column in df2.columns]

In [None]:
#@title coloring new text
# Tag every row as pre-existing, then re-tag the row labelled len(df2) - 1
# (the sentence appended in the previous cell) as the new one.
df2['Source'] = 'Existing'
df2.loc[len(df2) - 1, 'Source'] = "New"

In [None]:
# Embed only the newly added sentence; the request holds a single text, so
# take the first (and only) vector from the result.
new_text_embeddings = get_embeddings(
    co=co,
    model_name=model_name,
    texts=[df2.loc[len(df2.index) - 1, 'texts']],
)[0]

In [None]:
# New list holding the original vectors followed by the new sentence's vector;
# `embeddings` itself is left untouched.
embeddings2 = [*embeddings, new_text_embeddings]

In [None]:
# Re-run the 2-D projection with the new sentence included. Despite the
# variable name, these coordinates come from UMAP, not PCA. The `n_neighbors`
# slider value is passed explicitly so that the slider takes effect.
embeddings_pca = get_umap(embeddings=embeddings2, n_neighbors=n_neighbors)
df2['x'] = embeddings_pca[:, 0]
df2['y'] = embeddings_pca[:, 1]

In [None]:
# Scatter plot including the new sentence, colored by the `Source` column.
alt_chart = generate_chart(
    df=df2, xcol='x', ycol='y',
    color='Source',
    tooltip=['texts', 'x', 'y'],
    title=f"Cohere's `{model_name}` model Embeddings",
)
alt_chart

---
---

# Semantic Search

In [None]:
#@title Semantic similarity helper function

def torchfy(x):
    """Convert `x` to a float32 torch tensor."""
    return torch.as_tensor(x, dtype=torch.float32)


def get_similarity(target: Union[List[float], List[List[float]]],
                   candidates: List[List[float]],
                   top_k: int) -> List[Dict[str, Any]]:
    """Rank candidate vectors by dot-product score against a target vector.

    Args:
        target: The query embedding, either a single vector or a one-row
            batch (shape `(1, dim)`). If several rows are given, only the
            hits for the first row are returned.
        candidates: Candidate embeddings, one vector per row.
        top_k: Maximum number of hits to return. Values larger than the
            number of candidates are clamped, because `torch.topk` raises
            when `k` exceeds the size of the dimension it searches.

    Returns:
        A list of `{'id': <row index into candidates>, 'score': <dot product>}`
        dicts ordered from highest to lowest score. Empty when there are no
        candidates.
    """
    candidate_matrix = torchfy(candidates)
    if candidate_matrix.numel() == 0:
        # Nothing to compare against; transposing an empty 1-D tensor would fail.
        return []
    candidate_matrix = candidate_matrix.transpose(0, 1)  # shape (dim, n_candidates)

    target = torchfy(target)
    if target.dim() == 1:
        # Accept a bare vector by promoting it to a one-row batch for torch.mm.
        target = target.unsqueeze(0)
    dot_scores = torch.mm(target, candidate_matrix)  # shape (n_targets, n_candidates)

    k = min(top_k, candidate_matrix.shape[1])
    scores, indices = torch.topk(dot_scores, k=k)
    similarity_hits = [{'id': idx, 'score': score} for idx, score in zip(indices[0].tolist(), scores[0].tolist())]

    return similarity_hits

In [None]:
#@title Let's search and visualize
df3 = df.copy()
query_text = "\u0627\u0647\u0644\u0627 \u0648\u0633\u0647\u0644\u0627" #@param {type:"string"}
# Build the query row from the frame's actual columns instead of a hard-coded
# 4-element list, so this cell works whether or not the projection cell has
# already added the x/y columns to `df`.
df3.loc[len(df3.index)] = [query_text if column == 'texts' else None for column in df3.columns]
# Kept as a one-element batch; later cells use both the batch and its first row.
query_embeddings = get_embeddings(co=co, model_name=model_name, texts=[query_text])

In [None]:
top_k: int = 5 #@param {type:"slider", min:1, max:100, step:5}
embeddings3 = embeddings.copy()
# The slider can exceed the number of candidate sentences; clamp it so the
# top-k selection is never asked for more hits than there are candidates.
similarity_hits = get_similarity(
    target=query_embeddings,
    candidates=embeddings3,
    top_k=min(top_k, len(embeddings3)),
)

In [None]:
# Rebuild the list from `embeddings` rather than appending in place, so
# re-running this cell does not keep adding the query vector (which would make
# the projection longer than `df3`).
embeddings3 = [*embeddings, query_embeddings[0]]
# Despite the variable name, these coordinates come from UMAP, not PCA. The
# `n_neighbors` slider value is passed explicitly so that the slider takes effect.
embeddings_pca = get_umap(embeddings=embeddings3, n_neighbors=n_neighbors)
df3['x'] = embeddings_pca[:, 0]
df3['y'] = embeddings_pca[:, 1]

In [None]:
# Start with every row tagged as pre-existing, tag the row labelled
# len(df3) - 1 (the query appended earlier) as new, then tag each search hit.
df3['Source'] = 'Existing'
df3.loc[len(df3) - 1, 'Source'] = "New"
for hit in similarity_hits:
    df3.loc[hit['id'], 'Source'] = "Similar"

In [None]:
# Scatter plot of the search result: query, similar hits and remaining rows
# are distinguished by the `Source` column.
alt_chart = generate_chart(
    df=df3, xcol='x', ycol='y',
    color='Source',
    tooltip=['texts', 'x', 'y'],
    title=f"Cohere's `{model_name}` model Embeddings",
)
alt_chart