# Uso de modelos de embeddings de Hugging Face: Sentence Transformers

## Instalación y carga de librerías

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


## Cargar dataset

In [2]:
df_avatar = pd.read_csv('data/atla-episodes-scripts.csv')
df_avatar.head(10)

Unnamed: 0,Character,script,ep_number,Book,total_number
0,,"As the title card fades, the scene opens onto ...",1,1,1
1,Sokka,It's not getting away from me this time. [Clos...,1,1,1
2,,"The shot pans quickly from Sokka to Katara, wh...",1,1,1
3,Katara,"[Happily surprised.] Sokka, look!",1,1,1
4,Sokka,"[Close-up of Sokka; whispering.] Sshh! Katara,...",1,1,1
5,,"Behind Sokka, Katara is still making circular ...",1,1,1
6,Katara,[Struggling with the water that passes right i...,1,1,1
7,,The bubble containing her fish slowly drifts a...,1,1,1
8,Katara,[Exclaims indignantly.] Hey!,1,1,1
9,,"As Sokka lets out a gasp of discomfort, the fi...",1,1,1


# Sentences Transformers

In [3]:
sentences = ['este es el primer ejemplo', 'y este es el segundo ejemplo']
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
embeddings.shape

(2, 384)

In [9]:
model.max_seq_length

256

## Aplicar

In [10]:
embeddings = model.encode(df_avatar['script'], batch_size=64, show_progress_bar=True)

Batches: 100%|██████████| 209/209 [10:04<00:00,  2.89s/it]


## Guardando a dataset

In [11]:
df_avatar['embeddings'] = embeddings.tolist()
df_avatar.head()

Unnamed: 0,Character,script,ep_number,Book,total_number,embeddings
0,,"As the title card fades, the scene opens onto ...",1,1,1,"[-0.07994643598794937, 0.08622030913829803, 0...."
1,Sokka,It's not getting away from me this time. [Clos...,1,1,1,"[0.01779160276055336, 0.05984478443861008, -0...."
2,,"The shot pans quickly from Sokka to Katara, wh...",1,1,1,"[-0.022184530273079872, 0.056840650737285614, ..."
3,Katara,"[Happily surprised.] Sokka, look!",1,1,1,"[-0.01815219037234783, 0.11454901844263077, 0...."
4,Sokka,"[Close-up of Sokka; whispering.] Sshh! Katara,...",1,1,1,"[-0.013881645165383816, 0.08426922559738159, -..."


In [12]:
df_avatar.to_csv('data/atla-episodes-scripts-embeddings.csv', index=False)

## Searching

In [18]:
query = 'I am the avatar'
query_embedding = model.encode([query])

In [19]:
df_avatar['cosine_similarity'] = df_avatar['embeddings'].apply(lambda x: util.cos_sim(x, query_embedding[0]))
df_avatar.head()

Unnamed: 0,Character,script,ep_number,Book,total_number,embeddings,cosine_similarity
0,,"As the title card fades, the scene opens onto ...",1,1,1,"[-0.07994643598794937, 0.08622030913829803, 0....",[[tensor(0.1986)]]
1,Sokka,It's not getting away from me this time. [Clos...,1,1,1,"[0.01779160276055336, 0.05984478443861008, -0....",[[tensor(0.1636)]]
2,,"The shot pans quickly from Sokka to Katara, wh...",1,1,1,"[-0.022184530273079872, 0.056840650737285614, ...",[[tensor(0.1229)]]
3,Katara,"[Happily surprised.] Sokka, look!",1,1,1,"[-0.01815219037234783, 0.11454901844263077, 0....",[[tensor(0.1815)]]
4,Sokka,"[Close-up of Sokka; whispering.] Sshh! Katara,...",1,1,1,"[-0.013881645165383816, 0.08426922559738159, -...",[[tensor(0.1350)]]


In [20]:
df_avatar.sort_values(by='cosine_similarity', ascending=False).head(10)

Unnamed: 0,Character,script,ep_number,Book,total_number,embeddings,cosine_similarity
5489,Huu,You're the Avatar. You tell me.,4,2,24,"[0.001832458539865911, 0.05343332514166832, 0....",[[tensor(0.7336)]]
2793,Zuko,The Avatar!,12,1,12,"[-0.058731887489557266, 0.09145893901586533, -...",[[tensor(0.7222)]]
1586,Zuko,The Avatar!,7,1,7,"[-0.058731887489557266, 0.09145893901586533, -...",[[tensor(0.7222)]]
1713,Aang,Great! I am the Avatar!,8,1,8,"[-0.0034094632137566805, 0.006402341648936272,...",[[tensor(0.7042)]]
5476,Huu,The Avatar? Come with me.,4,2,24,"[-0.043565623462200165, 0.06249752268195152, 0...",[[tensor(0.7001)]]
7398,Aang,I'm the Avatar. Take me to whoever is in charge.,13,2,33,"[-0.04834688454866409, 0.06524800509214401, 0....",[[tensor(0.6720)]]
11667,Aang,But once they find out I'm the Avatar ...,13,3,53,"[-0.01904081553220749, 0.047527577728033066, 0...",[[tensor(0.6640)]]
8474,Kuei,The Avatar? [Points at Sokka.] You're the Avatar?,18,2,38,"[-0.005035285372287035, 0.08479861170053482, -...",[[tensor(0.6522)]]
3958,Qin,The Avatar ...,17,1,17,"[-0.030843570828437805, 0.07783844321966171, 0...",[[tensor(0.6494)]]
12622,Actress Azula,The Avatar is no more!,17,3,57,"[-0.01802387274801731, 0.03962308540940285, 0....",[[tensor(0.6387)]]


In [22]:
import cohere
co = cohere.Client('API KEY') # This is your trial API key

response = co.embed(
  model='embed-multilingual-v3.0',
  texts=['I am the avatar'],
  input_type='classification',
  truncate='NONE'
)

response



In [25]:
response.embeddings[0]

[0.013008118,
 0.03857422,
 -0.031677246,
 0.0309906,
 0.004638672,
 0.012039185,
 0.010894775,
 -0.043273926,
 -0.033203125,
 0.040985107,
 0.017196655,
 -0.04849243,
 0.058563232,
 0.016448975,
 0.051483154,
 -0.0064468384,
 -0.017608643,
 0.004333496,
 -0.006866455,
 0.02885437,
 0.011779785,
 0.016555786,
 0.0029621124,
 -0.019226074,
 -0.019943237,
 0.054656982,
 0.039093018,
 -0.06085205,
 -0.0048065186,
 -0.04650879,
 -0.03842163,
 0.003929138,
 0.04058838,
 0.03253174,
 0.0044555664,
 0.024047852,
 0.023162842,
 -0.016296387,
 0.004600525,
 0.04019165,
 0.060638428,
 -0.013893127,
 -0.025039673,
 0.023651123,
 0.018417358,
 0.03451538,
 -0.011947632,
 -0.016281128,
 0.013343811,
 0.042755127,
 0.017944336,
 0.009803772,
 -0.007835388,
 0.015930176,
 0.0035381317,
 -0.0013494492,
 -0.010528564,
 -0.038269043,
 0.05505371,
 0.011680603,
 0.005039215,
 -0.03086853,
 -0.05444336,
 -0.0014925003,
 0.035003662,
 0.004096985,
 0.038330078,
 -0.024963379,
 0.030563354,
 0.013153076,
 -