In [1]:
!pip install transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp312-cp312-win_amd64.whl.metadata (4.5 kB)
Downloading faiss_cpu-1.9.0.post1-cp312-cp312-win_amd64.whl (13.8 MB)
   ---------------------------------------- 0.0/13.8 MB ? eta -:--:--
   ------- -------------------------------- 2.6/13.8 MB 12.6 MB/s eta 0:00:01
   ------------------ --------------------- 6.3/13.8 MB 16.1 MB/s eta 0:00:01
   --------------------- ------------------ 7.6/13.8 MB 12.4 MB/s eta 0:00:01
   ---------------------------- ----------- 9.7/13.8 MB 11.6 MB/s eta 0:00:01
   ------------------------------- -------- 10.7/13.8 MB 12.4 MB/s eta 0:00:01
   ---------------------------------------  13.6/13.8 MB 11.1 MB/s eta 0:00:01
   ---------------------------------------- 13.8/13.8 MB 10.7 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


# Embeddings BERT

O que são embeddings? Como eles se formam?

In [6]:
from transformers import BertModel, BertTokenizer
import torch
import faiss

In [7]:
# Tudo começa com a tokenização de uma sentença
sentence = "The world is full of kings and queens Who blind your eyes and steal your dreams It is Heaven and Hell"

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer_output = tokenizer.tokenize(sentence)
print(tokenizer_output)

['the', 'world', 'is', 'full', 'of', 'kings', 'and', 'queens', 'who', 'blind', 'your', 'eyes', 'and', 'steal', 'your', 'dreams', 'it', 'is', 'heaven', 'and', 'hell']


In [8]:
tokens_embedding = tokenizer.convert_tokens_to_ids(tokenizer_output)
print(tokens_embedding)

[1996, 2088, 2003, 2440, 1997, 5465, 1998, 8603, 2040, 6397, 2115, 2159, 1998, 8954, 2115, 5544, 2009, 2003, 6014, 1998, 3109]


In [9]:
decoded_content = tokenizer.decode(tokens_embedding)
print(decoded_content)

the world is full of kings and queens who blind your eyes and steal your dreams it is heaven and hell


In [10]:
# Tudo começa com a geração de tokens
from transformers import  AutoTokenizer, AutoModel
import json
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer(sentence)
print(f"""Tokens: {len(tokens["input_ids"])}""")
tokens

Tokens: 23


{'input_ids': [101, 1996, 2088, 2003, 2440, 1997, 5465, 1998, 8603, 2040, 6397, 2115, 2159, 1998, 8954, 2115, 5544, 2009, 2003, 6014, 1998, 3109, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [13]:
# Utilizando o modelo para gerar o embedding
model = BertModel.from_pretrained('bert-base-uncased')
batch_dict = tokenizer([sentence], max_length=512, padding=True, truncation=True, return_tensors='pt')
outputs = model(**batch_dict)
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.2174,  0.0724, -0.1786,  ..., -0.4340,  0.2995, -0.0262],
         [-0.1075, -0.0431, -0.0405,  ..., -0.3530,  1.1015, -0.3895],
         [ 0.2437,  0.5879,  0.3414,  ..., -0.3289,  0.7832, -0.0433],
         ...,
         [-0.1418,  0.2034,  0.4530,  ..., -0.0011,  0.1981,  0.2041],
         [ 0.2908,  1.0010, -0.4443,  ...,  0.0089,  0.9333, -1.0749],
         [ 1.0551,  0.2792, -0.2750,  ..., -0.7832, -0.4835, -0.2894]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.8839, -0.5448, -0.7915,  0.8322,  0.4858, -0.3150,  0.7567,  0.3539,
         -0.6039, -1.0000, -0.3889,  0.9384,  0.9896,  0.2382,  0.9135, -0.2738,
         -0.2841, -0.6850,  0.4592, -0.0880,  0.7643,  0.9998,  0.2091,  0.3312,
          0.5463,  0.9807, -0.6260,  0.9236,  0.9671,  0.7825, -0.5395,  0.3884,
         -0.9943, -0.3197, -0.8965, -0.9950,  0.4751, -0.6427, -0.0040, -0.0376,
         -0.8692,  0.4170,  0.99

In [8]:
outputs[1]

tensor([[-0.8839, -0.5448, -0.7915,  0.8322,  0.4858, -0.3150,  0.7567,  0.3539,
         -0.6039, -1.0000, -0.3889,  0.9384,  0.9896,  0.2382,  0.9135, -0.2738,
         -0.2841, -0.6850,  0.4592, -0.0880,  0.7643,  0.9998,  0.2091,  0.3312,
          0.5463,  0.9807, -0.6260,  0.9236,  0.9671,  0.7825, -0.5395,  0.3884,
         -0.9943, -0.3197, -0.8965, -0.9950,  0.4751, -0.6427, -0.0040, -0.0376,
         -0.8692,  0.4170,  0.9999,  0.1054,  0.5376, -0.2332, -1.0000,  0.4504,
         -0.8423,  0.6404,  0.6801,  0.6699,  0.3162,  0.5598,  0.4826, -0.1857,
          0.0568,  0.3254, -0.3158, -0.6538, -0.6374,  0.5953, -0.7156, -0.8618,
          0.8294,  0.4793, -0.3167, -0.3143, -0.1983, -0.0491,  0.7629,  0.3354,
          0.1407, -0.8734,  0.3438,  0.4445, -0.4116,  1.0000, -0.3030, -0.9832,
          0.5743,  0.4244,  0.3688,  0.2458, -0.1296, -1.0000,  0.5171, -0.1968,
         -0.9914,  0.4429,  0.3992, -0.2937,  0.0439,  0.3861, -0.1621, -0.4907,
         -0.4489, -0.6888, -

### Função para extrair o pooling da sentença

In [9]:
# Pooling do resultado
import torch.nn.functional as F
from torch import Tensor
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

In [10]:
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
embeddings

tensor([[ 3.0860e-01,  4.0486e-01,  2.5230e-01, -1.3755e-01,  1.2081e-01,
          1.0385e-01,  2.9575e-01,  9.8069e-01, -4.8630e-01, -5.4293e-01,
          3.1609e-01, -2.7550e-01, -1.5541e-01,  5.3723e-01, -3.1088e-01,
          3.7762e-01,  3.1345e-01,  2.3856e-01, -3.2789e-02,  2.6841e-01,
          3.0620e-01,  2.6845e-02, -4.4528e-02,  2.0542e-01,  5.1280e-01,
          2.6570e-01,  2.5755e-02,  2.2840e-01, -3.7274e-02, -8.3962e-02,
          6.0879e-02,  4.1199e-01, -6.4230e-02, -4.2746e-01,  1.6147e-01,
         -1.8045e-02, -9.0395e-02, -2.7217e-01, -1.9688e-01,  6.2457e-01,
         -5.9612e-01, -2.4330e-01,  7.6740e-02, -7.4108e-02, -1.0950e-01,
         -4.4042e-01,  3.3249e-01, -6.5951e-02,  5.8267e-01,  1.4368e-01,
         -5.7391e-02,  3.0068e-01, -3.2195e-01, -2.0049e-01,  4.9362e-01,
          8.4186e-01,  2.0295e-01, -2.9756e-01, -5.3815e-01, -3.7432e-02,
          2.9172e-01,  1.1888e-01,  1.8524e-02, -2.8099e-01,  1.0474e-01,
          6.3735e-01,  1.3159e-01,  5.

In [11]:
# normalize embeddings
embeddings_norm = F.normalize(embeddings, p=2, dim=1)
embeddings_norm

tensor([[ 3.0101e-02,  3.9491e-02,  2.4610e-02, -1.3417e-02,  1.1784e-02,
          1.0130e-02,  2.8848e-02,  9.5659e-02, -4.7434e-02, -5.2958e-02,
          3.0832e-02, -2.6872e-02, -1.5159e-02,  5.2402e-02, -3.0324e-02,
          3.6834e-02,  3.0574e-02,  2.3269e-02, -3.1983e-03,  2.6181e-02,
          2.9867e-02,  2.6185e-03, -4.3434e-03,  2.0037e-02,  5.0020e-02,
          2.5917e-02,  2.5122e-03,  2.2279e-02, -3.6358e-03, -8.1899e-03,
          5.9383e-03,  4.0186e-02, -6.2651e-03, -4.1695e-02,  1.5750e-02,
         -1.7601e-03, -8.8173e-03, -2.6548e-02, -1.9204e-02,  6.0921e-02,
         -5.8147e-02, -2.3732e-02,  7.4854e-03, -7.2286e-03, -1.0681e-02,
         -4.2960e-02,  3.2431e-02, -6.4330e-03,  5.6835e-02,  1.4014e-02,
         -5.5980e-03,  2.9329e-02, -3.1404e-02, -1.9557e-02,  4.8148e-02,
          8.2117e-02,  1.9796e-02, -2.9024e-02, -5.2493e-02, -3.6512e-03,
          2.8455e-02,  1.1596e-02,  1.8069e-03, -2.7409e-02,  1.0217e-02,
          6.2168e-02,  1.2835e-02,  5.

In [12]:
embeddings_norm[0]

tensor([ 3.0101e-02,  3.9491e-02,  2.4610e-02, -1.3417e-02,  1.1784e-02,
         1.0130e-02,  2.8848e-02,  9.5659e-02, -4.7434e-02, -5.2958e-02,
         3.0832e-02, -2.6872e-02, -1.5159e-02,  5.2402e-02, -3.0324e-02,
         3.6834e-02,  3.0574e-02,  2.3269e-02, -3.1983e-03,  2.6181e-02,
         2.9867e-02,  2.6185e-03, -4.3434e-03,  2.0037e-02,  5.0020e-02,
         2.5917e-02,  2.5122e-03,  2.2279e-02, -3.6358e-03, -8.1899e-03,
         5.9383e-03,  4.0186e-02, -6.2651e-03, -4.1695e-02,  1.5750e-02,
        -1.7601e-03, -8.8173e-03, -2.6548e-02, -1.9204e-02,  6.0921e-02,
        -5.8147e-02, -2.3732e-02,  7.4854e-03, -7.2286e-03, -1.0681e-02,
        -4.2960e-02,  3.2431e-02, -6.4330e-03,  5.6835e-02,  1.4014e-02,
        -5.5980e-03,  2.9329e-02, -3.1404e-02, -1.9557e-02,  4.8148e-02,
         8.2117e-02,  1.9796e-02, -2.9024e-02, -5.2493e-02, -3.6512e-03,
         2.8455e-02,  1.1596e-02,  1.8069e-03, -2.7409e-02,  1.0217e-02,
         6.2168e-02,  1.2835e-02,  5.5129e-02, -7.5

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/e5-small-v2')
input_texts = [
    sentence
]
embedding_st = model.encode(input_texts, normalize_embeddings=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
embedding_st

array([[-5.50028048e-02,  4.06924548e-04,  2.23011617e-02,
         3.27239670e-02,  7.51231192e-03,  9.82137769e-03,
         8.53741691e-02, -6.70386478e-02,  3.10061667e-02,
         7.27371424e-02,  7.53616616e-02, -1.66554824e-02,
        -9.08455532e-03,  1.01325316e-02,  7.41213001e-03,
        -2.29131803e-02, -4.11272421e-03,  6.22413447e-03,
        -1.28571197e-01,  1.55947404e-02,  8.98244902e-02,
        -3.91689278e-02,  2.78747417e-02, -4.41704728e-02,
        -5.29532507e-02, -9.28498060e-03,  2.93835327e-02,
         3.73574458e-02, -5.47523461e-02, -9.65574160e-02,
        -7.89923221e-02, -3.68110277e-02, -5.76502504e-03,
        -2.65847743e-02,  5.14036492e-02, -2.76546981e-02,
        -1.70333748e-04,  5.95797002e-02,  4.28500213e-02,
         4.19764034e-02, -2.50484254e-02, -2.47685499e-02,
         5.28421029e-02, -4.95721996e-02, -8.55890010e-03,
        -4.08470444e-02, -3.59404273e-02, -5.82969822e-02,
         9.98011827e-02,  1.78730804e-02, -4.56400886e-0

## Armazenando os Embeddings

In [2]:
# Vamos importar dados
import numpy as np
import pandas as pd
from tqdm import tqdm
data = pd.read_parquet("data.parquet")

In [3]:
df = data.sample(n=100)

In [4]:
df

Unnamed: 0,text,label
120181,i feel that i am a submissive,0
162583,i do not feel cute like those cute pregnant girls,1
149062,ive been feeling lately like my therapist is a...,3
291718,i feel like such a selfish bitch for complaini...,3
271099,i missed a week for the blog but at the moment...,1
...,...,...
149978,i was feeling pretty jealous,3
341097,im feeling this way because i am not by nature...,2
91765,im feeling generous because it made my kids happy,2
212723,i wasnt sure about i feel like there are two s...,1


In [5]:
batch_dict = tokenizer([df.iloc[0]["text"]], max_length=512, padding=True, truncation=True, return_tensors='pt')
outputs = model(**batch_dict)
outputs[1][0]

NameError: name 'tokenizer' is not defined

In [15]:
def generate_embedding(text):
      batch_dict = tokenizer([text], max_length=512, padding=True, truncation=True, return_tensors='pt')
      outputs = model(**batch_dict)
      embedding = outputs[1][0].detach().numpy()  # Pegar a saída do último vetor
      return embedding

In [16]:
embeddings = [generate_embedding(text) for text in df["text"]]

In [17]:
dimension = embeddings[0].shape[0]  # Dimensão do vetor
index = faiss.IndexFlatL2(dimension)  # Indexação com métrica de L2

In [18]:
# Adicionar embeddings ao índice
index.add(np.array(embeddings).astype("float32"))

# Armazenar os labels em um array
labels = df["label"].values

In [19]:
print(df.iloc[0]["text"], "(", df.iloc[0]["label"], ")")

im feeling a little bit blank from being sick ( 0 )


In [None]:
# Exemplo de consulta no FAISS
query_text = df.iloc[0]["text"]  # Exemplo de texto de consulta
query_embedding = generate_embedding(query_text).astype("float32").reshape(1, -1)

# Recuperar os k vizinhos mais próximos
k = 5
distances, indices = index.search(query_embedding, k)

In [None]:
# Exibir resultados
for i, idx in enumerate(indices[0]):
    print(f"Texto: {df.iloc[idx]['text']}, Label: {df.iloc[idx]['label']}, Distância: {distances[0][i]}")

Texto: when i met friends i had not seen for the last years, Label: 1, Distância: 0.0
Texto: i would feel angry and lost and alone in the future, Label: 3, Distância: 7.012348651885986
Texto: i have been feeling incredibly emotional, Label: 0, Distância: 7.645660877227783
Texto: i feel safe and confident about this treatment, Label: 1, Distância: 8.112236976623535
Texto: i feel for this little pound lovely is truly a gift, Label: 2, Distância: 8.96358871459961


## Master Image Embeddings & Vector Analysis

In [None]:
!pip install umap-learn



In [None]:
import umap
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

AttributeError: partially initialized module 'torch' has no attribute 'types' (most likely due to a circular import)

In [None]:
def generate_embedding_2(text):
    inputs = clip_tokenizer([text], return_tensors="pt", truncation=True, padding=True)
    outputs = clip_model.get_text_features(**inputs)
    embedding = outputs.cpu().numpy().flatten()
    return embedding

In [None]:
# Reduzir dimensionalidade com UMAP
umap_reducer = umap.UMAP(n_neighbors=5, min_dist=0.3, n_components=2, random_state=42)
embeddings_umap = umap_reducer.fit_transform(embeddings)

# Reduzir dimensionalidade com T-SNE
tsne_reducer = TSNE(n_components=2, perplexity=30, random_state=42)
embeddings_tsne = tsne_reducer.fit_transform(embeddings)

AttributeError: module 'umap' has no attribute 'UMAP'