In [None]:
import os

import numpy as np
import pandas as pd
import torch
from google.colab import drive
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [None]:
# Mount Google Drive only when it is not mounted yet, so the notebook can be
# run top-to-bottom on a fresh kernel without failing on the first file read.
if not os.path.isdir('/content/drive/MyDrive'):
    drive.mount('/content/drive')

# Project folder on Drive: the test dataset is read from here and the
# benchmark CSV is written back to it.
folder = '/content/drive/MyDrive/conserta-avioes'

## Métricas de Teste

In [None]:
# Read the test set from the project folder and order the rows by group id,
# so every original sentence sits together with its variations.
dataset_teste = (
    pd.read_csv(folder+"/dataset_teste.csv")
    .sort_values(by='Grupo')
)
dataset_teste

Unnamed: 0,Grupo,Frase,Tipo
0,6,RIGHT ENGINE #4 AIR BAFFLE IS CRACKED,orig
163,6,Found a crack in the left engine’s #3 air baff...,var
165,6,Left engine #4 air baffle is cracked.,var
164,6,The air baffle on engine #2 (right side) shows...,var
162,6,Cracked air baffle detected on the right engin...,var
...,...,...,...
809,1621,Cylinder No.2 on the right engine showed a 40/...,var
161,1621,RIGHT ENGINE #4 CYLINDER COMPRESSION 40/80.,orig
806,1621,Compression on the right engine’s No.4 cylinde...,var
808,1621,Left engine cylinder 4 compression was recorde...,var


In [None]:
# Retrieval corpus: the original ('orig') sentences, in the group order
# established when the dataset was sorted.
eh_original = dataset_teste['Tipo'] == 'orig'
corpus = dataset_teste.loc[eh_original, 'Frase'].tolist()
corpus[:20]

['RIGHT ENGINE #4 AIR BAFFLE IS CRACKED',
 'RIGHT ENGINE FORWARD ALTERNATOR ATTACH BOLT LOOSE.',
 'AFTER LANDING, A/C IDLE @ 970 RPM.',
 'TOP FRONT RIGHT BAFFLE, BAFFLE SEAL RIVET PULLED THROUGH.',
 'ROUGH RUNNING ENGINE ON START. ENGINE RAN SMOOTHER AS IT WAR',
 'CYLINDER HEAD TEMPERATURE NEEDLE BOUNCES & HAD ENGINE RUN ROUGH MOMENTARILY.',
 'LACING CORD LOOSE ON SCAT TUBING + IGNITION LEAD TO FRAME, RIGHT SI',
 'RIGHT SIDE BACK BAFFLE IS CRACKED & BRACKET RIVETS BROKEN.',
 'SPARK PLUG BAFFLE PLUG IS WORN.',
 '4TH STAGE NOZZLE HAS SEVERAL CRACKS IN IT.',
 '#1 FORWARD BAFFLE IS WORN THROUGH.',
 'ENGINE RUNS ROUGH ON START. MAINTENANCE HAND SPUN PROP & REPORTED 1 CYLINDER',
 'RAN ENGINE START CKLIST 3 TIMES, TRIED COLD START CKLIST 2 T',
 'BACK RIGHT BAFFLE BACK BRACKET RIVET HEAD IS SHEARED OFF.',
 'AFT BAFFLE BRACKET RIVETS PULLING THROUGH.',
 'FRONT BAFFLE BY INTAKE HEAT, AROUND MUFFLER SHROUD, IS CRACK',
 'RIGHT FRONT BAFFLE SEAL RIVET IS BROKEN.',
 'CYLINDER #2 INTAKE PUSH ROD TUBE

In [None]:
# Queries: the variation ('var') sentences, upper-cased and stripped of
# surrounding whitespace. The vectorised .str accessor replaces the
# per-element lambda.
eh_variacao = dataset_teste['Tipo'] == 'var'
queries = (
    dataset_teste.loc[eh_variacao, 'Frase']
    .str.upper()
    .str.strip()
    .to_list()
)
queries[:20]

['FOUND A CRACK IN THE LEFT ENGINE’S #3 AIR BAFFLE DURING INSPECTION.',
 'LEFT ENGINE #4 AIR BAFFLE IS CRACKED.',
 'THE AIR BAFFLE ON ENGINE #2 (RIGHT SIDE) SHOWS A CRACK.',
 'CRACKED AIR BAFFLE DETECTED ON THE RIGHT ENGINE, #4.',
 'NO. 1 ENGINE FORWARD ALTERNATOR ATTACH BOLT IS LOOSE.',
 'ON ENGINE #2 THE FORWARD BOLT THAT SECURES THE ALTERNATOR IS LOOSE.',
 'FOUND THE ALTERNATOR FORWARD MOUNTING BOLT LOOSE ON THE LEFT ENGINE.',
 'THE FORWARD ALTERNATOR ATTACH BOLT ON THE RIGHT ENGINE WAS FOUND LOOSE.',
 'POST-LANDING, ENGINE IDLE SPEED WAS RECORDED AT APPROXIMATELY 980 RPM.',
 'AFTER LANDING, A/C IDLE AT 970 RPM.',
 'UPON TOUCHDOWN THE AIRPLANE REMAINED IDLING AROUND 960 RPM.',
 'AFTER ROLLOUT THE A/C SETTLED TO AN IDLE NEAR 975 R/MIN.',
 'ON THE TOP-FRONT LEFT BAFFLE, THE SEAL RIVET WORKED ITS WAY THROUGH THE MATERIAL.',
 'THE TOP FRONT RIGHT BAFFLE SEAL RIVET HAS PULLED THROUGH THE BAFFLE.',
 'INSPECTION REVEALED THE TOP FRONT RIGHT BAFFLE RIVET TORN THROUGH THE SEAL.',
 'FOUND THE

Conferindo as quantidades

In [None]:
# Number of original sentences in the retrieval corpus.
len(corpus)

162

In [None]:
# Number of variation sentences used as queries.
len(queries)

648

In [None]:
# Expected number of queries: four variations per original sentence.
# Derived from the corpus size instead of a hard-coded count.
len(corpus)*4

648

Gerando o "gabarito": uma matriz (queries × corpus) em que o valor 1 marca, para cada query, a posição no corpus da frase original que deve ser encontrada.

In [None]:
# Relevance matrix ("answer key"): gabarito[i, j] is 1.0 when query i and
# corpus sentence j belong to the same group, 0.0 otherwise.
# It is built from the group labels rather than from hard-coded counts, so it
# stays correct if the number of groups or of variations per group changes.
# The selections mirror the ones used to build `corpus` and `queries`, which
# keeps rows and columns aligned with those lists.
grupos_corpus = dataset_teste.loc[dataset_teste['Tipo']=='orig','Grupo'].to_numpy()
grupos_queries = dataset_teste.loc[dataset_teste['Tipo']=='var','Grupo'].to_numpy()
gabarito = (grupos_queries[:, None] == grupos_corpus[None, :]).astype(float)
gabarito

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [None]:
# Sanity check: one row per query and one column per corpus sentence.
gabarito.shape

(648, 162)

In [None]:
def recall_at_k(model, k, corpus_textos=None, queries_textos=None, relevancia=None):
    """Fraction of queries whose expected corpus sentence is ranked in the top k.

    Parameters
    ----------
    model
        Object exposing ``encode(texts, convert_to_tensor=True,
        show_progress_bar=False)`` and ``similarity(query_emb, corpus_emb)``,
        as the SentenceTransformer instances used in this notebook do.
    k : int
        Cut-off: a query counts as found when its expected sentence has a
        0-based rank smaller than ``k``.
    corpus_textos, queries_textos : list of str, optional
        Sentences to index and to search with. Default to the notebook-level
        ``corpus`` and ``queries``.
    relevancia : array-like, optional
        Matrix of shape (n_queries, n_corpus); non-zero entries mark the
        expected corpus sentence of each query. Defaults to the notebook-level
        ``gabarito``.

    Returns
    -------
    float
        Number of expected sentences found in the top k, divided by the number
        of queries. This is a recall when each query has exactly one expected
        sentence.

    Raises
    ------
    ValueError
        If there are no queries, or if ``relevancia`` does not have shape
        (n_queries, n_corpus).
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if corpus_textos is None:
        corpus_textos = corpus
    if queries_textos is None:
        queries_textos = queries
    if relevancia is None:
        relevancia = gabarito

    n_queries = len(queries_textos)
    if n_queries == 0:
        raise ValueError("recall_at_k needs at least one query")

    relevancia = np.asarray(relevancia, dtype=bool)
    forma_esperada = (n_queries, len(corpus_textos))
    if relevancia.shape != forma_esperada:
        raise ValueError(
            f"relevance matrix has shape {relevancia.shape}, expected {forma_esperada}"
        )

    # Inference only: no gradients are needed for encoding or scoring.
    with torch.no_grad():
        corpus_emb = model.encode(corpus_textos, convert_to_tensor=True, show_progress_bar=False)
        query_emb = model.encode(queries_textos, convert_to_tensor=True, show_progress_bar=False)
        sims = model.similarity(query_emb, corpus_emb)
    sims_np = sims.cpu().numpy()

    # Double argsort turns scores into per-row ranks (0 = most similar).
    ranks = np.argsort(np.argsort(-sims_np, axis=1), axis=1)
    topk = ranks < k
    encontradas = np.logical_and(topk, relevancia).sum()
    # Denominator follows the data instead of a hard-coded query count.
    return encontradas / n_queries

## Benchmark

In [None]:
# Checkpoints to benchmark, grouped by family. The order matters: the
# benchmark table slices the first four entries of the final list.
modelos_gerais = [
    "all-mpnet-base-v2",
    "all-distilroberta-v1",
    "all-MiniLM-L6-v2",
    "all-MiniLM-L12-v2",
]
modelos_qa_dot = [
    "multi-qa-mpnet-base-dot-v1",
    "multi-qa-distilbert-dot-v1",
    "multi-qa-MiniLM-L6-dot-v1",
]
modelos_qa_cos = [
    "multi-qa-mpnet-base-cos-v1",
    "multi-qa-distilbert-cos-v1",
    "multi-qa-MiniLM-L6-cos-v1",
]
lista_modelos = modelos_gerais + modelos_qa_dot + modelos_qa_cos

In [None]:
# Load every checkpoint once up front, keyed by its name, so the benchmark
# loop can look models up instead of re-instantiating them.
modelos = {}
for nome_modelo in lista_modelos:
    modelos[nome_modelo] = SentenceTransformer(nome_modelo)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

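Não estou mais usando a 'Distância': todas as similaridades são calculadas com a `similarity()` do próprio modelo. Por isso a coluna 'Distância' abaixo é apenas um rótulo, e os modelos listados duas vezes (linhas 'cos' e 'dot') têm resultados idênticos.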

In [None]:
# Results table: one row per (model, similarity label) pair. The Recall@k
# columns start empty and are filled in by the evaluation loop.
benchmark = pd.DataFrame(columns=['Modelo','Distância','Recall@1','Recall@3','Recall@7'])
# The first four models are listed twice: once on their own and once as part
# of the full list, giving 14 rows in total.
benchmark['Modelo'] = lista_modelos[:4]+lista_modelos
# 4 'cos' rows, then 7 'dot' rows, then 3 'cos' rows (14 labels, one per row).
# NOTE(review): the evaluation loop passes only the model and k to
# recall_at_k, so this label does not select the similarity function and the
# repeated models receive identical scores.
benchmark['Distância'] = ['cos']*4+['dot']*7+['cos']*3
benchmark

Unnamed: 0,Modelo,Distância,Recall@1,Recall@3,Recall@7
0,all-mpnet-base-v2,cos,,,
1,all-distilroberta-v1,cos,,,
2,all-MiniLM-L6-v2,cos,,,
3,all-MiniLM-L12-v2,cos,,,
4,all-mpnet-base-v2,dot,,,
5,all-distilroberta-v1,dot,,,
6,all-MiniLM-L6-v2,dot,,,
7,all-MiniLM-L12-v2,dot,,,
8,multi-qa-mpnet-base-dot-v1,dot,,,
9,multi-qa-distilbert-dot-v1,dot,,,


In [None]:
# Evaluate every row of the benchmark table and store Recall@1/3/7 in it.
for i in tqdm(range(benchmark.shape[0])):
  modelo = benchmark.iloc[i,0]
  print(f"Modelo: {modelo}")
  # Look the model up and switch it to eval mode once per row rather than
  # once per k; neither step depends on k.
  modelo_pronto = modelos[modelo]
  modelo_pronto.eval()
  resultados = []
  for k in [1,3,7]:
    resultado = recall_at_k(modelo_pronto,k)
    print(f"Recall@{k}: {resultado}")
    resultados.append(resultado)
  # Columns 2 onward are Recall@1, Recall@3, Recall@7, in the same order as k.
  benchmark.iloc[i,2:] = resultados

  0%|          | 0/14 [00:00<?, ?it/s]

Modelo: all-mpnet-base-v2
Recall@1: 0.8657407407407407
Recall@3: 0.9552469135802469


  7%|▋         | 1/14 [00:03<00:47,  3.64s/it]

Recall@7: 0.9830246913580247
Modelo: all-distilroberta-v1
Recall@1: 0.8256172839506173
Recall@3: 0.9058641975308642


 14%|█▍        | 2/14 [00:05<00:33,  2.77s/it]

Recall@7: 0.9429012345679012
Modelo: all-MiniLM-L6-v2
Recall@1: 0.9012345679012346
Recall@3: 0.9675925925925926


 21%|██▏       | 3/14 [00:06<00:20,  1.82s/it]

Recall@7: 0.9861111111111112
Modelo: all-MiniLM-L12-v2
Recall@1: 0.9135802469135802
Recall@3: 0.9768518518518519


 29%|██▊       | 4/14 [00:07<00:15,  1.50s/it]

Recall@7: 0.9938271604938271
Modelo: all-mpnet-base-v2
Recall@1: 0.8657407407407407
Recall@3: 0.9552469135802469


 36%|███▌      | 5/14 [00:10<00:17,  1.96s/it]

Recall@7: 0.9830246913580247
Modelo: all-distilroberta-v1
Recall@1: 0.8256172839506173
Recall@3: 0.9058641975308642


 43%|████▎     | 6/14 [00:12<00:15,  1.98s/it]

Recall@7: 0.9429012345679012
Modelo: all-MiniLM-L6-v2
Recall@1: 0.9012345679012346
Recall@3: 0.9675925925925926


 50%|█████     | 7/14 [00:12<00:10,  1.56s/it]

Recall@7: 0.9861111111111112
Modelo: all-MiniLM-L12-v2
Recall@1: 0.9135802469135802
Recall@3: 0.9768518518518519


 57%|█████▋    | 8/14 [00:14<00:08,  1.39s/it]

Recall@7: 0.9938271604938271
Modelo: multi-qa-mpnet-base-dot-v1
Recall@1: 0.9197530864197531
Recall@3: 0.9629629629629629


 64%|██████▍   | 9/14 [00:16<00:09,  1.84s/it]

Recall@7: 0.9845679012345679
Modelo: multi-qa-distilbert-dot-v1
Recall@1: 0.8904320987654321
Recall@3: 0.9660493827160493


 71%|███████▏  | 10/14 [00:18<00:06,  1.70s/it]

Recall@7: 0.9922839506172839
Modelo: multi-qa-MiniLM-L6-dot-v1
Recall@1: 0.8919753086419753
Recall@3: 0.9614197530864198


 79%|███████▊  | 11/14 [00:18<00:04,  1.38s/it]

Recall@7: 0.9845679012345679
Modelo: multi-qa-mpnet-base-cos-v1
Recall@1: 0.9027777777777778
Recall@3: 0.9614197530864198


 86%|████████▌ | 12/14 [00:21<00:03,  1.82s/it]

Recall@7: 0.9861111111111112
Modelo: multi-qa-distilbert-cos-v1
Recall@1: 0.9074074074074074
Recall@3: 0.9645061728395061


 93%|█████████▎| 13/14 [00:23<00:01,  1.69s/it]

Recall@7: 0.9938271604938271
Modelo: multi-qa-MiniLM-L6-cos-v1
Recall@1: 0.9012345679012346
Recall@3: 0.9598765432098766


100%|██████████| 14/14 [00:23<00:00,  1.70s/it]

Recall@7: 0.9876543209876543





In [None]:
# Show the benchmark table after the evaluation loop filled in the recall columns.
benchmark

Unnamed: 0,Modelo,Distância,Recall@1,Recall@3,Recall@7
0,all-mpnet-base-v2,cos,0.865741,0.955247,0.983025
1,all-distilroberta-v1,cos,0.825617,0.905864,0.942901
2,all-MiniLM-L6-v2,cos,0.901235,0.967593,0.986111
3,all-MiniLM-L12-v2,cos,0.91358,0.976852,0.993827
4,all-mpnet-base-v2,dot,0.865741,0.955247,0.983025
5,all-distilroberta-v1,dot,0.825617,0.905864,0.942901
6,all-MiniLM-L6-v2,dot,0.901235,0.967593,0.986111
7,all-MiniLM-L12-v2,dot,0.91358,0.976852,0.993827
8,multi-qa-mpnet-base-dot-v1,dot,0.919753,0.962963,0.984568
9,multi-qa-distilbert-dot-v1,dot,0.890432,0.966049,0.992284


In [None]:
# Persist the results table in the project folder, without the row index.
caminho_saida = f"{folder}/benchmark_inicial_corrigido.csv"
benchmark.to_csv(caminho_saida, index=False)