In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
d_emb = 1024

pad = torch.from_numpy(np.load('/workspace/eloise/sentemb/data/pad.npy')).to(device)

cos = nn.CosineSimilarity(dim=-1)
cossim = nn.CosineEmbeddingLoss()
mse = nn.MSELoss()

def criterion(output, target):
    
    mask = ~(target == pad).all(dim=-1) #[batch, seq]
    target = target[mask]
    output = output[mask]
    
    cossim_ = cossim(output, target, torch.full((1,), 1).to(device))
    mse_ = mse(output, target)
    return [cossim_, mse_]


def autoregr_infer(model, valid_tensors, n_parag):
    filler = 20-n_parag
    input_tensor = torch.stack([tensor[:n_parag] for tensor in valid_tensors])
    model.eval()
    with torch.no_grad():
        autoregr = input_tensor.to(device)
        for i in range(filler):
            #output = model(autoregr)[:, -1].unsqueeze(1)
            output = model.predict_next_sentence(autoregr)[:, -1].unsqueeze(1)
            autoregr = torch.cat((autoregr, output), dim=1)
    return autoregr[:, n_parag:]


def calculate_score(output, targets, filler):

    score_one_sum = 0
    score_sum_pad = torch.zeros(filler).to(device)
    pad_nbr_sum = torch.zeros(filler).to(device)

    for batch, target in enumerate(targets):
        out = output[batch][:len(target)].to(device)
        
        score = cos(out, target)

        score_one_sum += score.mean()
        score_sum_pad += F.pad(score, (0, filler - len(score)))
        pad_nbr_sum += F.pad(torch.ones(len(score)), (0, filler - len(score))).to(device)

    paragraphed_score = score_sum_pad/pad_nbr_sum
    paragraphed_score = [round(elem.item(), 2) for elem in paragraphed_score]
    final_score = score_one_sum.item()/len(targets)
    return final_score, paragraphed_score


sonarprompt, sonaroutput, _ = torch.load('data/test_sonarprompt_sonaroutput_jasperoutput.pth')

sonardata = []
for i in range(len(sonarprompt)):
    sonardata.append(torch.cat((sonarprompt[i], sonaroutput[i]), dim=0))



from lcm.models.base_lcm.archs import base_lcm_max_ray
from lcm.models.base_lcm.builder import BaseLCModelBuilder

config = {
    "model_dim":2048,
    "model_arch":[11, 4]
}

model = BaseLCModelBuilder(base_lcm_max_ray(config), device=device).build_model('lolo')
model.load_state_dict(torch.load('Base_LCM_0.53_0.5_2048_4_11.pth')[0])

seq = 20

  sonarprompt, sonaroutput, _ = torch.load('data/test_sonarprompt_sonaroutput_jasperoutput.pth')
  model.load_state_dict(torch.load('Base_LCM_0.53_0.5_2048_4_11.pth')[0])


SCORE

In [2]:
list_final_score = []

dataloop = sonardata[:]

for n_parag in range(1, seq):
    
    filler=seq-n_parag

    valid_tensors = [tensor for tensor in dataloop if tensor.size(0) > n_parag]

    output = autoregr_infer(model, valid_tensors, n_parag)
    
    targets = [tensor[n_parag:] for tensor in valid_tensors]
    
    final_score, paragraphed_score = calculate_score(output, targets, filler)
    
    list_final_score.append(final_score)
    
    print(str(round(final_score, 3))+ ' '+ n_parag*6*' '  +str(paragraphed_score))

round(np.average(list_final_score), 3)

0.53       [0.78, 0.6, 0.59, 0.54, 0.48, 0.45, 0.42, 0.44, 0.44, 0.41, 0.4, 0.4, 0.37, 0.38, 0.4, 0.38, 0.38, 0.37, 0.36]
0.513             [0.61, 0.6, 0.54, 0.48, 0.45, 0.43, 0.44, 0.45, 0.41, 0.4, 0.4, 0.37, 0.38, 0.41, 0.39, 0.37, 0.39, 0.38]
0.518                   [0.63, 0.57, 0.5, 0.47, 0.44, 0.45, 0.45, 0.41, 0.4, 0.41, 0.38, 0.39, 0.41, 0.38, 0.38, 0.39, 0.38]
0.519                         [0.59, 0.55, 0.51, 0.47, 0.47, 0.5, 0.43, 0.42, 0.44, 0.42, 0.4, 0.41, 0.39, 0.38, 0.39, 0.39]
0.504                               [0.59, 0.53, 0.5, 0.48, 0.51, 0.44, 0.43, 0.45, 0.43, 0.42, 0.43, 0.41, 0.4, 0.42, 0.42]
0.508                                     [0.61, 0.56, 0.51, 0.53, 0.46, 0.48, 0.49, 0.46, 0.44, 0.44, 0.41, 0.42, 0.42, 0.39]
0.509                                           [0.61, 0.55, 0.56, 0.47, 0.5, 0.5, 0.46, 0.46, 0.46, 0.41, 0.42, 0.43, 0.39]
0.51                                                 [0.6, 0.59, 0.49, 0.51, 0.51, 0.48, 0.47, 0.47, 0.41, 0.42, 0.44, 0.4]
0.5

0.533

NLP

In [10]:
size = 20
n_parag = 5

dataloop = sonardata[:size]
filler=seq-n_parag
valid_tensors = [tensor for tensor in dataloop if tensor.size(0) > n_parag]
    
output_autoregr = autoregr_infer(model, valid_tensors, n_parag)

from sonar.inference_pipelines.text import EmbeddingToTextModelPipeline
vec2text_model = EmbeddingToTextModelPipeline(decoder="text_sonar_basic_decoder", tokenizer="text_sonar_basic_decoder", device=device)

In [11]:
#1000 => 87mn
#100 => 7mn
#10 => 38s
text_autoregr = []
for data in output_autoregr:
    sample = vec2text_model.predict(data, target_lang="eng_Latn")
    text_autoregr.append(sample)

In [12]:
test_data = torch.load('../datasets/fineweb_random.pth')[-1000:][:size]
test_data = [sous_liste[:seq] for sous_liste in test_data]
test_data = [tensor for tensor in test_data if len(tensor) > n_parag]

for i in test_data:
    if len(i) < seq:
        pad_need = seq-len(i)
        for j in range(pad_need):
            i.append('End of text.')

  test_data = torch.load('../datasets/fineweb_random.pth')[-1000:][:size]


In [22]:
i=15
print('Prompt:', test_data[i][0].rstrip('\n'), '\n')
for j in range(seq-1):
    print('Targt:', test_data[i][j+1].rstrip('\n'))
    if j>=n_parag-1:
        print('Infer:', text_autoregr[i][j-(n_parag-1)].rstrip('\n'), '\n')

IndexError: list index out of range