In [None]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
import numpy as np
from tqdm import tqdm
from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline

# SONAR text-to-embedding pipeline used to encode sentences.
t2vec_model = TextToEmbeddingModelPipeline(encoder="text_sonar_basic_encoder", tokenizer="text_sonar_basic_encoder", device=device)

# A padding vector is needed to bring every sequence to the same length;
# the embedding of the sentence 'End of text.' is used for that purpose.
pad = t2vec_model.predict(['End of text.'], source_lang="eng_Latn")[0]
# Move to CPU before converting: calling .numpy() on a CUDA tensor raises a TypeError.
np.save('/LCM/data/pad.npy', pad.cpu().numpy())

We transform the raw text data into PyTorch tensors (SONAR sentence embeddings):

In [None]:
# fineweb_random is my text data file; use your own text dataset instead.
# Each sample is passed as-is to the SONAR pipeline, so it is presumably a
# list of sentences — TODO confirm against your data.
dataset = torch.load('fineweb_random.pth')

# Encode every sample with SONAR. Each result is moved to the CPU right away
# so the embeddings do not accumulate in GPU memory.
data_vect = []
for sample in tqdm(dataset):
    data_vect.append(t2vec_model.predict(sample, source_lang="eng_Latn").cpu())

# data_vect is a Python list (it has no .numpy() method) and its tensors may
# have different numbers of rows, so it is stored as a 1-D object array filled
# element by element (np.array(...) could try to broadcast equal-shaped items).
# Such a file has to be read back with np.load(..., allow_pickle=True).
data_vect_np = np.empty(len(data_vect), dtype=object)
for idx, emb in enumerate(data_vect):
    data_vect_np[idx] = emb.numpy()
np.save('/LCM/data/dataset_vect.npy', data_vect_np, allow_pickle=True)

We pad and truncate the dataset:

In [None]:
def pad_tensor(tensor, max_len=20, pad_vector=None):
    """Pad or truncate a sequence of embeddings to exactly ``max_len`` rows.

    Args:
        tensor: tensor whose first dimension is the sequence length; it is
            expected to be 2-D (seq_len, embedding_dim) so that the padding
            rows can be concatenated to it.
        max_len: number of rows of the returned tensor.
        pad_vector: embedding repeated to fill missing rows. When ``None``
            (default) the module-level ``pad`` vector is used, which keeps the
            original call signature working.

    Returns:
        A tensor with ``max_len`` rows: the input followed by copies of the
        padding vector when it is too short, or its first ``max_len`` rows
        when it is too long.
    """
    seq_len = tensor.size(0)
    if seq_len < max_len:
        if pad_vector is None:
            pad_vector = pad
        # Put the padding on the same device as the input so torch.cat works
        # whether the caller passed a CPU or a GPU tensor.
        padding = pad_vector.to(tensor.device).expand(max_len - seq_len, -1)
        tensor = torch.cat([tensor, padding], dim=0)
    return tensor[:max_len]


# The dataset is split into 10 files to limit memory usage.
# The bounds below assume about 1,000,000 encoded samples — TODO confirm for your data.
for i in range(10):
    beg = i * 100000
    end = (i + 1) * 100000
    if i == 9:  # the last 2000 samples are kept for the validation and test sets
        end = 998000
    # Slice the SONAR embeddings (data_vect), not the raw text in `dataset`:
    # pad_tensor works on tensors.
    data = data_vect[beg:end]
    tensor_list_padded = torch.stack([pad_tensor(tensor.to(device)) for tensor in data])

    # Move to CPU before converting: .numpy() is not available on CUDA tensors.
    np.save('/LCM/data/'+str(i)+'00k.npy', tensor_list_padded.cpu().numpy())

The first file we will load during training contains the training samples and the validation set:

In [None]:
# Validation set: the 1000 samples just before the last 1000 (kept for testing).
# They are taken from the SONAR embeddings and padded/truncated exactly like the
# training shards, so `val` is a regular numeric array like `train`.
val = torch.stack([pad_tensor(tensor.to(device)) for tensor in data_vect[-2000:-1000]])
# NOTE(review): this file goes to '/data/' while the other files use '/LCM/data/' — confirm this is intended.
np.savez('/data/100k_1k_0.npz', train=np.load('/LCM/data/100k.npy'), val=val.cpu().numpy())

Test file creation

In [None]:
import torch
import numpy as np
# Load the SONAR embeddings of the whole dataset.
# allow_pickle=True lets NumPy read an object array (one entry per sample, used
# when samples have different lengths); a regular numeric array is loaded
# exactly as before. Only enable it for files you trust: unpickling can run code.
dataset_np = np.load('/LCM/data/dataset_vect.npy', allow_pickle=True)
if dataset_np.dtype == object:
    dataset = [torch.as_tensor(doc) for doc in dataset_np]
else:
    dataset = torch.from_numpy(dataset_np)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
t2vec_model = TextToEmbeddingModelPipeline(encoder="text_sonar_basic_encoder", tokenizer="text_sonar_basic_encoder", device=device)

# The last 1000 samples form the test set.
test_sonar = dataset[-1000:]

# Keep at most the first 20 embeddings of every sample.
test_sonar = [sous_liste[:20] for sous_liste in test_sonar]

# The first embedding of a sample is the prompt, the remaining ones are the
# expected output.
prompt = [sample[0] for sample in test_sonar]
output = [sample[1:] for sample in test_sonar]

# Stack the prompts and add a length-1 sequence axis.
prompt = torch.stack(prompt).unsqueeze(1)

In [None]:
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
# Jasper is a text encoder: SentenceTransformer.encode expects sentences, not
# SONAR embedding tensors, so the raw text dataset is loaded here (the same
# file that was encoded with SONAR earlier in this notebook).
dataset = torch.load('fineweb_random.pth')
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

jasper = SentenceTransformer("infgrad/jasper_en_vision_language_v1",
    trust_remote_code=True,
    device=device,
    # device.type also matches indexed devices such as 'cuda:0'
    model_kwargs={"torch_dtype":  torch.bfloat16 if device.type == 'cuda' else torch.float32},
)
jasper.max_seq_length = 1024

# NOTE(review): only the first 10 of the last 1000 samples are encoded, while
# the SONAR test set uses all 1000 — confirm whether [:10] is a debugging leftover.
data_jasper = dataset[-1000:][:10]

# Sentences 1..19 of each sample: the part that follows the first (prompt) sentence.
data_jasper = [sous_liste[1:20] for sous_liste in data_jasper]

# Encode the output sentences of every sample with Jasper.
test_jasper = []
for sample in data_jasper:
    enc = jasper.encode(sample)
    test_jasper.append(torch.from_numpy(enc).to(device))

In [None]:
# Store the SONAR prompts, the SONAR outputs and the Jasper outputs together
# in a single test file.
torch.save(
    obj=[prompt, output, test_jasper],
    f='/LCM/data/test_sonarprompt_sonaroutput_jasperoutput.pth',
)