In [None]:
# Mount Google Drive at /content/drive; the later cells read the dataset archive,
# CSV and model checkpoint from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch.nn as nn
import torch
from datetime import datetime
import pandas as pd
from transformers import CLIPProcessor, CLIPModel, AutoImageProcessor, AutoModel, AutoTokenizer
import numpy as np

from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset, DataLoader, random_split, Subset


from pathlib import Path
from PIL import Image


## IMPORT

#### DATA

In [None]:
!unzip "/content/drive/MyDrive/tinyvlm/data/image-description-marketplace-data.zip" -d "/content/flip_data_vlm"

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
  inflating: /content/flip_data_vlm/flip_data_vlm/flip_data_vlm/category_4/images/Хаб_1163.jpg  
  inflating: /content/flip_data_vlm/flip_data_vlm/flip_data_vlm/category_4/images/Хаб_1166.jpg  
  inflating: /content/flip_data_vlm/flip_data_vlm/flip_data_vlm/category_4/images/Хаб_Promt_1243.jpg  
  inflating: /content/flip_data_vlm/flip_data_vlm/flip_data_vlm/category_4/images/Хаб_Quadro_Express_1438.jpg  
  inflating: /content/flip_data_vlm/flip_data_vlm/flip_data_vlm/category_4/images/Хаб_Quadro_Infix_1542.jpg  
  inflating: /content/flip_data_vlm/flip_data_vlm/flip_data_vlm/category_4/images/Хаб_Universal_RS050_5850.jpg  
  inflating: /content/flip_data_vlm/flip_data_vlm/flip_data_vlm/category_4/images/Цветной_чехол_на_IPhone_14_Pro_Max_с_функцией_MagS_881.jpg  
  inflating: /content/flip_data_vlm/flip_data_vlm/flip_data_vlm/category_4/images/Цветной_чехол_на_IPhone_14_Pro_Max_с_функцией_MagS_882.jpg  


In [None]:
# Read the combined, translated product table and keep only fully populated rows:
# dropna() with default arguments discards a row as soon as any column is missing.
all_products = pd.read_csv(
    "/content/drive/MyDrive/tinyvlm/data/all_products_combined_translated.csv"
).dropna()

In [None]:
def is_valid_image_path(path):
    """Return True when `path` points at an existing regular file, False otherwise.

    Directories, missing paths and the empty string all yield False.
    """
    # Path.is_file() is already False for directories, so no separate is_dir() test is needed.
    return Path(path).is_file()

def convert_to_colab_path(local_path):
    """Translate an image path recorded in the CSV into its location in the unzipped Colab data.

    Args:
        local_path: path string as stored in the ``local_image_path`` column.

    Returns:
        The path with its leading ``/root/flip/data/`` or ``/flo_images/`` prefix
        swapped for the corresponding Colab directory, or ``""`` when the path
        starts with neither prefix.
    """
    colab_root = "/content/flip_data_vlm/flip_data_vlm/flip_data_vlm/"
    prefix_map = (
        ("/root/flip/data/", colab_root),
        ("/flo_images/", colab_root + "flo_images/"),
    )
    for old_prefix, new_prefix in prefix_map:
        if local_path.startswith(old_prefix):
            # Swap only the leading prefix. str.replace() would also rewrite any later
            # occurrence of the same text inside the path and corrupt it.
            return new_prefix + local_path[len(old_prefix):]
    # Unknown layout: return an empty string, which can never be an existing image file.
    return ""

# Derive the Colab location of every product image from the path recorded in the CSV.
# Missing values become "" and everything is coerced to str before the mapping runs.
all_products["colab_image_path"] = all_products["local_image_path"].fillna("").astype(str).apply(convert_to_colab_path)

# Drop every row whose mapped path does not point at an existing file on disk.
all_products = all_products[all_products["colab_image_path"].apply(is_valid_image_path)]

In [None]:
# Shuffle all rows once with a fixed seed and flatten them into plain
# (colab_image_path, title_translated) tuples.
image_title_pairs = list(
    all_products[["colab_image_path", "title_translated"]]
    .sample(frac=1, random_state=42, replace=False)
    .itertuples(index=False, name=None)
)
len(image_title_pairs)

28447

In [None]:
class ImageTitleDataset(Dataset):
    """Map-style dataset over (image_path, text) pairs.

    Items are returned exactly as stored; no image file is opened here.
    """

    def __init__(self, data):
        """Keep a reference to `data`, a list of (image_path, text) tuples."""
        self.data = data

    def __len__(self):
        """Number of stored pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the pair at `index` as an (image_path, text) tuple."""
        path, title = self.data[index]
        return (path, title)

In [None]:
# Wrap the shuffled (image_path, title) tuples in the map-style dataset.
dataset = ImageTitleDataset(image_title_pairs)

In [None]:
# Spot-check one sample: displays its (image_path, title) tuple.
dataset[72]

('/content/flip_data_vlm/flip_data_vlm/flip_data_vlm/category_1/images/Протеиновые_брауни_в_шоколаде_со_вкусом_вишни_303.jpg',
 'Protein brows in chocolate with cherry taste')

In [None]:
# Iterate the dataset in its stored order (shuffle=False) in batches of 512 pairs.
general_dataloader = DataLoader(dataset, batch_size=512, shuffle=False)

#### CLIP

In [None]:
# The model weights and the matching processor are read from the same checkpoint
# directory on Drive (its name suggests a fine-tuned CLIP ViT-L/14 - confirm).
clip_checkpoint_dir = "/content/drive/MyDrive/tinyvlm/models/openai/clip_vit_large_patch14_finetuned"

processor = CLIPProcessor.from_pretrained(clip_checkpoint_dir)
model = CLIPModel.from_pretrained(clip_checkpoint_dir).to('cuda')

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


## CLIP INFERENCE

In [16]:

def _load_rgb_image(path):
    """Open the image at `path`, return an RGB copy and close the underlying file."""
    with Image.open(path) as img:
        return img.convert("RGB")


# One record per (image, title) pair: the path, the title and both CLIP embeddings.
data = []

# Run the inputs on whichever device the CLIP model lives on instead of hard-coding it.
clip_device = model.device
num_batches = len(general_dataloader)

for batch_idx, (image_paths, texts) in enumerate(general_dataloader):
    # The default collate function hands the titles over as a tuple; the tokenizer gets a list.
    texts = list(texts)

    # Image preprocessing (padding is a tokenizer option and does not apply here).
    images_processed = processor(
        images=[_load_rgb_image(p) for p in image_paths],
        return_tensors="pt",
    ).to(clip_device)

    # Pad titles to the longest one in the batch. truncation=True cuts titles that
    # exceed the tokenizer's maximum length, which would otherwise overflow the
    # text encoder's position embeddings and abort the whole run.
    texts_processed = processor(
        text=texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(clip_device)

    with torch.no_grad():
        # Move each batch of embeddings to the CPU in one transfer rather than row by row.
        image_features = model.get_image_features(**images_processed).cpu()
        text_features = model.get_text_features(**texts_processed).cpu()

    # Store per-sample entries; clone() gives every record its own small tensor
    # instead of a view that keeps the whole batch alive.
    for i, path in enumerate(image_paths):
        data.append({
            "image_path": path,
            "text": texts[i],
            "image_embedding": image_features[i].clone(),
            "text_embedding": text_features[i].clone(),
        })

    # 1-based counter so the last line reads N/N.
    print(f"Processed batch {batch_idx + 1}/{num_batches}")







Processed batch 0/56
Processed batch 1/56
Processed batch 2/56
Processed batch 3/56
Processed batch 4/56
Processed batch 5/56
Processed batch 6/56
Processed batch 7/56
Processed batch 8/56
Processed batch 9/56
Processed batch 10/56
Processed batch 11/56
Processed batch 12/56
Processed batch 13/56
Processed batch 14/56
Processed batch 15/56
Processed batch 16/56
Processed batch 17/56
Processed batch 18/56
Processed batch 19/56
Processed batch 20/56
Processed batch 21/56
Processed batch 22/56
Processed batch 23/56
Processed batch 24/56
Processed batch 25/56
Processed batch 26/56
Processed batch 27/56
Processed batch 28/56
Processed batch 29/56
Processed batch 30/56
Processed batch 31/56
Processed batch 32/56
Processed batch 33/56
Processed batch 34/56
Processed batch 35/56
Processed batch 36/56
Processed batch 37/56
Processed batch 38/56
Processed batch 39/56
Processed batch 40/56
Processed batch 41/56
Processed batch 42/56
Processed batch 43/56
Processed batch 44/56
Processed batch 45/5

In [17]:
# Persist the list of per-sample records (image path, text, CLIP image/text embeddings) to Drive.
torch.save(data, "/content/drive/MyDrive/tinyvlm/data/image_paths_with_clip_embeddings_large_ft.pt")

##### RELOAD DATA

In [18]:
class ImageTitleWithClipDataset(Dataset):
    """Map-style dataset over records that carry precomputed CLIP embeddings.

    Every record is a dict with the keys ``image_path``, ``text``,
    ``image_embedding`` and ``text_embedding``.
    """

    def __init__(self, data):
        """Keep a reference to `data`, the list of record dicts."""
        self.data = data

    def __len__(self):
        """Number of stored records."""
        return len(self.data)

    def __getitem__(self, index):
        """Return (image_path, text, image_embedding, text_embedding) for `index`.

        Only the image path is returned; the image file itself is not opened here.
        """
        record = self.data[index]
        field_order = ("image_path", "text", "image_embedding", "text_embedding")
        return tuple(record[field] for field in field_order)

In [19]:
# Read the saved records (image path, text, CLIP embeddings) back from Drive.
# NOTE(review): torch.load unpickles the file, so only point it at files you produced yourself.
embeddings_file = "/content/drive/MyDrive/tinyvlm/data/image_paths_with_clip_embeddings_large_ft.pt"
loaded_data = torch.load(embeddings_file)

# From here on `dataset` refers to the embedding-carrying dataset.
dataset = ImageTitleWithClipDataset(loaded_data)

In [24]:
# Sanity check: shape of the CLIP image embedding (tuple position 2) of one sample.
dataset[42][2].shape

torch.Size([768])

##### SAVE INDICES

In [20]:
# 90/10 train/validation split; the seeded generator makes the split reproducible.
num_samples = len(dataset)
train_size = int(0.9 * num_samples)
val_size = num_samples - train_size

generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=generator,
)

# Store only the index lists so the same split can be rebuilt later.
split_indices = {
    "train_indices": train_dataset.indices,
    "val_indices": val_dataset.indices,
}
torch.save(split_indices, "/content/drive/MyDrive/tinyvlm/data/train_val_indices_large_ft.pt")