In [1]:
import torch
import pandas as pd
from datasets import load_dataset 

dataset = load_dataset("ybelkada/football-dataset", split="train")

In [3]:
from torch.utils.data import Dataset, DataLoader

class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.processor(images=item["image"], text=item["text"], padding="max_length", max_length=75, return_tensors="pt")
        # remove batch dimension
        encoding = {k:v.squeeze() for k,v in encoding.items()}
        return encoding

In [4]:
from transformers import AutoProcessor, BlipForConditionalGeneration, Blip2ForConditionalGeneration

# processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", device_map={"": 0}).cuda()
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", device_map={"": 0}).cuda()
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", device_map={"": 0}, torch_dtype=torch.float16).cuda()
# model2 = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", device_map={"": 0}).cuda()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
print(f'Memory Allocated after instantiating model: {torch.cuda.memory_allocated(0)/1e9:.4g} GB')

Memory Allocated after instantiating model: 22.66 GB


In [9]:
train_dataset = ImageCaptioningDataset(dataset, processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1)

In [10]:
import torch

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=10e-4)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

model.train()

for epoch in range(50):
  print("Epoch:", epoch)
  for idx, batch in enumerate(train_dataloader):
    input_ids = batch.pop("input_ids").to(device)
    pixel_values = batch.pop("pixel_values").to(device)

    outputs = model(input_ids=input_ids,
                    pixel_values=pixel_values,
                    labels=input_ids)    
    loss = outputs.loss
    loss.backward()

    print("Loss:", loss.item())

    # ADDRESS GRADIENTS EXPLOSION  
    # scale_factor = 0.001
    # for param in model.parameters():
    #     param.grad *= scale_factor

    # Clamp Gradients
    # for param in model.parameters():
    #     param.grad = torch.clamp(param.grad, min=.001, max=.99)
        
    # clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()
    optimizer.zero_grad()

Epoch: 0
Loss: 0.0251312255859375
Loss: 0.08270263671875
Loss: 0.05401611328125
Loss: 0.045013427734375
Loss: 0.0472412109375
Loss: 0.0477294921875
Epoch: 1
Loss: 0.022186279296875
Loss: 0.034271240234375
Loss: 0.046417236328125
Loss: 0.039825439453125
Loss: 0.04345703125
Loss: 0.0352783203125
Epoch: 2
Loss: 0.0207672119140625
Loss: 0.028076171875
Loss: 0.03887939453125
Loss: 0.037933349609375
Loss: 0.031829833984375
Loss: 0.047576904296875
Epoch: 3
Loss: 0.0462646484375
Loss: 0.03204345703125
Loss: 0.0261383056640625
Loss: 0.03436279296875
Loss: 0.044769287109375
Loss: 0.028656005859375
Epoch: 4
Loss: 0.0228118896484375
Loss: 0.0272674560546875
Loss: 0.039031982421875
Loss: 0.0362548828125


KeyboardInterrupt: 

In [None]:
my_csv = '../datasets/full_ds.csv'
dataset = load_dataset("csv", data_files=my_csv)

# df = pd.read_csv('../datasets/full_ds.csv')
# df.head()
dataset

# PLOTTING

In [None]:
import matplotlib.pyplot as plt
import torch

def plot_weights_histogram(model, layer_names=None):
    for name, param in model.named_parameters():
        # if layer_names is None or any(layer_name in name for layer_name in layer_names):
        plt.figure(figsize=(8, 6))
        plt.hist(param.data.flatten().cpu().numpy(), bins=50, color='blue', alpha=0.7)
        plt.title(f'Weight Distribution - {name}')
        plt.xlabel('Weight Value')
        plt.ylabel('Frequency')
        plt.show()

plot_weights_histogram(model, layer_names=['fc', 'conv'])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import torch

def plot_weights_outliers(model, layer_names=None, threshold=3):
    outlier_data = []

    for name, param in model.named_parameters():
        if layer_names is None or any(layer_name in name for layer_name in layer_names):
            # Flatten and convert to numpy array
            weights = param.data.flatten().cpu().numpy()

            # Detect outliers using IQR method
            q25, q75 = torch.percentile(param.data, [25, 75])
            iqr = q75 - q25
            lower_bound = q25 - threshold * iqr
            upper_bound = q75 + threshold * iqr

            # Identify outliers
            outliers = weights[(weights < lower_bound) | (weights > upper_bound)]

            # Store layer information and outliers
            layer_info = {'Layer': name, 'Outliers': outliers}
            outlier_data.append(layer_info)

    # Plot outliers using box plots
    plt.figure(figsize=(12, 8))
    sns.boxplot(x='Layer', y='Outliers', data=outlier_data)
    plt.title('Outliers in Weights Across Layers')
    plt.xticks(rotation=90)
    plt.show()

# Example usage:
# Assuming 'your_model' is an instance of your neural network
plot_weights_outliers(your_model, layer_names=['fc', 'conv'], threshold=3)