# 0. Overview
Author: Darrin O'Brien, email: darrinobrien5@gmail.com

**Note: Not needed. Reference only if you have time.**
1. Fine-tunes CLIP ViT-B/32 on the Stanford Cars dataset (https://docs.pytorch.org/vision/main/generated/torchvision.datasets.StanfordCars.html). Original paper: https://www.cv-foundation.org/openaccess/content_iccv_workshops_2013/W19/html/Krause_3D_Object_Representations_2013_ICCV_paper.html
2. Evaluates the performance of the fine-tuned model.

## 1. Quick Installs for Essential Libraries

In [None]:
!pip install torch torchvision
!pip install fifty regex tqdm
!pip install git+https://github.com/openai/CLIP.git
!pip install pandas scipy
!pip install -U scikit-learn

### 1. Runpod Only Installs

In [None]:
# Reinstall scipy from scratch, bypassing pip's download cache.
!pip install --force-reinstall --no-cache-dir scipy # Only needed within runpod environment

## 2. Importing Libraries

In [None]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Subset
import clip
from tqdm import tqdm
from torchvision.datasets import StanfordCars
from sklearn.model_selection import train_test_split

## 3. Setting up Device and Model

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load ViT-B/32 together with its image preprocessing pipeline
# (https://github.com/openai/CLIP), then cast the weights for fp-32 precision.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
clip_model = clip_model.float()

## 4. Preparing Dataset

In [4]:
# Alternative data source: only needed when pulling the dataset from Kaggle
# rather than torchvision. The code is kept disabled inside a string literal.
# Caveat: the Kaggle copy has no test labels, so learning cannot be verified
# unless part of the training data is set aside (presumably as a held-out split).
'''
!pip install kagglehub
import os
import json
import kagglehub
from scipy.io import loadmat
from sklearn.model_selection import train_test_split
import pandas as pd

path = "../../kaggle.json" # Locally
# path = "kaggle.json"
with open(path, "r") as f:
    kaggle_creds = json.load(f)

# Set environment variables
os.environ["KAGGLE_USERNAME"] = kaggle_creds["username"]
os.environ["KAGGLE_KEY"] = kaggle_creds["key"]

path = kagglehub.dataset_download("eduardo4jesus/stanford-cars-dataset")

print("Path to dataset files:", path) # Then move this into the directory

load_train, load_test = loadmat(f"stanford-cars-dataset/car_devkit/devkit/cars_train_annos.mat"), loadmat(f"stanford-cars-dataset/car_devkit/devkit/cars_test_annos.mat")
annot_train, annot_test = load_train["annotations"][0], load_test["annotations"][0]

all_train = pd.DataFrame([
    {
        'fname': str(i['fname'][0]), # File Name
        'class': int(i["class"][0][0]) # ID of Class
    }
    for i in annot_train
])

test = pd.DataFrame([
    {
        'fname': str(i['fname'][0]), # File Name
        'class': int(i["class"][0][0]) # ID of Class
    }
    for i in annot_test
])

# train, val = train_test_split(all_train, test_size=0.2)
'''

'\n!pip install kagglehub\nimport os\nimport json\nimport kagglehub\nfrom scipy.io import loadmat\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\n\npath = "../../kaggle.json" # Locally\n# path = "kaggle.json"\nwith open(path, "r") as f:\n    kaggle_creds = json.load(f)\n\n# Set environment variables\nos.environ["KAGGLE_USERNAME"] = kaggle_creds["username"]\nos.environ["KAGGLE_KEY"] = kaggle_creds["key"]\n\npath = kagglehub.dataset_download("eduardo4jesus/stanford-cars-dataset")\n\nprint("Path to dataset files:", path) # Then move this into the directory\n\nload_train, load_test = loadmat(f"stanford-cars-dataset/car_devkit/devkit/cars_train_annos.mat"), loadmat(f"stanford-cars-dataset/car_devkit/devkit/cars_test_annos.mat")\nannot_train, annot_test = load_train["annotations"][0], load_test["annotations"][0]\n\nall_train = pd.DataFrame([\n    {\n        \'fname\': str(i[\'fname\'][0]), # File Name\n        \'class\': int(i["class"][0][0]) # ID of Class\n    }\

In [None]:
# torchvision can no longer download Stanford Cars (the original URL is gone),
# and recent releases raise as soon as `download=True` is passed. The files
# therefore have to be placed under `root` manually beforehand; with
# download=False the constructor just checks that they exist.
full_train = StanfordCars(root="./data", split="train", download=False)
test = StanfordCars(root="./data", split="test", download=False)

# Class index of every training sample, used to stratify the split below.
# NOTE(review): each lookup goes through the dataset's __getitem__, which
# presumably also loads the image, so this pass can be slow.
labels = [full_train[i][1] for i in range(len(full_train))]

# Stratified 80/20 train/validation split over sample indices; the fixed
# random_state keeps the split reproducible between runs.
train_indices, val_indices = train_test_split(
    list(range(len(full_train))),
    test_size=0.2,
    stratify=labels,
    random_state=66,
)

train = Subset(full_train, train_indices)
val = Subset(full_train, val_indices)

def clip_collate_fn(batch):
    """Collate (image, label) samples into one CPU batch for CLIP.

    Each image is passed through CLIP's `preprocess` transform and the results
    are stacked into a single tensor.

    The tensors are deliberately left on the CPU: with `num_workers > 0` this
    function runs inside DataLoader worker processes, and touching CUDA there
    fails when the workers are forked from a process that has already
    initialised CUDA. The transfer to `device` happens in the main process
    (see `_DeviceDataLoader`).

    Args:
        batch: Iterable of `(image, label)` pairs as yielded by the dataset.

    Returns:
        dict with "pixel_values" (stacked preprocessed images) and "labels"
        (int64 tensor of class indices).
    """
    images = torch.stack([preprocess(img) for img, _ in batch])
    labels = torch.tensor([label for _, label in batch], dtype=torch.long)

    return {
        "pixel_values": images,
        "labels": labels,
    }


class _DeviceDataLoader(DataLoader):
    """DataLoader that moves every collated batch to a device in the main process.

    Behaves like a regular DataLoader (`len`, attributes, iteration), except
    that each yielded batch dict has its tensors transferred to `device`.
    """

    def __init__(self, *args, device, **kwargs):
        super().__init__(*args, **kwargs)
        self._batch_device = device

    def __iter__(self):
        for batch in super().__iter__():
            yield {key: tensor.to(self._batch_device) for key, tensor in batch.items()}


# Consumers still receive batches whose tensors already live on `device`.
train_loader = _DeviceDataLoader(train, batch_size=64, shuffle=True, num_workers=4, collate_fn=clip_collate_fn, device=device)
val_loader = _DeviceDataLoader(val, batch_size=64, shuffle=False, num_workers=4, collate_fn=clip_collate_fn, device=device)
test_loader = _DeviceDataLoader(test, batch_size=64, shuffle=False, num_workers=4, collate_fn=clip_collate_fn, device=device)

## 5. Fine-Tune Prep

In [None]:
class CLIPClassifier(nn.Module):
    """CLIP image encoder followed by a single linear classification layer.

    Args:
        clip_model: Object exposing `encode_image(images)` and
            `visual.output_dim` (the width of the image embedding).
        num_classes: Number of logits produced by the linear head.
    """

    def __init__(self, clip_model, num_classes=196):
        super().__init__()
        self.clip = clip_model
        # The head's input width is read from the visual encoder.
        embed_dim = clip_model.visual.output_dim
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, images):
        """Encode `images` with CLIP and return the linear head's logits."""
        return self.classifier(self.clip.encode_image(images))

# Wrap the CLIP backbone with the linear head, place the whole model on the
# selected device, and cast its parameters to fp32.
model = CLIPClassifier(clip_model=clip_model).to(device).float()

In [None]:
EPOCHS = 15  # For fp-32

# On CPU, cast the model's parameters to fp32 before training.
if device == "cpu":
    model = model.float()

criterion = nn.CrossEntropyLoss()

# Adam over every parameter of the wrapper (CLIP backbone and linear head).
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)

# Cosine annealing whose T_max (counted in scheduler.step() calls) equals the
# total number of training batches over all epochs.
scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer=optimizer,
    T_max=EPOCHS * len(train_loader),
)

## 6. Fine-Tuning CLIP on Stanford Cars Dataset

In [None]:
# Fine-tune for EPOCHS epochs. After each epoch the model is evaluated on the
# validation loader, and the weights with the lowest validation loss so far
# are written to "best_clip_cars.pt".
best_val_loss = float('inf')
best_epoch = -1  # 0-based index of the best epoch so far; -1 means "none yet"

for epoch in range(EPOCHS):
    # Show the best epoch 1-based, consistent with the running epoch counter.
    best_epoch_label = best_epoch + 1 if best_epoch >= 0 else "n/a"
    print(f"Epoch {epoch+1}/{EPOCHS} - Best Val Loss: {best_val_loss:.4f} (Epoch {best_epoch_label})")

    # ---- Training ----
    model.train()
    total_train_loss = 0
    train_steps = 0

    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()

        images = batch["pixel_values"]
        labels = batch["labels"]

        logits = model(images)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()
        # The scheduler was built with T_max = len(train_loader) * EPOCHS,
        # i.e. a period measured in batches, so it must advance once per
        # optimizer step. Stepping once per epoch would leave the learning
        # rate almost unchanged for the whole run.
        scheduler.step()

        total_train_loss += loss.item()
        train_steps += 1

    avg_train_loss = total_train_loss / train_steps

    # ---- Validation ----
    model.eval()
    correct = 0
    total = 0
    total_val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            images = batch["pixel_values"]
            labels = batch["labels"]

            logits = model(images)
            loss = criterion(logits, labels)

            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            total_val_loss += loss.item()
            val_steps += 1

    # Loss is averaged over batches, accuracy over individual samples.
    avg_val_loss = total_val_loss / val_steps
    val_acc = correct / total

    print(f"[Epoch {epoch+1}] Train Loss: {avg_train_loss:.4f} | Validation Loss: {avg_val_loss:.4f} | Validation Accuracy: {val_acc:.4f}")

    # Keep only the checkpoint with the lowest validation loss.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), "best_clip_cars.pt")

## 7. Testing Fine-Tuned Model

In [None]:
# Compare two models on the test loader: a baseline built from a fresh CLIP
# checkpoint, and the fine-tuned weights restored from "best_clip_cars.pt".
# NOTE: the baseline's linear head is newly created here and never trained,
# so its numbers reflect an untrained classifier on top of CLIP features.
base_CLIP, _ = clip.load("ViT-B/32", device=device)
base_CLIP = base_CLIP.float()  # fp-32
model = CLIPClassifier(clip_model=base_CLIP).to(device)
model.eval()

best_CLIP, _ = clip.load("ViT-B/32", device=device)
best_CLIP = best_CLIP.float()  # fp-32
best_CLIP_Cars = CLIPClassifier(clip_model=best_CLIP).to(device)
# map_location tells torch.load where to place the stored weights in memory.
best_CLIP_Cars.load_state_dict(torch.load("best_clip_cars.pt", map_location=device))
best_CLIP_Cars.eval()

# Running sums: per-batch losses, batch counts, correct predictions, samples.
total_test_loss_base = total_test_loss_best = 0
total_base = total_best = 0
correct_base = correct_best = 0
total_samples = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        images, labels = batch["pixel_values"], batch["labels"]
        total_samples += labels.size(0)

        # Forward both models on the same batch.
        logits_base = model(images)
        logits_best = best_CLIP_Cars(images)

        # Loss bookkeeping (one entry per batch).
        loss_base = criterion(logits_base, labels)
        loss_best = criterion(logits_best, labels)
        total_test_loss_base += loss_base.item()
        total_test_loss_best += loss_best.item()
        total_base += 1
        total_best += 1

        # Classification accuracy bookkeeping (one entry per sample).
        pred_base = torch.argmax(logits_base, dim=1)
        pred_best = torch.argmax(logits_best, dim=1)
        correct_base += (pred_base == labels).sum().item()
        correct_best += (pred_best == labels).sum().item()

# Loss is averaged over batches; accuracy over samples.
avg_base_loss = total_test_loss_base / total_base
avg_best_loss = total_test_loss_best / total_best
accuracy_base = correct_base / total_samples
accuracy_best = correct_best / total_samples

print(f"\nAverage base loss: {avg_base_loss:.4f}, Base Accuracy: {accuracy_base:.4f}")
print(f"Average best loss: {avg_best_loss:.4f}, Best Accuracy: {accuracy_best:.4f}")