In [None]:
!pip install -q timm

import os
from pathlib import Path
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import timm
from torch.cuda.amp import autocast, GradScaler
from scipy.stats import pearsonr, spearmanr
from collections import defaultdict

# ─── 2. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# ─── 3. Paths & CSV Loading
DRIVE_ROOT = Path('/content/drive/MyDrive')
MODEL_DIR  = DRIVE_ROOT / 'models'
# parents=True so a missing intermediate directory does not abort the run.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Build the dataset location from relative segments. Joining a Path with an
# absolute string (one starting with '/') silently discards the left-hand
# side, so the previous `DRIVE_ROOT / '/content/...'` never used DRIVE_ROOT.
# The resulting paths are identical to the previous ones.
DATA_ROOT = DRIVE_ROOT / 'raw_work_donot_delete' / 'frames_dataset' / '224_Frames_dynamic'
MOS_DIR   = DATA_ROOT / 'MOS_Files'

train_csv = MOS_DIR / 'train_df_dynamic.csv'
val_csv   = MOS_DIR / 'val_df_dynamic.csv'

train_df = pd.read_csv(train_csv)
val_df   = pd.read_csv(val_csv)

# Print the column names so a schema mismatch is visible before training.
print('Train columns:', train_df.columns.tolist())
print('Val   columns:',   val_df.columns.tolist())

# ─── 4. Transforms & Dataset
IMG_SIZE    = 224
BATCH_SIZE  = 128
NUM_WORKERS = 8


def _tensor_tail():
    """Return fresh ToTensor + Normalize steps used at the end of both pipelines."""
    # NOTE(review): these are the commonly used ImageNet statistics; some timm
    # ViT checkpoints are configured for mean/std of 0.5 — confirm against the
    # pretrained config of the model being fine-tuned.
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225)),
    ]


# Training pipeline: fixed resize, then random crop/flip/colour augmentation.
train_tf = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ColorJitter(brightness=0.2, contrast=0.2,
                               saturation=0.2, hue=0.1),
    ]
    + _tensor_tail()
)

# Validation pipeline: deterministic resize only, no augmentation.
val_tf = transforms.Compose(
    [transforms.Resize((IMG_SIZE, IMG_SIZE))] + _tensor_tail()
)

class FrameDataset(Dataset):
    """Dataset of single video frames paired with a MOS target.

    Each row of ``df`` supplies a ``'Frame'`` value that is passed to
    ``Image.open``, a ``'MOS'`` value used as the regression target, and a
    ``'Video'`` value that is returned unchanged alongside each sample.

    Args:
        df: DataFrame containing at least the columns 'Video', 'Frame', 'MOS'.
        transform: Callable applied to the RGB PIL image of each frame.

    Raises:
        ValueError: If any required column is missing from ``df``.
    """

    REQUIRED_COLUMNS = ('Video', 'Frame', 'MOS')

    def __init__(self, df, transform):
        # Raise explicitly rather than `assert`: asserts are stripped under
        # `python -O`, which would let a malformed CSV through silently.
        missing = [col for col in self.REQUIRED_COLUMNS if col not in df.columns]
        if missing:
            raise ValueError(
                "CSV must contain 'Video', 'Frame', and 'MOS' columns "
                f"(missing: {missing})"
            )
        self.recs = df.to_dict('records')
        self.tf   = transform

    def __len__(self):
        """Return the number of frame records."""
        return len(self.recs)

    def __getitem__(self, idx):
        """Return ``(transformed_image, mos_tensor, video_id)`` for row ``idx``."""
        rec = self.recs[idx]
        # Close the underlying file as soon as the pixels are decoded;
        # convert() returns an independent image, so it outlives the handle.
        with Image.open(rec['Frame']) as fh:
            img = fh.convert('RGB')
        x = self.tf(img)
        y = torch.tensor(rec['MOS'], dtype=torch.float32)
        return x, y, rec['Video']

# Options common to both loaders; only shuffling differs between them.
_loader_opts = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

# Training frames use the augmenting pipeline and are reshuffled each epoch.
train_ds = FrameDataset(train_df, train_tf)
train_dl = DataLoader(train_ds, shuffle=True, **_loader_opts)

# Validation frames use the deterministic pipeline in a fixed order.
val_ds = FrameDataset(val_df, val_tf)
val_dl = DataLoader(val_ds, shuffle=False, **_loader_opts)

# ─── 5. Model & Optimizer Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained ViT whose classification head is replaced by a small MLP that
# outputs a single scalar (the predicted MOS).
model = timm.create_model('vit_base_patch16_224', pretrained=True)
in_feat = model.head.in_features
model.head = nn.Sequential(
    nn.LayerNorm(in_feat),
    nn.Linear(in_feat, 512),
    nn.GELU(),
    nn.Dropout(0.3),
    nn.Linear(512, 1)
)
model.to(device)

# Separate param‐groups for freeze → unfreeze.
# Group 0 is the new head; group 1 is everything else. The backbone starts
# with lr=0 and is given a non-zero lr later by the training loop.
head_params     = list(model.head.parameters())
backbone_params = [p for n,p in model.named_parameters() if not n.startswith('head.')]

optimizer = optim.AdamW([
    {'params': head_params,     'lr': 3e-5},
    {'params': backbone_params, 'lr': 0    }
], weight_decay=1e-2)

# NOTE(review): T_max=10 is shorter than the 20-epoch budget of the training
# loop, so the cosine schedule bottoms out after 10 scheduler steps and then
# starts rising again — confirm this is intended.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
# torch.cuda.amp.GradScaler is deprecated in favour of torch.amp.GradScaler;
# loss scaling is only meaningful on CUDA, so disable it explicitly on CPU.
scaler    = torch.amp.GradScaler('cuda', enabled=(device.type == 'cuda'))
criterion = nn.MSELoss()

# Checkpoint location. Built from relative segments: joining a Path with an
# absolute string discards the left-hand side, so the earlier
# `MODEL_DIR / '/content/...'` never actually used MODEL_DIR. The resulting
# path is unchanged.
MODEL_PATH = (
    DRIVE_ROOT / 'raw_work_donot_delete' / 'frames_dataset'
    / '224_Frames_dynamic' / 'best_vit_mos.pth'
)

# ─── 6. Video‐Level Evaluation
def evaluate_video_level(loader, net=None, dev=None):
    """Evaluate on ``loader`` and score predictions aggregated per video.

    Frame-level predictions and targets are averaged per video id, then the
    metrics are computed across videos.

    Args:
        loader: Iterable yielding ``(images, targets, video_ids)`` batches.
        net: Model to evaluate; defaults to the module-level ``model``.
        dev: Device to run on; defaults to the module-level ``device``.

    Returns:
        Tuple ``(rmse, r2, pearson, spearman)`` of Python floats. ``r2`` is
        NaN when the per-video targets have zero variance; the correlations
        are NaN when fewer than two videos are present.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    net = model if net is None else net
    dev = device if dev is None else dev

    net.eval()
    preds_by_vid, trues_by_vid = defaultdict(list), defaultdict(list)

    with torch.no_grad():
        for xb, yb, vids in loader:
            out = net(xb.to(dev)).squeeze(1).float().cpu().numpy()
            true = yb.cpu().numpy()
            for vid, p, t in zip(vids, out, true):
                # Numeric ids arrive as tensor elements, which hash by object
                # identity; convert so frames of one video share a key.
                key = vid.item() if torch.is_tensor(vid) else vid
                preds_by_vid[key].append(float(p))
                trues_by_vid[key].append(float(t))

    if not preds_by_vid:
        raise ValueError("evaluate_video_level: loader yielded no samples")

    # Iterate one key order for both arrays so entries stay aligned.
    video_ids = list(preds_by_vid)
    vp = np.array([np.mean(preds_by_vid[v]) for v in video_ids], dtype=np.float64)
    vt = np.array([np.mean(trues_by_vid[v]) for v in video_ids], dtype=np.float64)

    err    = vp - vt
    rmse   = float(np.sqrt(np.mean(err ** 2)))
    ss_res = float(np.sum(err ** 2))
    ss_tot = float(np.sum((vt - vt.mean()) ** 2))
    nan    = float('nan')

    # R² is undefined when all targets are equal (zero total variance).
    r2 = 1.0 - ss_res / ss_tot if ss_tot > 0 else nan

    # Correlation coefficients need at least two points.
    if len(video_ids) >= 2:
        pearson_corr  = float(pearsonr(vt, vp)[0])
        spearman_corr = float(spearmanr(vt, vp)[0])
    else:
        pearson_corr = spearman_corr = nan

    return rmse, r2, pearson_corr, spearman_corr

# ─── 7. Training Loop with Early Stopping
# best_r2: best video-level validation R² so far; wait: consecutive epochs
# without improvement; patience: how many such epochs are tolerated.
best_r2, wait, patience = -1e9, 0, 5
EPOCHS = 20

# Mixed precision is only enabled on CUDA; on CPU the context is a no-op.
use_amp = device.type == 'cuda'

# Make sure the checkpoint directory exists before the first save.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

for epoch in range(1, EPOCHS+1):
    model.train()
    running_loss = 0.0

    for xb, yb, _ in tqdm(train_dl, desc=f"Epoch {epoch}/{EPOCHS}"):
        # The loaders pin host memory, so the copies can overlap with compute.
        xb = xb.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)
        optimizer.zero_grad()
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
        with torch.autocast(device_type=device.type, enabled=use_amp):
            preds = model(xb).squeeze(1)
            loss  = criterion(preds, yb)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # Weight by batch size so the epoch average is per-sample.
        running_loss += loss.item() * xb.size(0)

    scheduler.step()
    # Square root of the mean per-sample MSE accumulated during training.
    train_rmse = np.sqrt(running_loss / len(train_ds))
    val_rmse, val_r2, val_p, val_s = evaluate_video_level(val_dl)

    print(
        f"\nEpoch {epoch:02d} | "
        f"Train RMSE: {train_rmse:.4f} | "
        f"Val RMSE:   {val_rmse:.4f} | "
        f"Val R²:     {val_r2:.4f} | "
        f"Pearson:    {val_p:.4f} | "
        f"Spearman:   {val_s:.4f}"
    )

    # Gradual unfreeze: from the next epoch on, the backbone group trains too.
    # NOTE(review): the scheduler was created while this group's lr was 0;
    # verify that later scheduler.step() calls keep the backbone lr at the
    # intended non-zero values.
    if epoch == 4:
        optimizer.param_groups[1]['lr'] = 3e-6

    # Early stopping on video‐level R² (improvement must exceed 1e-4).
    if val_r2 > best_r2 + 1e-4:
        best_r2, wait = val_r2, 0
        torch.save(model.state_dict(), str(MODEL_PATH))
        print(f"  ➤ New best R²={best_r2:.4f}; saved model to {MODEL_PATH}")
    else:
        wait += 1
        if wait >= patience:
            print(f"\nNo improvement in {patience} epochs. Stopping early.")
            break

print(f"\nTraining complete. Best Val R² = {best_r2:.4f}")
print(f"Best model path: {MODEL_PATH}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train columns: ['Video', 'Frame', 'MOS']
Val   columns: ['Video', 'Frame', 'MOS']


  scaler    = GradScaler()
  with autocast():
Epoch 1/20: 100%|██████████| 62/62 [02:36<00:00,  2.52s/it]



Epoch 01 | Train RMSE: 3.8242 | Val RMSE:   2.5730 | Val R²:     -5.2608 | Pearson:    0.0577 | Spearman:   0.0112
  ➤ New best R²=-5.2608; saved model to /content/drive/MyDrive/raw_work_donot_delete/frames_dataset/224_Frames_dynamic/best_vit_mos.pth


Epoch 2/20: 100%|██████████| 62/62 [00:18<00:00,  3.44it/s]



Epoch 02 | Train RMSE: 1.9502 | Val RMSE:   1.3409 | Val R²:     -0.7004 | Pearson:    0.1922 | Spearman:   0.1412
  ➤ New best R²=-0.7004; saved model to /content/drive/MyDrive/raw_work_donot_delete/frames_dataset/224_Frames_dynamic/best_vit_mos.pth


Epoch 3/20: 100%|██████████| 62/62 [00:21<00:00,  2.93it/s]



Epoch 03 | Train RMSE: 1.3247 | Val RMSE:   1.0309 | Val R²:     -0.0050 | Pearson:    0.4307 | Spearman:   0.3359
  ➤ New best R²=-0.0050; saved model to /content/drive/MyDrive/raw_work_donot_delete/frames_dataset/224_Frames_dynamic/best_vit_mos.pth


Epoch 4/20: 100%|██████████| 62/62 [00:21<00:00,  2.89it/s]



Epoch 04 | Train RMSE: 1.0922 | Val RMSE:   0.8734 | Val R²:     0.2787 | Pearson:    0.6040 | Spearman:   0.4295
  ➤ New best R²=0.2787; saved model to /content/drive/MyDrive/raw_work_donot_delete/frames_dataset/224_Frames_dynamic/best_vit_mos.pth


Epoch 5/20: 100%|██████████| 62/62 [00:21<00:00,  2.90it/s]



Epoch 05 | Train RMSE: 0.6291 | Val RMSE:   0.3201 | Val R²:     0.9031 | Pearson:    0.9574 | Spearman:   0.4983
  ➤ New best R²=0.9031; saved model to /content/drive/MyDrive/raw_work_donot_delete/frames_dataset/224_Frames_dynamic/best_vit_mos.pth


Epoch 6/20: 100%|██████████| 62/62 [00:25<00:00,  2.47it/s]



Epoch 06 | Train RMSE: 0.4403 | Val RMSE:   0.2737 | Val R²:     0.9291 | Pearson:    0.9727 | Spearman:   0.5122
  ➤ New best R²=0.9291; saved model to /content/drive/MyDrive/raw_work_donot_delete/frames_dataset/224_Frames_dynamic/best_vit_mos.pth


Epoch 7/20: 100%|██████████| 62/62 [00:24<00:00,  2.52it/s]



Epoch 07 | Train RMSE: 0.3966 | Val RMSE:   0.2410 | Val R²:     0.9451 | Pearson:    0.9778 | Spearman:   0.5108
  ➤ New best R²=0.9451; saved model to /content/drive/MyDrive/raw_work_donot_delete/frames_dataset/224_Frames_dynamic/best_vit_mos.pth


Epoch 8/20: 100%|██████████| 62/62 [00:24<00:00,  2.55it/s]



Epoch 08 | Train RMSE: 0.3841 | Val RMSE:   0.2350 | Val R²:     0.9478 | Pearson:    0.9800 | Spearman:   0.5155
  ➤ New best R²=0.9478; saved model to /content/drive/MyDrive/raw_work_donot_delete/frames_dataset/224_Frames_dynamic/best_vit_mos.pth


Epoch 9/20: 100%|██████████| 62/62 [00:25<00:00,  2.47it/s]



Epoch 09 | Train RMSE: 0.3671 | Val RMSE:   0.2175 | Val R²:     0.9553 | Pearson:    0.9814 | Spearman:   0.5157
  ➤ New best R²=0.9553; saved model to /content/drive/MyDrive/raw_work_donot_delete/frames_dataset/224_Frames_dynamic/best_vit_mos.pth


Epoch 10/20: 100%|██████████| 62/62 [00:25<00:00,  2.47it/s]



Epoch 10 | Train RMSE: 0.3647 | Val RMSE:   0.2168 | Val R²:     0.9556 | Pearson:    0.9816 | Spearman:   0.5165
  ➤ New best R²=0.9556; saved model to /content/drive/MyDrive/raw_work_donot_delete/frames_dataset/224_Frames_dynamic/best_vit_mos.pth


Epoch 11/20: 100%|██████████| 62/62 [00:25<00:00,  2.45it/s]



Epoch 11 | Train RMSE: 0.3723 | Val RMSE:   0.2168 | Val R²:     0.9556 | Pearson:    0.9816 | Spearman:   0.5165


Epoch 12/20: 100%|██████████| 62/62 [00:20<00:00,  3.02it/s]



Epoch 12 | Train RMSE: 0.3640 | Val RMSE:   0.2171 | Val R²:     0.9554 | Pearson:    0.9816 | Spearman:   0.5164


Epoch 13/20: 100%|██████████| 62/62 [00:17<00:00,  3.45it/s]



Epoch 13 | Train RMSE: 0.3676 | Val RMSE:   0.2205 | Val R²:     0.9540 | Pearson:    0.9817 | Spearman:   0.5168


Epoch 14/20: 100%|██████████| 62/62 [00:17<00:00,  3.47it/s]



Epoch 14 | Train RMSE: 0.3684 | Val RMSE:   0.2171 | Val R²:     0.9554 | Pearson:    0.9818 | Spearman:   0.5167


Epoch 15/20: 100%|██████████| 62/62 [00:17<00:00,  3.51it/s]



Epoch 15 | Train RMSE: 0.3658 | Val RMSE:   0.2189 | Val R²:     0.9547 | Pearson:    0.9819 | Spearman:   0.5164

No improvement in 5 epochs. Stopping early.

Training complete. Best Val R² = 0.9556
Best model path: /content/drive/MyDrive/raw_work_donot_delete/frames_dataset/224_Frames_dynamic/best_vit_mos.pth
