In [None]:
# ─── Cell 1: Imports & Globals ───────────────────────────────────────────────

!pip install librosa einops

import os, sys
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import librosa
import matplotlib.pyplot as plt
from torchvision import transforms
from sklearn.metrics import accuracy_score, classification_report

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Paths (adjust if needed)
DATA_DIR   = '/kaggle/input/noloudnormser/'
AUDIO_DIR  = os.path.join(DATA_DIR, 'audio_files')
LABEL_CSV  = os.path.join(DATA_DIR, 'validation_labels.csv')
CKPT_PATH  = '/kaggle/input/student_94.07_crema_d.ckpt/pytorch/default/1/student_94.07_CREMA_D.ckpt'  # or '/kaggle/input/.../student_94.07_CREMA_D.ckpt'
SPEC_DIR   = '/kaggle/working/spec_images'
os.makedirs(SPEC_DIR, exist_ok=True)


In [28]:
# ─── Cell 2: Load & Filter Labels ────────────────────────────────────────────

df = pd.read_csv(LABEL_CSV)
# keep only the six target emotions
df = df[df['emotion_label'] != 'surprise'].reset_index(drop=True)

# Quick sanity
print(f"Total samples: {len(df)}")
print(df['emotion_label'].value_counts())


Total samples: 180
emotion_label
angry      30
disgust    30
fear       30
happy      30
neutral    30
sad        30
Name: count, dtype: int64


In [29]:
# ─── New Cell 3: “As‐written” Log‑Mel per paper (magnitude→20·log₁₀, no clip/scale, center‐crop) ────────────

spec_tensors = []
labels       = []

for _, row in df.iterrows():
    # 1. Load & pad audio to 4 s
    y, _ = librosa.load(os.path.join(AUDIO_DIR, row['filename']), sr=16000)
    if len(y) < 4*16000:
        y = np.pad(y, (0, 4*16000 - len(y)), mode='constant')
    else:
        y = y[:4*16000]

    # 2. STFT
    S = librosa.stft(y,
                     n_fft=1024,
                     hop_length=64,
                     win_length=512,
                     window='hamming')

    # 3. Mel filter on magnitude (power=1.0)
    M = librosa.feature.melspectrogram(
            S=np.abs(S),
            sr=16000,
            n_mels=128,
            power=1.0
        )

    # 4. Convert to dB via amplitude_to_db (20·log10)
    logM = librosa.amplitude_to_db(M, ref=1.0)

    # 5. Center‑crop or pad time‐axis to exactly 128 frames
    T = logM.shape[1]
    if T >= 128:
        start = (T - 128) // 2
        logM = logM[:, start : start + 128]
    else:
        pad_left  = (128 - T) // 2
        pad_right = 128 - T - pad_left
        logM = np.pad(logM, ((0,0),(pad_left, pad_right)), mode='constant')

    # 6. Store raw dB values (no clipping, no extra scaling)
    spec_tensors.append(logM.astype(np.float32))
    labels.append(int(row['emotion_id']))

# Sanity check
print(f"Built {len(spec_tensors)} examples, each shape {spec_tensors[0].shape}")


Loaded 180 spectrograms, each of shape (128, 128)


In [30]:
# ─── Updated Cell 4: Dataset & DataLoader (raw tensors) ───────────────────

class RawSpecDataset(Dataset):
    def __init__(self, specs, targets):
        self.specs   = specs
        self.targets = targets
    def __len__(self):
        return len(self.specs)
    def __getitem__(self, idx):
        # specs[idx] is a 128×128 numpy array in [-1,1]
        img = torch.from_numpy(self.specs[idx]).unsqueeze(0)  # shape (1,128,128)
        label    = self.targets[idx]
        filename = df.loc[idx, 'filename']
        return {'img': img, 'label': label, 'filename': filename}

dataset = RawSpecDataset(spec_tensors, labels)
loader  = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=2)

# Quick check
batch = next(iter(loader))
print(batch['img'].shape, batch['img'].min().item(), batch['img'].max().item())


torch.Size([16, 1, 128, 128]) -0.9011074304580688 1.0


In [31]:
# Took from vit-pytorch github repository
# https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/vit.py
# Some modification exist

import math
import torch
from torch import nn

from einops import rearrange, repeat
from einops.layers.torch import Rearrange

import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from torch.utils.data import Dataset

# helpers

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

# classes

class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head *  heads
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.norm = nn.LayerNorm(dim)

        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        x = self.norm(x)

        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

        attn = self.attend(dots)
        attn = self.dropout(attn)

        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)

class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout),
                FeedForward(dim, mlp_dim, dropout = dropout)
            ]))

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x

        return self.norm(x)

# CreateCoords took from CoordConv github repository
# https://github.com/walsvid/CoordConv
# Some modifications exist

def CreateCoords(max_bs=32, x_dim=64, y_dim=64, with_r=False, skiptile=False):
    """Add coords to a tensor"""
    # self.x_dim = x_dim
    # self.y_dim = y_dim
    # self.with_r = with_r
    # self.skiptile = skiptile

    batch_size_tensor = max_bs  # Get batch size
                                # If you want larger batch, change max_bs

    xx_ones = torch.ones([1, x_dim], dtype=torch.int32)
    xx_ones = xx_ones.unsqueeze(-1)

    xx_range = torch.arange(y_dim, dtype=torch.int32).unsqueeze(0)
    xx_range = xx_range.unsqueeze(1)

    xx_channel = torch.matmul(xx_ones, xx_range)
    xx_channel = xx_channel.unsqueeze(-1)

    yy_ones = torch.ones([1, y_dim], dtype=torch.int32)
    yy_ones = yy_ones.unsqueeze(1)

    yy_range = torch.arange(x_dim, dtype=torch.int32).unsqueeze(0)
    yy_range = yy_range.unsqueeze(-1)

    yy_channel = torch.matmul(yy_range, yy_ones)
    yy_channel = yy_channel.unsqueeze(-1)

    xx_channel = xx_channel.permute(0, 3, 2, 1)
    yy_channel = yy_channel.permute(0, 3, 2, 1)

    xx_channel = xx_channel.float() / (x_dim - 1)
    yy_channel = yy_channel.float() / (y_dim - 1)

    xx_channel = xx_channel * 2 - 1
    yy_channel = yy_channel * 2 - 1

    coords = torch.cat([xx_channel, yy_channel], dim=1)
    coords = coords.repeat(batch_size_tensor, 1, 1, 1)

    return coords.to('cuda')

def sinusoidal_pe(d_model, length):
    """
    :param d_model: dimension of the model
    :param length: length of positions
    :return: length*d_model position matrix
    """
    if d_model % 2 != 0:
        raise ValueError("Cannot use sin/cos positional encoding with "
                         "odd dim (got dim={:d})".format(d_model))
    pe = torch.zeros(length, d_model)
    position = torch.arange(0, length).unsqueeze(1)
    div_term = torch.exp((torch.arange(0, d_model, 2, dtype=torch.float) *
                         -(math.log(10000.0) / d_model)))
    pe[:, 0::2] = torch.sin(position.float() * div_term)
    pe[:, 1::2] = torch.cos(position.float() * div_term)

    return pe.to('cuda')

class CustomDataset(Dataset):
    def __init__(self, img_list, trg_list):
        self.img = img_list
        self.trg = trg_list
        self.transforms = transforms.ToTensor()

    def __len__(self):
        return len(self.img)
    
    def __getitem__(self, idx):
        img = plt.imread(self.img[idx])[:,:,:1]
        img = self.transforms(img)
        trg = self.trg[idx]
        return {"img": img, "trg": trg}

class Teacher(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0., max_bs = 32):
        super().__init__()
        image_height, image_width = image_size
        patch_height, patch_width = patch_size

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        self.conv_stem = nn.Sequential(
            nn.Conv2d(channels, 16, 3, 1, 1),
            nn.InstanceNorm2d(16),
            nn.GELU(),
            nn.Conv2d(16, 32, 3, 1, 1),
            nn.InstanceNorm2d(32),
            nn.GELU(),
            nn.Conv2d(32, 64, 3, 1, 1),
            nn.InstanceNorm2d(64),
            nn.GELU(),
            nn.Conv2d(64, 32, 3, 1, 1),
            nn.InstanceNorm2d(32),
            nn.GELU(),
            nn.Conv2d(32, 16, 3, 1, 1),
            nn.InstanceNorm2d(16),
            nn.GELU(),
            nn.Conv2d(16, 1, 3, 1, 1),
            nn.InstanceNorm2d(1),
            nn.GELU(),
        )

        self.coords = CreateCoords(max_bs=max_bs, x_dim=image_width, y_dim=image_height)

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = (channels+2) * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool

        self.mlp_head = nn.Linear(dim, num_classes)

    def encoder(self, img):
        x = self.conv_stem(img)
        # x = img
        x = torch.cat((x,self.coords[:x.size(0)]), dim=1)
        x = self.to_patch_embedding(x)
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)
        x = self.dropout(x)

        x = self.transformer(x)

        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]
        
        return x
    
    def decoder(self, x):
        y = x
        x = self.mlp_head(x)

        return x, y

    def forward(self, img):
        x = self.encoder(img)
        x, y = self.decoder(x)
        return x, y

class Student(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0., max_bs = 32):
        super().__init__()
        image_height, image_width = image_size
        patch_height, patch_width = patch_size

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = (channels+0) * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool

        self.mlp_head = nn.Linear(dim, num_classes)

    def encoder(self, img):
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)
        x = self.dropout(x)

        x = self.transformer(x)

        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]
        
        return x
    
    def decoder(self, x):
        y = x
        x = self.mlp_head(x)

        return x, y

    def forward(self, img):
        x = self.encoder(img)
        x, y = self.decoder(x)
        return x, y


In [32]:
# ─── Cell 5: Instantiate Model & Load Checkpoint (UPDATED) ────────────

# Student class is already defined above in your pasted model.py
student = Student(
    image_size=(128,128),
    patch_size=(128,1),
    num_classes=6,    # angry,disgust,fear,happy,neutral,sad
    dim=256,
    depth=3,
    heads=5,
    mlp_dim=256,      # <-- MATCHES checkpoint’s hidden-dim
    channels=1,
    emb_dropout=0.,
    dropout=0.
).to(device)

# Load the CREMA-D student checkpoint
ckpt = torch.load(CKPT_PATH, map_location=device)
student.load_state_dict(ckpt['model_state_dict'])
student.eval()


Student(
  (to_patch_embedding): Sequential(
    (0): Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=128, p2=1)
    (1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.0, inplace=False)
  (transformer): Transformer(
    (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (layers): ModuleList(
      (0-2): 3 x ModuleList(
        (0): Attention(
          (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (attend): Softmax(dim=-1)
          (dropout): Dropout(p=0.0, inplace=False)
          (to_qkv): Linear(in_features=256, out_features=960, bias=False)
          (to_out): Sequential(
            (0): Linear(in_features=320, out_features=256, bias=True)
            (1): Dropout(p=0.0, inplace=False)
          )
        )
        (1): FeedForward(
          (net): Sequential(
           

In [33]:
# ─── Cell 6: Inference & Evaluation ─────────────────────────────────────────

all_files, all_true, all_pred = [], [], []

with torch.no_grad():
    for batch in loader:
        imgs = batch['img'].to(device)            # (B,1,128,128)
        logits, _ = student(imgs)                 # (B,6)
        preds = logits.argmax(dim=-1).cpu().numpy()
        all_files.extend(batch['filename'])
        all_true.extend(batch['label'])
        all_pred.extend(preds.tolist())

# Save predictions
results = pd.DataFrame({
    'filename': all_files,
    'true_id': all_true,
    'pred_id': all_pred
})
results.to_csv('/kaggle/working/predictions.csv', index=False)

# Metrics
print(f"Overall accuracy: {accuracy_score(all_true, all_pred):.4f}")
print(classification_report(
    all_true, all_pred,
    target_names=['angry','disgust','fear','happy','neutral','sad']
))


Overall accuracy: 0.1778
              precision    recall  f1-score   support

       angry       0.25      0.17      0.20        30
     disgust       0.15      0.27      0.19        30
        fear       0.11      0.07      0.08        30
       happy       0.25      0.07      0.11        30
     neutral       0.17      0.03      0.06        30
         sad       0.19      0.47      0.27        30

    accuracy                           0.18       180
   macro avg       0.19      0.18      0.15       180
weighted avg       0.19      0.18      0.15       180



In [34]:
import pandas as pd
from sklearn.metrics import confusion_matrix

# 1. Load the CSV
df = pd.read_csv('/kaggle/working/predictions.csv')

# 2. Robustly extract the integer from each cell (works for "tensor(3)" or "3")
df['true_id'] = df['true_id'].apply(lambda x: int(''.join(filter(str.isdigit, str(x)))))
df['pred_id'] = df['pred_id'].apply(lambda x: int(''.join(filter(str.isdigit, str(x)))))

# 3. Compute confusion matrix
labels = ['angry','disgust','fear','happy','neutral','sad']
cm = confusion_matrix(df['true_id'], df['pred_id'], labels=range(len(labels)))

# 4. Display
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("Confusion Matrix (rows=true, columns=predicted):\n")
print(cm_df)


Confusion Matrix (rows=true, columns=predicted):

         angry  disgust  fear  happy  neutral  sad
angry        5        7     5      0        0   13
disgust      3        8     3      2        2   12
fear         3       15     2      1        0    9
happy        2       10     4      2        2   10
neutral      4        5     2      2        1   16
sad          3        8     3      1        1   14
