# About this notebook
- PyTorch resnext50_32x4d starter code
- GroupKFold 4 folds

If this notebook is helpful, feel free to upvote :)

# Data Loading

In [2]:
import os

import pandas as pd

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/sample_submission.csv')

# Directory settings

In [4]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

#TRAIN_PATH = '../input/ranzcr-512x512-dataset'
TRAIN_PATH = 'data/train'


# CFG

In [5]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug=True
    print_freq=100
    num_workers=0
    model_name='resnext50_32x4d'
    size=400
    scheduler='CosineAnnealingLR' # ['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts']
    epochs=6
    #factor=0.2 # ReduceLROnPlateau
    #patience=4 # ReduceLROnPlateau
    #eps=1e-6 # ReduceLROnPlateau
    T_max=6 # CosineAnnealingLR
    #T_0=6 # CosineAnnealingWarmRestarts
    lr=1e-4
    min_lr=1e-6
    batch_size=4
    weight_decay=1e-6
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    target_size=11
    target_cols=['ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal',
                 'NGT - Abnormal', 'NGT - Borderline', 'NGT - Incompletely Imaged', 'NGT - Normal', 
                 'CVC - Abnormal', 'CVC - Borderline', 'CVC - Normal',
                 'Swan Ganz Catheter Present']
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    
if CFG.debug:
    CFG.epochs = 1
    train = train.sample(n=100, random_state=CFG.seed).reset_index(drop=True)

# Library

In [6]:
# ====================================================
# Library
# ====================================================
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')

import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

from tqdm import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

from albumentations import (
    Compose, OneOf, Normalize, Resize, RandomResizedCrop, RandomCrop, HorizontalFlip, VerticalFlip, 
    RandomBrightness, RandomContrast, RandomBrightnessContrast, Rotate, ShiftScaleRotate, Cutout, 
    IAAAdditiveGaussianNoise, Transpose
    )
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm

from torch.cuda.amp import autocast, GradScaler

import warnings 
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Utils

In [7]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    scores = []
    for i in range(y_true.shape[1]):
        try:
            score = roc_auc_score(y_true[:,i], y_pred[:,i])
            scores.append(score)
        except ValueError:
            print(i)
        
        
    avg_score = np.mean(scores)
    return avg_score, scores


def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_torch(seed=CFG.seed)

# CV split

In [8]:


folds = train.copy()
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = folds['PatientID'].values
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[CFG.target_cols], groups)):
    folds.loc[val_index, 'fold'] = int(n)
folds['fold'] = folds['fold'].astype(int)
display(folds.groupby('fold').size())

fold
0    25
1    25
2    25
3    25
dtype: int64

# Dataset

In [9]:
# ====================================================
# Dataset
# ====================================================
class TrainDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['StudyInstanceUID'].values
        self.labels = df[CFG.target_cols].values
        self.transform = transform
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_name = self.file_names[idx]
        file_path = f'{TRAIN_PATH}/{file_name}.jpg'
        image = cv2.imread(file_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        label = torch.tensor(self.labels[idx]).float()
        return image, label

# Transforms

In [10]:
# ====================================================
# Transforms
# ====================================================
def get_transforms(*, data):
    
    if data == 'train':
        return Compose([
            #Resize(CFG.size, CFG.size),
            RandomResizedCrop(CFG.size, CFG.size, scale=(0.85, 1.0)),
            HorizontalFlip(p=0.5),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])

    elif data == 'valid':
        return Compose([
            Resize(CFG.size, CFG.size),
            Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])

# MODEL

In [11]:
# ====================================================
# MODEL
# ====================================================
class CustomResNext(nn.Module):
    def __init__(self, model_name='resnext50_32x4d', pretrained=False):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)
        n_features = self.model.fc.in_features
        self.model.fc = nn.Linear(n_features, CFG.target_size)

    def forward(self, x):
        x = self.model(x)
        return x

# Helper functions

In [12]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    scaler = GradScaler()
    
    # switch to train mode
    model.train()
    
    progress_bar = tqdm(train_loader)
    
    losses = []
    
    for step, (images, labels) in enumerate(progress_bar):
        # measure data loading time
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with autocast():
            y_preds = model(images)
            loss = criterion(y_preds, labels)
        # record loss
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        
        
        losses.append(loss.item())
        smooth_loss = np.mean(losses[-30:])
        progress_bar.set_description(f'loss: {loss.item():.5f}, smth: {smooth_loss:.5f}')
        
    return np.mean(losses)


def valid_fn(valid_loader, model, criterion, device):
    # switch to evaluation mode
    model.eval()
    
    progress_bar = tqdm(valid_loader)
    
    preds = []
    losses = []
    
    for step, (images, labels) in enumerate(progress_bar):
        # measure data loading time
        images = images.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # compute loss
        with torch.no_grad():
            y_preds = model(images)
        loss = criterion(y_preds, labels)
        losses.append(loss.item())
        # record accuracy
        progress_bar.set_description(f'loss: {loss.item():.5f}')
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        
    predictions = np.concatenate(preds)
    return np.mean(losses), predictions

# Train loop

In [13]:
# ====================================================
# Train loop
# ====================================================
def train_loop(folds, fold):

    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(train_folds, 
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds, 
                                 transform=get_transforms(data='valid'))

    train_loader = DataLoader(train_dataset, 
                              batch_size=CFG.batch_size, 
                              shuffle=True, 
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, 
                              batch_size=CFG.batch_size * 2, 
                              shuffle=False, 
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    
    # ====================================================
    # scheduler 
    # ====================================================
    def get_scheduler(optimizer):
        if CFG.scheduler=='ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor, patience=CFG.patience, verbose=True, eps=CFG.eps)
        elif CFG.scheduler=='CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler=='CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomResNext(CFG.model_name, pretrained=True)
    model.to(device)

    optimizer = Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    best_score = 0.
    best_loss = np.inf
    
    for epoch in range(CFG.epochs):
        
        start_time = time.time()
        
        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score, scores = get_score(valid_labels, preds)
 
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            torch.save({'model': model.state_dict(), 
                        'preds': preds},
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
    
    check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best.pth')
    for c in [f'pred_{c}' for c in CFG.target_cols]:
        valid_folds[c] = np.nan
    valid_folds[[f'pred_{c}' for c in CFG.target_cols]] = check_point['preds']

    return valid_folds

In [14]:
# ====================================================
# main
# ====================================================
def main():

    """
    Prepare: 1.train  2.folds
    """

    def get_result(result_df):
        preds = result_df[[f'pred_{c}' for c in CFG.target_cols]].values
        labels = result_df[CFG.target_cols].values
        score, scores = get_score(labels, preds)
    
    if CFG.train:
        # train 
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(folds, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                get_result(_oof_df)
        # CV result
        get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)

In [14]:
if __name__ == '__main__':
    main()

loss: 0.34738, smth: 0.51163: 100%|██████████| 18/18 [00:39<00:00,  2.22s/it]
loss: 0.15793: 100%|██████████| 4/4 [00:02<00:00,  1.85it/s]


0
3
4
0
3
4


loss: 0.30217, smth: 0.50949: 100%|██████████| 18/18 [00:38<00:00,  2.15s/it]
loss: 0.39098: 100%|██████████| 4/4 [00:02<00:00,  1.90it/s]


0
3
4
0
3
4


loss: 0.65345, smth: 0.68210:  17%|█▋        | 3/18 [00:08<00:43,  2.88s/it]


KeyboardInterrupt: 

In [15]:
oof = pd.read_csv('oof_df.csv')

In [16]:
oof

Unnamed: 0,StudyInstanceUID,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,...,pred_ETT - Borderline,pred_ETT - Normal,pred_NGT - Abnormal,pred_NGT - Borderline,pred_NGT - Incompletely Imaged,pred_NGT - Normal,pred_CVC - Abnormal,pred_CVC - Borderline,pred_CVC - Normal,pred_Swan Ganz Catheter Present
0,1.2.826.0.1.3680043.8.498.12062735938247288574...,0,0,0,0,0,0,0,1,0,...,0.090071,0.351044,0.090335,0.085504,0.237534,0.160310,0.217420,0.209095,0.767686,0.087943
1,1.2.826.0.1.3680043.8.498.12261966456376427418...,0,0,0,0,0,0,0,0,0,...,0.047286,0.296513,0.037479,0.041369,0.184670,0.142708,0.150384,0.203658,0.779578,0.047851
2,1.2.826.0.1.3680043.8.498.41042511967315281006...,0,0,0,0,0,0,0,0,0,...,0.045523,0.219583,0.052283,0.054558,0.210149,0.126224,0.158763,0.234302,0.821327,0.052988
3,1.2.826.0.1.3680043.8.498.33395752605080511634...,0,0,0,0,0,0,1,1,1,...,0.049815,0.301751,0.063910,0.048379,0.194176,0.154946,0.140409,0.240994,0.805753,0.064665
4,1.2.826.0.1.3680043.8.498.12786707235059185574...,0,0,0,0,0,0,0,0,1,...,0.073287,0.250406,0.057670,0.062815,0.186268,0.139078,0.187163,0.241002,0.770120,0.071849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.2.826.0.1.3680043.8.498.30440955038019441887...,0,0,0,0,0,0,0,0,0,...,0.051844,0.099541,0.048328,0.032392,0.049238,0.062926,0.064201,0.158358,0.873588,0.048062
96,1.2.826.0.1.3680043.8.498.25293340653252895771...,0,1,0,0,0,0,1,0,0,...,0.053195,0.159721,0.032550,0.030838,0.041230,0.091780,0.062114,0.121545,0.892970,0.054452
97,1.2.826.0.1.3680043.8.498.90855111132163997581...,0,0,1,0,0,1,0,0,0,...,0.061409,0.139958,0.059453,0.046548,0.061353,0.107965,0.098531,0.160864,0.864314,0.064115
98,1.2.826.0.1.3680043.8.498.10764130667321392143...,0,0,0,0,0,0,0,1,0,...,0.067294,0.177097,0.053040,0.041260,0.047553,0.102829,0.083989,0.131890,0.912400,0.058846


In [17]:
preds = oof[[f'pred_{c}' for c in CFG.target_cols]].values
labels = oof[CFG.target_cols].values
score, scores = get_score(labels, preds)

0


In [18]:
score

0.5059441020890516

In [20]:
preds[:, 0]

array([0.087513  , 0.0515033 , 0.04688244, 0.05143243, 0.06017136,
       0.04742864, 0.052565  , 0.04839828, 0.04098251, 0.07034287,
       0.13008662, 0.06335313, 0.05159475, 0.08106221, 0.03875271,
       0.12160487, 0.07755381, 0.02878727, 0.03772036, 0.05654447,
       0.07541224, 0.03744994, 0.07439245, 0.03348478, 0.05966498,
       0.03043422, 0.06460388, 0.08215841, 0.02559223, 0.10560989,
       0.08013184, 0.04747726, 0.04712925, 0.04174506, 0.05693524,
       0.06785142, 0.10215937, 0.03164821, 0.04435433, 0.04983869,
       0.06027292, 0.04376763, 0.06310797, 0.04561419, 0.02328596,
       0.07098681, 0.08587869, 0.04366614, 0.05079442, 0.06034593,
       0.08360209, 0.08105859, 0.03483585, 0.02583926, 0.04693085,
       0.0717994 , 0.03570352, 0.0982999 , 0.11059007, 0.08348852,
       0.08237597, 0.09916498, 0.05116217, 0.02071713, 0.04143083,
       0.04408927, 0.10074294, 0.0587924 , 0.06357034, 0.12634082,
       0.08822627, 0.1071081 , 0.06153767, 0.11367757, 0.03521

In [23]:
labels[:, 2]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0], dtype=int64)