In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import os
import copy
import pickle
import shutil
import torch
import torch.optim as optim
import torchvision
from distutils.dir_util import copy_tree
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from torchvision import datasets, models, transforms

In [2]:
data_dir = 'C:/Users/joey3/Desktop/Apnea'
raw_dir = 'C:/Users/joey3/Desktop/Apnea_Raw'
train_df = pd.read_csv('../resources/File_train.csv')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=5, batch_size=8):
    # Preparation work
    data_transforms = {
        'train': transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ]),
        'val': transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ]),
    }

    image_datasets = {
        x: datasets.ImageFolder(
            os.path.join(data_dir, x), 
            transform=data_transforms[x],
        ) 
        for x in ['train', 'val']
    }
    dataloaders = {
        x: torch.utils.data.DataLoader(
            image_datasets[x], 
            batch_size=batch_size,
            shuffle=True, 
            num_workers=8,
        )
        for x in ['train', 'val']
    }
    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
    class_names = image_datasets['train'].classes
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    since = time.time()

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [4]:
def val_performance(model, batch_size=8):
    model.eval()   # Set model to evaluate mode

    running_count, running_corrects = 0, 0
    y_true, y_pred_prob = [], []
    # Iterate over testing patients
    data_transforms = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    image_datasets = datasets.ImageFolder(
        f'{data_dir}/val', 
        transform=data_transforms,
    ) 
    dataloaders = torch.utils.data.DataLoader(
        image_datasets, 
        batch_size=batch_size,
        shuffle=True, 
        num_workers=8,
    )

    # Iterate over data.
    for inputs, labels in dataloaders:
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

        # statistics
        running_count += len(preds)
        running_corrects += torch.sum(preds == labels.data)
        
        # Record result
        y_true += labels.data.tolist()
        y_pred_prob += outputs.tolist()
        
    epoch_acc = running_corrects.double() / running_count
    print('Testing Acc: {:.4f}'.format(epoch_acc))
    return y_true, y_pred_prob

In [5]:
def train_model_cv(file_df, model, criterion, optimizer, scheduler, num_epochs=5, batch_size=8, n=4):
    skf = StratifiedKFold(n_splits=n)
    y_true, y_pred = [], []
    for idx_train, idx_val in skf.split(file_df, file_df['group']):
        print('*' * 20)
        print('Prepare folders')
        shutil.rmtree(data_dir)
        os.mkdir(data_dir)
        os.mkdir(f'{data_dir}/train')
        os.mkdir(f'{data_dir}/train/0')
        os.mkdir(f'{data_dir}/train/1')
        os.mkdir(f'{data_dir}/val')
        os.mkdir(f'{data_dir}/val/0')
        os.mkdir(f'{data_dir}/val/1')
        file_train, file_val = file_df.loc[idx_train, 'file'], file_df.loc[idx_val, 'file']
        
        # Prepare training image folders
        for file in list(file_train):
            copy_tree(f'{raw_dir}/{file}/0/', f'{data_dir}/train/0/', verbose=0)
            copy_tree(f'{raw_dir}/{file}/1/', f'{data_dir}/train/1/', verbose=0)
        
        # Prepare validation image folders
        for file in list(file_val):
            copy_tree(f'{raw_dir}/{file}/0/', f'{data_dir}/val/0/', verbose=0)
            copy_tree(f'{raw_dir}/{file}/1/', f'{data_dir}/val/1/', verbose=0)
        
        # Model training and reporting results on the validation set
        print('Training starts')
        model_trained = train_model(model, criterion, optimizer, scheduler, num_epochs=num_epochs, batch_size=batch_size)
        y_true_, y_pred_prob_ = val_performance(model_trained, batch_size=batch_size)
        y_true += y_true_
        y_pred_prob += y_pred_prob_
        
    return y_true, y_pred_prob

# Finetuning 

In [6]:
# SqueezeNet
model_ft = models.squeezenet1_0(pretrained=True)
model_ft.classifier[1] = torch.nn.Conv2d(512, 2, kernel_size=(1,1), stride=(1,1))
model_ft.num_classes = 2
model_ft = model_ft.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer_ft = torch.optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

y_true, y_pred_prob = train_model_cv(
    train_df, model_ft, 
    criterion, optimizer_ft, exp_lr_scheduler, num_epochs=20, 
    batch_size=8, n=4)

********************
Prepare folders
Training starts
Epoch 0/19
----------
train Loss: 0.4282 Acc: 0.8078
val Loss: 0.5154 Acc: 0.7930

Epoch 1/19
----------
train Loss: 0.3655 Acc: 0.8456
val Loss: 0.4641 Acc: 0.8057

Epoch 2/19
----------
train Loss: 0.3496 Acc: 0.8518
val Loss: 0.4753 Acc: 0.8086

Epoch 3/19
----------
train Loss: 0.3404 Acc: 0.8562
val Loss: 0.4789 Acc: 0.8074

Epoch 4/19
----------
train Loss: 0.3329 Acc: 0.8577
val Loss: 0.4989 Acc: 0.8119

Epoch 5/19
----------
train Loss: 0.3294 Acc: 0.8639
val Loss: 0.4585 Acc: 0.8170

Epoch 6/19
----------
train Loss: 0.3209 Acc: 0.8654
val Loss: 0.5075 Acc: 0.8149

Epoch 7/19
----------
train Loss: 0.2971 Acc: 0.8760
val Loss: 0.4874 Acc: 0.8212

Epoch 8/19
----------
train Loss: 0.2911 Acc: 0.8798
val Loss: 0.4773 Acc: 0.8189

Epoch 9/19
----------
train Loss: 0.2894 Acc: 0.8823
val Loss: 0.4818 Acc: 0.8203

Epoch 10/19
----------
train Loss: 0.2872 Acc: 0.8810
val Loss: 0.4878 Acc: 0.8245

Epoch 11/19
----------
train Loss

UnboundLocalError: local variable 'y_pred_prob' referenced before assignment

In [None]:
res = {'y_true': y_true, 'y_pred_prob': y_pred_prob}
with open('../archive/Model_CNN_CV_prob.pkl', 'wb') as f:
    pickle.dump(res, f)

In [None]:
# print(metrics.classification_report(y_true, y_pred))