In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import os
import copy
import pickle
import shutil
import torch
import torch.optim as optim
import torchvision
from distutils.dir_util import copy_tree
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn import metrics
from torchvision import datasets, models, transforms

In [2]:
train_dir = 'C:/Users/joey3/Desktop/Apnea_Train'
test_dir = 'C:/Users/joey3/Desktop/Apnea_Test'
raw_dir = 'C:/Users/joey3/Desktop/Apnea_Raw'
train_df = pd.read_csv('../resources/File_train.csv')
test_df = pd.read_csv('../resources/File_test.csv')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [8]:
shutil.rmtree(test_dir)
os.mkdir(test_dir)
os.mkdir(f'{test_dir}/0')
os.mkdir(f'{test_dir}/1')

# Prepare testing image folders
for file in test_df['file']:
    copy_tree(f'{raw_dir}/{file}/0/', f'{test_dir}/0/', verbose=0)
    copy_tree(f'{raw_dir}/{file}/1/', f'{test_dir}/1/', verbose=0)


In [3]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=5, batch_size=8):
    # Preparation work
    data_transforms = {
        'train': transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ]),
        'val': transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ]),
    }

    image_datasets = {
        x: datasets.ImageFolder(
            os.path.join(train_dir, x), 
            transform=data_transforms[x],
        ) 
        for x in ['train', 'val']
    }
    dataloaders = {
        x: torch.utils.data.DataLoader(
            image_datasets[x], 
            batch_size=batch_size,
            shuffle=True, 
            num_workers=8,
        )
        for x in ['train', 'val']
    }
    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
    class_names = image_datasets['train'].classes
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    since = time.time()

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [4]:
def val_performance(model, batch_size=8):
    model.eval()   # Set model to evaluate mode

    running_count, running_corrects = 0, 0
    y_true, y_pred = [], []
    # Iterate over testing patients
    data_transforms = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    image_datasets = datasets.ImageFolder(
        f'{train_dir}/val', 
        transform=data_transforms,
    ) 
    dataloaders = torch.utils.data.DataLoader(
        image_datasets, 
        batch_size=batch_size,
        shuffle=True, 
        num_workers=8,
    )

    # Iterate over data.
    for inputs, labels in dataloaders:
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

        # statistics
        running_count += len(preds)
        running_corrects += torch.sum(preds == labels.data)
        
        # Record result
        y_true += labels.data.tolist()
        y_pred += preds.tolist()

    epoch_acc = running_corrects.double() / running_count
    print('Validation Acc: {:.4f}'.format(epoch_acc))
    return y_true, y_pred

In [7]:
def test_performance(model, batch_size=8):
    model.eval()   # Set model to evaluate mode

    y_true, y_pred = [], []
    # Iterate over testing patients
    data_transforms = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    image_datasets = datasets.ImageFolder(
        test_dir, 
        transform=data_transforms,
    ) 
    dataloaders = torch.utils.data.DataLoader(
        image_datasets, 
        batch_size=batch_size,
        shuffle=True, 
        num_workers=8,
    )

    # Iterate over data.
    for inputs, labels in dataloaders:
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

        # Record result
        y_true += labels.data.tolist()
        y_pred += preds.tolist()

    return y_true, y_pred

In [5]:
def train_model_cv(file_df, model, criterion, optimizer, scheduler, num_epochs=5, batch_size=8, n=4):
    skf = StratifiedKFold(n_splits=n)
    y_true, y_pred = [], []
    for idx_train, idx_val in skf.split(file_df, file_df['group']):
        print('*' * 20)
        print('Prepare folders')
        shutil.rmtree(train_dir)
        os.mkdir(train_dir)
        os.mkdir(f'{train_dir}/train')
        os.mkdir(f'{train_dir}/train/0')
        os.mkdir(f'{train_dir}/train/1')
        os.mkdir(f'{train_dir}/val')
        os.mkdir(f'{train_dir}/val/0')
        os.mkdir(f'{train_dir}/val/1')
        file_train, file_val = file_df.loc[idx_train, 'file'], file_df.loc[idx_val, 'file']
        
        # Prepare training image folders
        for file in list(file_train):
            copy_tree(f'{raw_dir}/{file}/0/', f'{train_dir}/train/0/', verbose=0)
            copy_tree(f'{raw_dir}/{file}/1/', f'{train_dir}/train/1/', verbose=0)
        
        # Prepare validation image folders
        for file in list(file_val):
            copy_tree(f'{raw_dir}/{file}/0/', f'{train_dir}/val/0/', verbose=0)
            copy_tree(f'{raw_dir}/{file}/1/', f'{train_dir}/val/1/', verbose=0)
        
        # Model training and reporting results on the validation set
        print('Training starts')
        model_trained = train_model(model, criterion, optimizer, scheduler, num_epochs=num_epochs, batch_size=batch_size)
        y_true_, y_pred_ = val_performance(model_trained, batch_size=batch_size)
        y_true += y_true_
        y_pred += y_pred_
        
    return y_true, y_pred 

In [4]:
def train_model_single_split(file_df, model, criterion, optimizer, scheduler, num_epochs=5, batch_size=8):
    y_true, y_pred = [], []
    file_train, file_val = train_test_split(file_df['file'], test_size=0.25, random_state=1, stratify=file_df['group'])
    
    print('*' * 20)
    print('Prepare folders')
    shutil.rmtree(train_dir)
    os.mkdir(train_dir)
    os.mkdir(f'{train_dir}/train')
    os.mkdir(f'{train_dir}/train/0')
    os.mkdir(f'{train_dir}/train/1')
    os.mkdir(f'{train_dir}/val')
    os.mkdir(f'{train_dir}/val/0')
    os.mkdir(f'{train_dir}/val/1')
    
    # Prepare training image folders
    for file in list(file_train):
        copy_tree(f'{raw_dir}/{file}/0/', f'{train_dir}/train/0/', verbose=0)
        copy_tree(f'{raw_dir}/{file}/1/', f'{train_dir}/train/1/', verbose=0)

    # Prepare validation image folders
    for file in list(file_val):
        copy_tree(f'{raw_dir}/{file}/0/', f'{train_dir}/val/0/', verbose=0)
        copy_tree(f'{raw_dir}/{file}/1/', f'{train_dir}/val/1/', verbose=0)

    # Model training and reporting results on the validation set
    print('Training starts')
    model_trained = train_model(model, criterion, optimizer, scheduler, num_epochs=num_epochs, batch_size=batch_size)
        
    return model_trained

# Finetuning 

In [5]:
# SqueezeNet
model_ft = models.squeezenet1_0(pretrained=True)
model_ft.classifier[1] = torch.nn.Conv2d(512, 2, kernel_size=(1,1), stride=(1,1))
model_ft.num_classes = 2
model_ft = model_ft.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer_ft = torch.optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

In [6]:
# Train a model and test accuracy for testing set
model_trained = train_model_single_split(
    train_df, model_ft, 
    criterion, optimizer_ft, exp_lr_scheduler, num_epochs=20, 
    batch_size=8)


********************
Prepare folders
Training starts
Epoch 0/19
----------
train Loss: 0.4856 Acc: 0.7785
val Loss: 0.5294 Acc: 0.7300

Epoch 1/19
----------
train Loss: 0.4272 Acc: 0.8162
val Loss: 0.2702 Acc: 0.8962

Epoch 2/19
----------
train Loss: 0.4133 Acc: 0.8250
val Loss: 0.2538 Acc: 0.9037

Epoch 3/19
----------
train Loss: 0.4018 Acc: 0.8332
val Loss: 0.2663 Acc: 0.8966

Epoch 4/19
----------
train Loss: 0.3923 Acc: 0.8341
val Loss: 0.2695 Acc: 0.9040

Epoch 5/19
----------
train Loss: 0.3852 Acc: 0.8384
val Loss: 0.2708 Acc: 0.8954

Epoch 6/19
----------
train Loss: 0.3830 Acc: 0.8401
val Loss: 0.2833 Acc: 0.8976

Epoch 7/19
----------
train Loss: 0.3538 Acc: 0.8547
val Loss: 0.2520 Acc: 0.9021

Epoch 8/19
----------
train Loss: 0.3493 Acc: 0.8545
val Loss: 0.2574 Acc: 0.9011

Epoch 9/19
----------
train Loss: 0.3476 Acc: 0.8573
val Loss: 0.2587 Acc: 0.9022

Epoch 10/19
----------
train Loss: 0.3452 Acc: 0.8559
val Loss: 0.2715 Acc: 0.8953

Epoch 11/19
----------
train Loss

In [8]:
y_true, y_pred = test_performance(model_trained, batch_size=8)

In [11]:
print(metrics.classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      4171
           1       0.85      0.70      0.77      2699

    accuracy                           0.84      6870
   macro avg       0.84      0.81      0.82      6870
weighted avg       0.84      0.84      0.83      6870



## Cross-validation

In [9]:
y_true, y_pred = train_model_cv(
    train_df, model_ft, 
    criterion, optimizer_ft, exp_lr_scheduler, num_epochs=20, 
    batch_size=8, n=4)

********************
Prepare folders
Training starts
Epoch 0/19
----------
train Loss: 0.4037 Acc: 0.8212
val Loss: 0.4924 Acc: 0.7634

Epoch 1/19
----------
train Loss: 0.3607 Acc: 0.8456
val Loss: 0.4684 Acc: 0.8114

Epoch 2/19
----------
train Loss: 0.3438 Acc: 0.8558
val Loss: 0.4849 Acc: 0.8072

Epoch 3/19
----------
train Loss: 0.3376 Acc: 0.8578
val Loss: 0.4738 Acc: 0.8038

Epoch 4/19
----------
train Loss: 0.3287 Acc: 0.8599
val Loss: 0.4443 Acc: 0.8257

Epoch 5/19
----------
train Loss: 0.3235 Acc: 0.8608
val Loss: 0.4531 Acc: 0.8313

Epoch 6/19
----------
train Loss: 0.3161 Acc: 0.8688
val Loss: 0.4589 Acc: 0.8263

Epoch 7/19
----------
train Loss: 0.2905 Acc: 0.8813
val Loss: 0.4622 Acc: 0.8247

Epoch 8/19
----------
train Loss: 0.2871 Acc: 0.8815
val Loss: 0.4819 Acc: 0.8244

Epoch 9/19
----------
train Loss: 0.2858 Acc: 0.8834
val Loss: 0.4666 Acc: 0.8272

Epoch 10/19
----------
train Loss: 0.2827 Acc: 0.8839
val Loss: 0.4812 Acc: 0.8290

Epoch 11/19
----------
train Loss

In [11]:
res = {'y_true': y_true, 'y_pred': y_pred}
with open('../archive/Model_CNN_CV.pkl', 'wb') as f:
    pickle.dump(res, f)

In [13]:
print(metrics.classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.92      0.89     16549
           1       0.85      0.79      0.82     10291

    accuracy                           0.87     26840
   macro avg       0.86      0.85      0.86     26840
weighted avg       0.87      0.87      0.86     26840

