In [1]:
import torch

from torch.utils.data import Dataset

# Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
# Record the library versions this notebook was executed with.
print("PyTorch Version: ", torch.__version__)
print("Torchvision Version: ", torchvision.__version__)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Device:", device)

PyTorch Version:  1.1.0
Torchvision Version:  0.2.2
Device: cuda:0


In [3]:
# --- Experiment configuration ------------------------------------------------
data_dir = "data/train/spectograms"  # image root handed to ImageFolder below
model_name = "custom"                # "squeezenet", "resnet" or "custom"
num_classes = 10                     # number of target classes
batch_size = 32                      # samples per mini-batch
num_epochs = 50                      # passes over the training split

# True  -> freeze the pretrained weights and train only the replaced head.
# False -> every parameter is updated (fine-tuning / training from scratch).
feature_extract = False

In [4]:
class ConvNet(nn.Module):
    """Small VGG-style CNN for image classification.

    Five 3x3 convolutions (each followed by ReLU and batch normalisation),
    interleaved with three 2x2 max-pools, feed a global average pool and a
    single linear classifier. Thanks to the adaptive average pool the network
    is not tied to one input resolution; the shape comments below assume a
    3x224x224 input.

    Args:
        num_classes: Number of output logits. Defaults to 10, the value that
            used to be hard-coded in the classifier.
    """

    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        # NOTE: the order of the modules below fixes the state_dict keys
        # (conv_layers.0, conv_layers.2, ...); keep it stable so existing
        # checkpoints remain loadable.
        self.conv_layers = nn.Sequential(
            # input: 3x224x224
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1),
            # output: 16x224x224
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # output: 16x112x112

            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            # output: 32x112x112
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # output: 32x56x56

            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            # output: 64x56x56
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
            # output: 64x56x56
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # output: 64x28x28

            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
            # output: 64x28x28
            nn.ReLU(),
            nn.BatchNorm2d(64),
            # Global average pool: collapses each feature map to one value.
            nn.AdaptiveAvgPool2d((1,1))
            # output: 64x1x1
        )
        self.linear_layer = nn.Sequential(
            nn.Linear(64, num_classes)
        )

    def forward(self, input):
        """Return class logits of shape (batch, num_classes) for an image batch."""
        output = self.conv_layers(input)
        # Flatten (batch, 64, 1, 1) -> (batch, 64) for the linear classifier.
        output = output.view(input.size(0), -1)
        output = self.linear_layer(output)
        return output

In [5]:
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25):
    """Train ``model`` and restore the weights with the best validation accuracy.

    Every epoch runs a 'train' phase (gradients enabled, optimizer stepped)
    followed by a 'val' phase (gradients disabled). Loss and accuracy are
    averaged over the samples actually yielded by each loader, so the figures
    stay correct when a loader uses a sampler or ``drop_last``.

    Args:
        model: Network to optimise. Batches are moved to the device of its
            first parameter (CPU if it has no parameters).
        dataloaders: Mapping with 'train' and 'val' entries, each an iterable
            of ``(inputs, labels)`` batches.
        criterion: Loss callable ``criterion(outputs, labels)``. The running
            loss weights each batch by its size, which assumes the criterion
            returns a per-batch mean.
        optimizer: Optimizer already bound to the model's parameters.
        num_epochs: Number of train/val epochs to run.

    Returns:
        Tuple ``(model, val_acc_history)``: the model loaded with the best
        validation weights, and the per-epoch validation accuracies
        (0-dim double tensors).

    Raises:
        ZeroDivisionError: If a phase's loader yields no samples.
    """
    since = time.time()

    # Follow the model's own device rather than a notebook-level global so the
    # function behaves the same regardless of which cells have been executed.
    try:
        run_device = next(model.parameters()).device
    except StopIteration:
        run_device = torch.device('cpu')

    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-'*10)

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0
            num_samples = 0  # samples seen in this phase

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(run_device)
                labels = labels.to(run_device)

                optimizer.zero_grad()

                # Only build the autograd graph while training.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                batch_count = inputs.size(0)
                running_loss += loss.item() * batch_count
                running_corrects += torch.sum(preds == labels)
                num_samples += batch_count

            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects.double() / num_samples

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # Snapshot the weights whenever validation accuracy improves.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    return model, val_acc_history

In [6]:
def set_parameter_requires_grad(model, feature_extracting):
    """Freeze all parameters of ``model`` when ``feature_extracting`` is truthy.

    With a falsy flag the model is left untouched. Returns ``None``.
    """
    if not feature_extracting:
        return
    for weight in model.parameters():
        weight.requires_grad = False

In [7]:
def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
    """Build the network selected by ``model_name``.

    Args:
        model_name: One of "squeezenet", "resnet" or "custom".
        num_classes: Size of the replacement classification head for the
            torchvision models.
        feature_extract: Forwarded to ``set_parameter_requires_grad`` for the
            torchvision models; when truthy their existing weights are frozen
            before the head is replaced.
        use_pretrained: Passed as ``pretrained=`` to the torchvision
            constructors.

    Returns:
        Tuple ``(model, input_size)`` where ``input_size`` is the image side
        length the caller should resize/crop to.

    Raises:
        ValueError: If ``model_name`` is not a supported name. Previously this
            returned ``(None, 0)``, which only failed later at ``.to(device)``.
    """
    if model_name == "squeezenet":
        model_ft = models.squeezenet1_0(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        # Swap the final 1x1 convolution so it emits `num_classes` channels.
        model_ft.classifier[1] = nn.Conv2d(512, num_classes, kernel_size=(1,1), stride=(1,1))
        model_ft.num_classes = num_classes
        input_size = 224

    elif model_name == "resnet":
        model_ft = models.resnet34(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        # Replace the fully connected head, keeping its input width.
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "custom":
        # NOTE(review): num_classes, feature_extract and use_pretrained are
        # not forwarded here; ConvNet is built with its own defaults.
        model_ft = ConvNet()
        input_size = 224

    else:
        raise ValueError(
            "Unknown model_name {!r}; expected 'squeezenet', 'resnet' or 'custom'".format(model_name)
        )

    return model_ft, input_size

# Instantiate the architecture chosen in the configuration cell, without
# downloading pretrained weights.
model_ft, input_size = initialize_model(
    model_name,
    num_classes,
    feature_extract,
    use_pretrained=False,
)

In [8]:
# batch_size is taken from the configuration cell; it is deliberately not
# redefined here so that a change there cannot be silently overridden.

# Preprocessing for every image: resize, centre-crop to a square of side
# `input_size`, convert to a tensor and normalise each channel with the
# mean / std constants conventionally used with torchvision's ImageNet models.
data_transforms = transforms.Compose([
    transforms.Resize(input_size),
    transforms.CenterCrop(input_size),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

print("Initializing Datasets and Dataloaders...")

image_dataset = datasets.ImageFolder(data_dir, data_transforms)

validation_split = 0.2
random_seed = 42

dataset_size = len(image_dataset)
split = int(validation_split * dataset_size)

# NOTE(review): `indices` is not used by the split below; the NumPy seeding
# and permutation are kept only in case other cells rely on them.
np.random.seed(random_seed)
indices = np.random.permutation(dataset_size)

# Derive both sizes from validation_split so that changing it takes effect
# (the train fraction used to be hard-coded to 0.8).
val_size = split
train_size = dataset_size - val_size

# random_split draws its permutation from PyTorch's RNG, not NumPy's, so the
# torch generator must be seeded for the split to be reproducible.
torch.manual_seed(random_seed)
train_dataset, val_dataset = torch.utils.data.random_split(image_dataset, [train_size, val_size])

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
# Validation order does not influence the metrics, so no shuffling is needed.
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

dataloaders_dict = {
    'train': train_dataloader,
    'val': val_dataloader,
}

Initializing Datasets and Dataloaders...


In [9]:
# NOTE(review): len(train_dataloader) is already the number of batches per
# epoch, so dividing it by batch_size again is probably not the intended
# quantity. Confirm whether len(train_dataset) / batch_size (or simply
# len(train_dataloader)) was meant.
len(train_dataloader)/batch_size

4.25

In [10]:
# Move the network onto the selected device before creating the optimizer.
model_ft = model_ft.to(device)

# Choose what the optimizer will update and list the trainable parameters.
#   * fine-tuning (feature_extract is False): hand over every parameter;
#   * feature extraction (feature_extract is True): hand over only the
#     parameters whose requires_grad flag is still set.
print("Params to learn:")
if not feature_extract:
    params_to_update = model_ft.parameters()
    for name, param in model_ft.named_parameters():
        if param.requires_grad:
            print("\t", name)
else:
    params_to_update = []
    for name, param in model_ft.named_parameters():
        if not param.requires_grad:
            continue
        params_to_update.append(param)
        print("\t", name)

# Plain SGD with momentum over the selected parameters.
optimizer_ft = optim.SGD(params_to_update, lr=0.01, momentum=0.9)

Params to learn:
	 conv_layers.0.weight
	 conv_layers.0.bias
	 conv_layers.2.weight
	 conv_layers.2.bias
	 conv_layers.4.weight
	 conv_layers.4.bias
	 conv_layers.6.weight
	 conv_layers.6.bias
	 conv_layers.8.weight
	 conv_layers.8.bias
	 conv_layers.10.weight
	 conv_layers.10.bias
	 conv_layers.11.weight
	 conv_layers.11.bias
	 conv_layers.13.weight
	 conv_layers.13.bias
	 conv_layers.15.weight
	 conv_layers.15.bias
	 conv_layers.17.weight
	 conv_layers.17.bias
	 linear_layer.0.weight
	 linear_layer.0.bias


In [11]:
# Cross-entropy loss over the class logits produced by the network.
criterion = nn.CrossEntropyLoss()

# Train and validate; `hist` receives the per-epoch validation accuracies.
model_ft, hist = train_model(
    model=model_ft,
    dataloaders=dataloaders_dict,
    criterion=criterion,
    optimizer=optimizer_ft,
    num_epochs=num_epochs,
)

Epoch 0/49
----------
train Loss: 1.8167 Acc: 0.3540
val Loss: 1.7845 Acc: 0.3339

Epoch 1/49
----------
train Loss: 1.3813 Acc: 0.5168
val Loss: 1.3709 Acc: 0.5023

Epoch 2/49
----------
train Loss: 1.1472 Acc: 0.6083
val Loss: 1.1516 Acc: 0.5805

Epoch 3/49
----------
train Loss: 0.9792 Acc: 0.6711
val Loss: 0.9524 Acc: 0.6329

Epoch 4/49
----------
train Loss: 0.8289 Acc: 0.7298
val Loss: 1.1186 Acc: 0.6035

Epoch 5/49
----------
train Loss: 0.7262 Acc: 0.7659
val Loss: 0.8842 Acc: 0.6937

Epoch 6/49
----------
train Loss: 0.6394 Acc: 0.7974
val Loss: 0.6640 Acc: 0.7700

Epoch 7/49
----------
train Loss: 0.5630 Acc: 0.8238
val Loss: 0.5644 Acc: 0.8188

Epoch 8/49
----------
train Loss: 0.5056 Acc: 0.8461
val Loss: 0.7883 Acc: 0.7203

Epoch 9/49
----------
train Loss: 0.4224 Acc: 0.8742
val Loss: 0.6430 Acc: 0.7663

Epoch 10/49
----------
train Loss: 0.3987 Acc: 0.8765
val Loss: 0.3886 Acc: 0.8767

Epoch 11/49
----------
train Loss: 0.3523 Acc: 0.8935
val Loss: 0.4227 Acc: 0.8657

Ep

In [17]:
model_filepath = './saved_models/MFCC_CNN_96'

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the checkpoint.
checkpoint_dir = os.path.dirname(model_filepath)
if checkpoint_dir and not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Store both model and optimizer state so training can be resumed later.
torch.save({
    'model_state_dict': model_ft.state_dict(),
    'optimizer_state_dict': optimizer_ft.state_dict(),
}, model_filepath)
