This notebook is part of Andreu's (esdandreu@gmail.com) Master's thesis work at
Keio University.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AcousticOdometry/AO/blob/main/notebooks/models.ipynb)


# Setup

This section will take care of installing the necessary packages as well as
configuring some environment variables.


## Colab

Check whether the notebook is being executed in [Google
Colab](https://colab.research.google.com/) and, if so, set up the software
needed in the Colab runtime.


In [9]:
%%capture
# Detect the runtime: COLAB_RUNTIME is True when `google.colab` can be
# imported and False when that import raises ImportError.
# NOTE(review): presumably `google.colab` is importable only inside a Colab
# runtime -- confirm if other Google environments must be supported.
try:
    from google import colab
    COLAB_RUNTIME = True
    # torchinfo is imported later in the notebook (`from torchinfo import summary`)
    %pip install torchinfo
    # Mount the user's Google Drive under /content/drive
    colab.drive.mount('/content/drive')
except ImportError:
    COLAB_RUNTIME = False

## Packages


In [10]:
import logging
import numpy as np

from pathlib import Path
from datetime import datetime
from tqdm.notebook import tqdm
from typing import Dict, Optional, Tuple, Type

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchinfo import summary
from torchvision import datasets, transforms

### AO

Setup Acoustic Odometry python package. If this notebook is being executed in
[Colab](#colab), the package will be installed from Github. Because of this, a
Github [personal access
token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)
is needed for the installation.

If the notebook is not running on Colab and the package is not already
installed, an `ImportError` pointing to the installation instructions is raised.


In [11]:
# Make the `ao` package importable.
# - On Colab: download the assets of the latest GitHub release and try to
#   `pip install` them one by one until one succeeds.
# - Elsewhere: require that `ao` is already installed.
if COLAB_RUNTIME:
    import subprocess
    import requests
    import sys
    import os
    #@markdown Use a [GitHub Personal Access Token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)
    GITHUB_TOKEN = ''  #@param {type:"string"}
    auth = requests.auth.HTTPBasicAuth('', GITHUB_TOKEN)
    response = requests.get(
        "https://api.github.com/repos/AcousticOdometry/AO/releases/latest",
        auth=auth
        )
    try:
        response.raise_for_status()
    except requests.HTTPError as e:
        # Keep the HTTP error as the cause so the status code stays visible
        raise RuntimeError(
            'Check GITHUB_TOKEN is a Personal Access Token with repo access'
            ) from e
    # Ask the API for the binary content of each asset instead of its metadata
    headers = {'Accept': 'application/octet-stream'}
    failed_installs = []  # (asset name, pip output) of every rejected asset
    for asset in response.json()['assets']:
        r = requests.get(
            asset['url'], auth=auth, allow_redirects=True, headers=headers
            )
        r.raise_for_status()
        wheel_name = asset['name']
        with open(wheel_name, 'wb') as f:
            f.write(r.content)
        try:
            subprocess.check_output(
                [sys.executable, '-m', 'pip', 'install', wheel_name],
                stderr=subprocess.STDOUT,
                )
            print(f'Installed {wheel_name}')
            break
        except subprocess.CalledProcessError as e:
            # A failing asset is not fatal (the next one is tried), but keep
            # pip's output so the reason can be reported if none succeeds.
            failed_installs.append(
                (wheel_name, (e.output or b'').decode(errors='replace'))
                )
        finally:
            # The downloaded file is only needed during the installation
            os.remove(wheel_name)
    else:
        # No asset was installed. Report why before the import below decides
        # whether that is fatal (`ao` may already be installed).
        print(
            'No asset of the latest AO release could be installed',
            file=sys.stderr
            )
        for failed_name, pip_output in failed_installs:
            print(f'--- {failed_name} ---\n{pip_output}', file=sys.stderr)
    import ao
else:
    try:
        import ao
    except ImportError as e:
        raise ImportError(
            "Acoustic Odometry python extension is not installed. Check "
            r"https://github.com/AcousticOdometry/AO#readme"
            " for detailed instructions."
            ) from e

## Configure logging

In [12]:
# Log files are stored in a `logs` folder relative to the working directory
logging_folder = Path().parent / 'logs'
logging_folder.mkdir(exist_ok=True)

# Console handler: only INFO and above are echoed to the stream
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)

# One log file per configuration, named after the current date and time
log_file = logging_folder / datetime.now().strftime('%Y%m%d_%H%M%S.log')
file_handler = logging.FileHandler(log_file)

# The root logger is set to DEBUG; the file handler has no level of its own
logging.basicConfig(
    level=logging.DEBUG,
    format="[%(levelname)s] %(message)s",
    handlers=[file_handler, stream_handler],
    )


# Models

## Model definition

In [13]:
from abc import ABC, abstractmethod


class AcousticOdometryModel(nn.Module, ABC):
    """Abstract base class of the models defined in this notebook.

    Concrete models subclass it and implement `forward`. Inheriting from
    `ABC` is what makes `@abstractmethod` effective: the base class itself,
    or a subclass that does not override `forward`, cannot be instantiated.
    The known models can be listed with
    `AcousticOdometryModel.__subclasses__()`.
    """

    @abstractmethod
    def forward(self, x):
        """Compute the model output for the input batch `x`."""
        raise NotImplementedError

# Re-running this cell binds the name to a brand new class whose
# `__subclasses__()` list is empty, so the subclasses "reset" without
# restarting the kernel. Re-run the cells that define the models afterwards
# to register them against the new base class.

In [14]:
class CNNet(AcousticOdometryModel):
    """Convolutional classifier for single-channel 2D inputs.

    Two blocks of 5x5 convolution, 2x2 max pooling and ReLU are followed by
    two fully connected layers. The output holds log-probabilities over
    `classes`.

    Args:
        classes: Number of output classes.
        input_shape: `(height, width)` of the input, used to size the first
            fully connected layer. The default `(256, 120)` yields the
            210816 input features that were previously hard-coded.

    Raises:
        ValueError: If `input_shape` is too small for the two
            convolution + pooling stages.
    """

    def __init__(
        self,
        classes: int,
        input_shape: Tuple[int, int] = (256, 120),
        ):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=5)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.flatten = nn.Flatten()
        height, width = (self._reduced_size(size) for size in input_shape)
        if height <= 0 or width <= 0:
            raise ValueError(
                f'Input shape {tuple(input_shape)} is too small for CNNet'
                )
        self.fc1 = nn.Linear(self.conv2.out_channels * height * width, 512)
        self.fc2 = nn.Linear(512, classes)

    @staticmethod
    def _reduced_size(size: int) -> int:
        """Size of one spatial dimension after both conv + pool stages."""
        for _ in range(2):
            # An unpadded 5x5 convolution removes 4, 2x2 pooling halves
            size = (size - 4) // 2
        return size

    def forward(self, x):
        """Return the log-probabilities, one row per input sample."""
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        # NOTE(review): the ReLU clamps negative logits to zero before the
        # softmax -- kept as is, confirm that it is intended.
        x = F.relu(self.fc2(x))
        return F.log_softmax(x, dim=1)

In [15]:
# Display the model classes currently deriving from the base class
AcousticOdometryModel.__subclasses__()

[__main__.CNNet]

## Load data

In [16]:
# Locate the experiment folder, which holds the `datasets` and `models`
# subfolders used in the rest of the notebook.
if COLAB_RUNTIME:
    # @markdown Check where is the experiment folder situated in your drive folder.
    # @markdown Remember that if you have been shared the folder, you can
    # @markdown [add a shortcut to your drive](https://support.google.com/drive/answer/9700156?hl=en&co=GENIE.Platform%3DDesktop)
    # @markdown in order to make it available in google colab.
    experiment = "/content/drive/MyDrive/VAO_WheelTestBed-Experiment-1"  #@param {type:"string"}
    EXPERIMENT_FOLDER = Path(experiment)
    # Fail early when the configured path does not exist in the mounted drive
    if not EXPERIMENT_FOLDER.is_dir():
        raise RuntimeError(f'Invalid experiment folder {EXPERIMENT_FOLDER}')
else:
    # NOTE(review): presumably resolves the folder from the
    # WHEELTESTBED_EXPERIMENT1 environment variable -- confirm in `ao`.
    EXPERIMENT_FOLDER = ao.dataset.utils.get_folder(
        env='WHEELTESTBED_EXPERIMENT1'
        )
DATASETS_FOLDER = EXPERIMENT_FOLDER / 'datasets'
MODELS_FOLDER = EXPERIMENT_FOLDER / 'models'
# The models folder is created on demand; its parent must already exist
MODELS_FOLDER.mkdir(exist_ok=True)

In [17]:
# Run on the GPU whenever CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
logging.info(f'Using {DEVICE} device')

[INFO] Using cuda device


In [18]:
def load_dataset(
    dataset_folder: Path,
    train_split: float = 0.8,
    batch_size: int = 15,
    seed: Optional[int] = None,
    ) -> Tuple[Dict[str, torch.utils.data.DataLoader], Dict[str, int]]:
    """Load a folder of `.npy` samples and split it into train and test sets.

    The folder is read with `torchvision.datasets.DatasetFolder`, loading
    every sample with `np.load` and converting it with `ToTensor`.

    Args:
        dataset_folder: Root folder of the dataset.
        train_split: Fraction of the samples assigned to the train set. The
            remaining samples form the test set.
        batch_size: Number of samples per batch in both loaders.
        seed: Optional seed of the random split. When given, the same
            samples end up in each set on every call. By default the global
            random state is used.

    Returns:
        A dictionary with the `'train'` and `'test'` data loaders and the
        `class_to_idx` mapping of the dataset.
    """
    dataset = datasets.DatasetFolder(
        root=dataset_folder,
        loader=np.load,
        # A tuple, as documented by torchvision
        extensions=('.npy', ),
        transform=transforms.Compose([transforms.ToTensor()])
        )
    train_size = int(train_split * len(dataset))
    test_size = len(dataset) - train_size
    # Only pass a generator when a seed is requested
    split_options = {}
    if seed is not None:
        split_options['generator'] = torch.Generator().manual_seed(seed)
    subsets = torch.utils.data.random_split(
        dataset, [train_size, test_size], **split_options
        )
    use_cuda = DEVICE == 'cuda'
    loaders = {}
    for name, subset in zip(['train', 'test'], subsets):
        loaders[name] = torch.utils.data.DataLoader(
            subset,
            batch_size=batch_size,
            num_workers=2 if use_cuda else 1,
            # The evaluation metrics do not depend on the order of the
            # samples, so only the train set is shuffled
            shuffle=(name == 'train'),
            pin_memory=use_cuda,
            )
        logging.info(
            f'{name} set: {len(subset)} samples, {len(loaders[name])} batches'
            )
    return loaders, dataset.class_to_idx


## Train the model

In [19]:
def train_epoch(
    dataloader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    cost_function: torch.nn.modules.loss._Loss,
    optimizer: torch.optim.Optimizer,
    ):
    """Run one optimization pass over every batch of `dataloader`.

    The model is switched to training mode and updated in place. The loss of
    every 100th batch (starting with the first one) is logged at debug
    level together with the number of samples processed so far.
    """
    model.train()
    total_samples = len(dataloader.dataset)
    # TODO use tqdm
    for step, batch in enumerate(dataloader):
        inputs, targets = (tensor.to(DEVICE) for tensor in batch)

        # Standard update: clear gradients, forward, backward, step
        optimizer.zero_grad()
        batch_loss = cost_function(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        if step % 100 != 0:
            continue
        loss, current = batch_loss.item(), step * len(inputs)
        logging.debug(
            f'loss: {loss:>7f}  [{current:>5d}/{total_samples:>5d}]'
            )


def test(
    dataloader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    cost_function: torch.nn.modules.loss._Loss,
    ):
    model.eval()
    size = len(dataloader.dataset)

    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, Y in dataloader:
            X, Y = X.to(DEVICE), Y.to(DEVICE)
            pred = model(X)

            test_loss += cost_function(pred, Y).item()
            correct += (pred.argmax(1) == Y).type(torch.float).sum().item()
    test_loss /= size
    correct /= size
    logging.info(f'acc: {(100*correct):>0.1f}%, avg loss: {test_loss:>8f}')


In [20]:
def train(
    dataset_folder: Path,
    model_class: Type[AcousticOdometryModel],
    epochs: int = 15,
    cost_function: torch.nn.modules.loss._Loss = torch.nn.CrossEntropyLoss(),
    optimizer_class: Type[torch.optim.Optimizer] = torch.optim.Adam,
    optimizer_options: Optional[dict] = None,
    ) -> AcousticOdometryModel:
    """Train a new model on the dataset stored in `dataset_folder`.

    Args:
        dataset_folder: Folder given to `load_dataset`.
        model_class: Model class, instantiated with the number of classes
            found in the dataset.
        epochs: Number of passes over the train set. The model is evaluated
            on the test set after each of them.
        cost_function: Loss used for both training and evaluation.
        optimizer_class: Optimizer class, instantiated with the model
            parameters and `optimizer_options`.
        optimizer_options: Keyword arguments of the optimizer. Defaults to
            `{'lr': 0.001}`.

    Returns:
        The trained model, placed on `DEVICE`.
    """
    # Build the default per call instead of sharing one dictionary between
    # all the calls
    if optimizer_options is None:
        optimizer_options = {'lr': 0.001}
    loaders, class_to_idx = load_dataset(dataset_folder)
    # ! A bit sketchy, in the future the classes should be fixed
    model = model_class(classes=len(class_to_idx)).to(DEVICE)
    optimizer = optimizer_class(model.parameters(), **optimizer_options)
    for t in range(epochs):
        logging.info(f'Epoch {t}')
        train_epoch(loaders['train'], model, cost_function, optimizer)
        test(loaders['test'], model, cost_function)
    return model

In [22]:
# Train a CNNet on the chosen dataset
dataset_name = 'numpy-arrays'
model = train(
    dataset_folder=DATASETS_FOLDER / dataset_name,
    model_class=CNNet,
    epochs=15,
    )

# Save the whole model as `name_<dataset>;date_<Y-m-d>;time_<H-M-S>.pt`,
# time-stamped once the training has finished
timestamp = datetime.now().strftime('date_%Y-%m-%d;time_%H-%M-%S')
model_file = f"name_{dataset_name.replace('_','-')};{timestamp}.pt"
torch.save(model, MODELS_FOLDER / model_file)

# Layer-by-layer summary for a batch of 15 single-channel 256x120 inputs
summary(model, input_size=(15, 1, 256, 120))

FileNotFoundError: Couldn't find any class folder in G:\Shared drives\VAO [Andreu's Thesis]\VAO_WheelTestBed-Experiment-1\datasets\numpy-arrays.