### Imports and preprocessing

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR

import os
import numpy as np
import pandas as pd
from skimage import feature as skif
import cv2
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from google.colab import files
from google.colab import drive
from zipfile import ZipFile

In [None]:
!pip install onnx

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install onnxsim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install onnxruntime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import onnx
import onnxsim

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [None]:
!pip install wandb -Uq

In [None]:
import wandb

wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mabletobetable[0m ([33mlanit-summer2022-antispoofing[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

load dataset in files

In [2]:
DATASET_NAME = 'data256'

drive.mount('/content/gdrive')

with ZipFile(f'/content/gdrive/MyDrive/{DATASET_NAME}.zip', 'r') as dataset_zip:
    dataset_zip.extractall('/content')

Mounted at /content/gdrive


In [None]:
drive.mount('/content/gdrive')
with ZipFile(f'/content/gdrive/MyDrive/features_128.zip', 'r') as dataset_zip:
    dataset_zip.extractall('/content')

with ZipFile(f'/content/gdrive/MyDrive/features_256.zip', 'r') as dataset_zip:
    dataset_zip.extractall('/content')

In [None]:
# in this foulder will save features
if not os.path.exists('/content/features'):
    os.mkdir('/content/features')

# test
feature_128 = np.load('/content/features_128/test_feature.npy', allow_pickle=True)
feature_256 = np.load('/content/features_256/test_feature.npy', allow_pickle=True)
test_features = np.append(feature_128, feature_256)
np.save('features/test_feature.npy', test_features)

# train
feature_128 = np.load('/content/features_128/train_feature.npy', allow_pickle=True)
feature_256 = np.load('/content/features_256/train_feature.npy', allow_pickle=True)
train_features = np.append(feature_128, feature_256)

np.save('features/train_feature.npy', train_features)

preprocess dataset

In [None]:
def lbp_histogram(image,P=8,R=1,method = 'nri_uniform'):
    '''
    image: shape is N*M 
    '''
    lbp = skif.local_binary_pattern(image, P,R, method) # lbp.shape is equal image.shape
    # cv2.imwrite("lbp.png",lbp)
    max_bins = int(lbp.max() + 1) # max_bins is related P
    hist,_= np.histogram(lbp, density=True, bins=max_bins, range=(0, max_bins))
    return hist

def save_features(file_list,file_name):
    feature_label = []
    for line in open(file_list):
        image_path = line.strip().split(' ')[0]
        label = int(line.strip().split(' ')[1])
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2YCrCb)
        y_h = lbp_histogram(image[:,:,0]) # y channel
        cb_h = lbp_histogram(image[:,:,1]) # cb channel
        cr_h = lbp_histogram(image[:,:,2]) # cr channel
        feature = np.concatenate((y_h,cb_h,cr_h))
        feature_label.append(np.append(feature,np.array(label)))
    np.save(file_name,np.array(feature_label, dtype=object)) #add 'dtype=object'

def create_partition_and_labels(data1, data2):
    """
    calculate lbp features and then
    creates 3 dictionary like follow:

    >>> partition
    {'train': ['id-1', 'id-2', 'id-3'], 'validation': ['id-4']}

    >>> ID_vectors
    {'id-1': [...], 'id-2': [...], 'id-3': [...], 'id-4': [...]}

    >>> ID_labels
    {'id-1': 0, 'id-2': 1, 'id-3': 2, 'id-4': 1}
    """
    train_list = []
    test_list = []

    # test data1
    for folder in os.listdir(f'/content/{data1}/test'):
        if os.path.exists(f'/content/{data1}/test/'+folder+'/live'):
            for image in os.listdir(f'/content/{data1}/test/'+folder+'/live'):
                test_list.append(f'\n/content/{data1}/test/'+folder+'/live/'+image+' 0')
        if os.path.exists(f'/content/{data1}/test/'+folder+'/spoof'):
            for image in os.listdir(f'/content/{data1}/test/'+folder+'/spoof'):
                test_list.append(f'\n/content/{data1}/test/'+folder+'/spoof/'+image+' 1')
    
    # train data1
    for folder in os.listdir(f'/content/{data1}/train'):
        if os.path.exists(f'/content/{data1}/train/'+folder+'/live'):
            for image in os.listdir(f'/content/{data1}/train/'+folder+'/live'):
                train_list.append(f'\n/content/{data1}/train/'+folder+'/live/'+image+' 0')
        if os.path.exists(f'/content/{data1}/train/'+folder+'/spoof'):
            for image in os.listdir(f'/content/{data1}/train/'+folder+'/spoof'):
                train_list.append(f'\n/content/{data1}/train/'+folder+'/spoof/'+image+' 1')


    # test data2
    for folder in os.listdir(f'/content/{data2}/test'):
        if os.path.exists(f'/content/{data2}/test/'+folder+'/live'):
            for image in os.listdir(f'/content/{data2}/test/'+folder+'/live'):
                test_list.append(f'\n/content/{data2}/test/'+folder+'/live/'+image+' 0')
        if os.path.exists(f'/content/{data2}/test/'+folder+'/spoof'):
            for image in os.listdir(f'/content/{data2}/test/'+folder+'/spoof'):
                test_list.append(f'\n/content/{data2}/test/'+folder+'/spoof/'+image+' 1')
    
    # train data2
    for folder in os.listdir(f'/content/{data2}/train'):
        if os.path.exists(f'/content/{data2}/train/'+folder+'/live'):
            for image in os.listdir(f'/content/{data2}/train/'+folder+'/live'):
                train_list.append(f'\n/content/{data2}/train/'+folder+'/live/'+image+' 0')
        if os.path.exists(f'/content/{data2}/train/'+folder+'/spoof'):
            for image in os.listdir(f'/content/{data2}/train/'+folder+'/spoof'):
                train_list.append(f'\n/content/{data2}/train/'+folder+'/spoof/'+image+' 1')

    ### GET FEATURES  ###

    # remove simbol '\n' from first string
    test_list[0] = test_list[0][1::]
    train_list[0] = train_list[0][1::]

    # create files if they dont exists .txt
    with open("/content/test_file_list.txt", "w+") as textfile:
        for line in test_list:
            textfile.write(line)

    with open("/content/train_file_list.txt", "w+") as textfile:
        for line in train_list:
            textfile.write(line)

    # in this foulder will save features
    if not os.path.exists('/content/features'):
        os.mkdir('/content/features')

    # getting features
    # save_features("test_file_list.txt","features/test_feature.npy")
    # save_features("train_file_list.txt","features/train_feature.npy")

    # load features (потом папку заменить просто на features)
    test_npy = np.load('/content/features/test_feature.npy', allow_pickle=True)
    train_npy = np.load('/content/features/train_feature.npy', allow_pickle=True)

    # convert to pandas to preprocess data
    test_features = pd.DataFrame(index=range(test_npy.shape[0]), columns=range(test_npy[0].shape[0]))
    train_features = pd.DataFrame(index=range(train_npy.shape[0]), columns=range(train_npy[0].shape[0]))

    # filter incorrect features
    for i,line in enumerate(test_npy):
        if line.shape == (178,):
            test_features.loc[i] = line
    for i,line in enumerate(train_npy):
        if line.shape == (178,):
            train_features.loc[i] = line

    # drop nan values
    test_features.dropna(inplace=True)
    train_features.dropna(inplace=True)

    # features labels split
    test, test_labels = test_features.loc[:, :176], test_features.loc[:, 177].tolist()
    train, train_labels = train_features.loc[:, :176], train_features.loc[:, 177].tolist()

    # conver to list
    test = test.to_numpy().tolist()
    train = train.to_numpy().tolist()
    
    # train <-> valid split
    train, valid, train_labels, valid_labels = train_test_split(train, train_labels, 
                                                                train_size = 0.8, random_state=44)

    # train test split
    partition = {'test': np.arange(len(test)), 
                 'train': np.arange(len(test), len(test + train)), 
                 'valid': np.arange(len(test + train), len(test + train + valid))}

    # ID <-> image vector(feature)
    ID_list = np.arange(len(test + train + valid))
    image_vectors = test + train + valid
    ID_vectors = dict(zip(ID_list, image_vectors))

    # ID <-> image label
    labels_list = test_labels + train_labels + valid_labels
    ID_labels = dict(zip(ID_list, labels_list))

    return partition, ID_vectors, ID_labels

class MyDataset(torch.utils.data.Dataset):
  def __init__(self, partition, id_vectors, id_labels):
        self.partition = partition
        self.id_vectors = id_vectors
        self.id_labels = id_labels

  def __len__(self):
        return len(self.partition)

  def __getitem__(self, index):
        id = self.partition[index]
        item_vector= self.id_vectors[id]

        X = item_vector
        y = self.id_labels[id]

        return torch.Tensor(X), torch.tensor(y, dtype=torch.long)

#### Create datasets

In [None]:
partition, id_vectors, id_labels = create_partition_and_labels(data1 = 'data128', data2 = 'data256')

training_set = MyDataset(partition['train'], id_vectors, id_labels)
validation_set = MyDataset(partition['valid'], id_vectors, id_labels)
testing_set = MyDataset(partition['test'], id_vectors, id_labels)

### Functions to train, test and log

In [None]:
def trainer(model, train_loader, valid_loader, loss_function, optimizer, scheduler, config):
    """
    (count_of_epoch, batch_size, dataset, model, loss_function, optimizer, lr = 0.001)
    trainer итерируется по кол-ву эпох и вызывает функцию train_epoch
    count_of_epoch - кол-во эпох
    batch_size - размер батча
    dataset - данные для обучения
    model - модель нейронной сети
    loss_function - функция потерь
    optimizer - оптимизатор
    lr - скорость обучения, по умолчанию 0.001
    """
    min_valid_loss = np.inf

    # # in this foulder will save model weights
    if not os.path.exists('/content/model_weights'):
        os.mkdir('/content/model_weights')

    # Tell wandb to watch what the model gets up to: gradients, weights, and more!
    wandb.watch(model, loss_function, log="all", log_freq=10)
    
    for e in range(config.count_of_epoch):
        # train
        epoch_loss = train_epoch(train_generator=train_loader, 
                    model=model, 
                    loss_function=loss_function, 
                    optimizer=optimizer)

        # valid
        valid_loss = 0.0
        model.eval()
        valid_loss = train_epoch(train_generator=valid_loader, 
                    model=model, 
                    loss_function=loss_function, 
                    optimizer=optimizer)
        
        scheduler.step()

        # log things
        trainer_log(epoch_loss, valid_loss, e, scheduler.get_last_lr()[0], min_valid_loss)

        # saving models
        if min_valid_loss > valid_loss:
            print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
            min_valid_loss = valid_loss
            torch.save(model.state_dict(), f'/content/model_weights/saved_model_{e}.pth')
            wandb.log_artifact(f'/content/model_weights/saved_model_{e}.pth', 
                               name=f'saved_model_{e}', type='model')
        print()

def train_epoch(train_generator, model, loss_function, optimizer):
    """
    внутри train_epoch итерируемся по батчам внутри батчгенератора
    train_generator - батчгенератора
    model - модель нейронной сети
    loss_function - функция потерь
    optimizer - оптимизатор
    """
    epoch_loss = 0
    total = 0
    for it, (batch_of_x, batch_of_y) in enumerate(train_generator):
        batch_loss = train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function)
            
        epoch_loss += batch_loss*len(batch_of_x)
        total += len(batch_of_x)
    
    return epoch_loss/total

def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    """
    в train_on_batch обучаемся на одном батче
    model - модель нейронной сети
    x_batch - фичи
    y_batch - таргеты(метки классов)
    optimizer - оптимизатор
    loss_function - функция потерь
    """
    model.train()
    optimizer.zero_grad()
    
    output = model(x_batch.to(device))
    
    loss = loss_function(output, y_batch.to(device))
    loss.backward()

    optimizer.step()
    return loss.cpu().item()

In [None]:
def tester(run, model, test_loader):
    pred = []
    real = [] 
    model.eval()
    for it, (x_batch, y_batch) in enumerate(test_loader):
        x_batch = x_batch.to(device)
        with torch.no_grad():
            output = model(x_batch)

        pred.extend(torch.argmax(output, dim=-1).cpu().numpy().tolist())
        real.extend(y_batch.cpu().numpy().tolist())

    wandb.log({"test_accuracy": accuracy_score(real, pred)})

    print(classification_report(real, pred, zero_division = 0))

    # Save the model in the exchangeable ONNX format
    f_pth = f'/content/model_weights/saved_model_latest.pth'
    f_onnx = f_pth.replace('.pth', '.onnx')
    torch.save(model.state_dict(), f_pth)
    torch.onnx.export(model, 
                      x_batch,
                      f_onnx, 
                      opset_version=11, 
                      input_names=['input'],
                      output_names=['output'],
                      training= torch.onnx.TrainingMode.EVAL,
                      do_constant_folding=False, 
                      dynamic_axes=None)
    
    model_onnx = onnx.load(f_onnx)  # load onnx model
    onnx.checker.check_model(model_onnx) 
    print(model_onnx.ir_version)

    model_onnx, check = onnxsim.simplify(
                        model_onnx,
                        dynamic_input_shape=True,
                        input_shapes={'input': list(x_batch.shape)} )

    onnx.save(model_onnx, f_onnx)

    wandb.save('/content/model_weights/saved_model_latest.onnx')

In [None]:
def trainer_log(train_loss, valid_loss, epoch, lr, min_val_loss):
    wandb.log({'train_loss': train_loss, 'valid_loss': valid_loss,
               'epoch': epoch, 'learning_rate': lr,
               'min_validation_loss': min_val_loss})
    print(f'train loss on {str(epoch).zfill(3)} epoch: {train_loss:.6f} with lr: {lr:.6f}')
    print(f'valid loss on {str(epoch).zfill(3)} epoch: {valid_loss:.6f}')

def make_loader(dataset, batch_size):
    loader = torch.utils.data.DataLoader(dataset=dataset,
                                         batch_size=batch_size, 
                                         shuffle=True,
                                         pin_memory=True, num_workers=2)
    return loader

def download_folder_in_zip(dir_to_zip, output_filename, delete_dir_after_download=False):
    os.system( "zip -r {} {}".format(output_filename, dir_to_zip))
    if delete_dir_after_download:
        os.system( "rm -r {}".format(dir_to_zip))
    files.download(output_filename)

### Model pipeline

In [None]:
def pipeline(hyperparameters, saved_model=None, to_train=True, to_test=True):

    with wandb.init(project=hyperparameters['project'], config=hyperparameters) as run:
      config = wandb.config

      # build the model
      model = build_model(run, config, saved_model)

      # make the data and optimization 
      train_loader, valid_loader, test_loader, criterion, optimizer, scheduler = make(model, config)

      print('config:', '\n', config, '\n', model, '\n', 'running on device:', device, '\n')

      if to_train:
        trainer(model, train_loader, valid_loader, criterion, optimizer, scheduler, config)

      if to_test:
        tester(run, model, test_loader)
    return model

def build_model(run, config, saved_model=None):
    IN, H1, H2, H3, H4, H5, OUT = 177, 512, 256, 128, 64, 32, 2
    p = config.dropout

    model =  nn.Sequential(
    nn.Linear(IN, H1), nn.Dropout(p), nn.BatchNorm1d(H1), nn.ReLU(),        
    nn.Linear(H1, H2), nn.Dropout(p), nn.BatchNorm1d(H2), nn.ReLU(), 
    nn.Linear(H2, H3), nn.Dropout(p), nn.BatchNorm1d(H3), nn.ReLU(), 
    nn.Linear(H3, H4), nn.Dropout(p), nn.BatchNorm1d(H4), nn.ReLU(), 
    nn.Linear(H4, H5), nn.Dropout(p), nn.BatchNorm1d(H5), nn.ReLU(), 
    nn.Linear(H5, OUT), nn.Dropout(p), nn.BatchNorm1d(OUT), nn.ReLU()) 

    if saved_model == None:
        # just init with some weights
        def init_weights(m):
            if type(m) == nn.Linear:
                torch.nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
        model.apply(init_weights)

    else:
        # download weights of saved model 
        artifact = run.use_artifact(f'lanit-summer2022-antispoofing/{config.project}/{saved_model[0]}:v{saved_model[1]}', type='model')
        artifact_dir = artifact.download()
        model_path = os.path.join(artifact_dir, f'{saved_model[0]}.pth')
        #
        model_path = '/content/model_weights/saved_model_latest.onnx'
        #
        model.load_state_dict(torch.load(model_path, map_location=torch.device(device)))

    model = model.to(device)

    return model

def make(model, config):
    train, valid, test = training_set, validation_set, testing_set
    train_loader = make_loader(train, batch_size=config.batch_size)
    valid_loader = make_loader(valid, batch_size=config.batch_size)
    test_loader = make_loader(test, batch_size=config.batch_size)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=config.lr)
    scheduler = StepLR(optimizer, config.step_size, config.step_gamma)
    
    return train_loader, valid_loader, test_loader, criterion, optimizer, scheduler

### Running

In [None]:
config = dict(count_of_epoch=1, batch_size=1024, lr=9, 
              dropout=0.1, critirion='CrossEntropyLoss', 
              optimizer='SGD', scheduler='StepLR', 
              step_size = 5, step_gamma = 0.1,
              project='experiments', name_of_model='mlp', data='128+256')

# to train and test model
model = pipeline(config, to_train=True, to_test=True)

config: 
 {'count_of_epoch': 1, 'batch_size': 1024, 'lr': 9, 'dropout': 0.1, 'critirion': 'CrossEntropyLoss', 'optimizer': 'SGD', 'scheduler': 'StepLR', 'step_size': 5, 'step_gamma': 0.1, 'project': 'experiments', 'name_of_model': 'mlp', 'data': '128+256'} 
 Sequential(
  (0): Linear(in_features=177, out_features=512, bias=True)
  (1): Dropout(p=0.1, inplace=False)
  (2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (3): ReLU()
  (4): Linear(in_features=512, out_features=256, bias=True)
  (5): Dropout(p=0.1, inplace=False)
  (6): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (7): ReLU()
  (8): Linear(in_features=256, out_features=128, bias=True)
  (9): Dropout(p=0.1, inplace=False)
  (10): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (11): ReLU()
  (12): Linear(in_features=128, out_features=64, bias=True)
  (13): Dropout(p=0.1, inplace=False)
  (14): BatchNorm1d(64, eps=1e-0



VBox(children=(Label(value='1.041 MB of 2.073 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.502095…

0,1
epoch,▁
learning_rate,▁
test_accuracy,▁
train_loss,▁
valid_loss,▁

0,1
epoch,0.0
learning_rate,9.0
min_validation_loss,inf
test_accuracy,0.75549
train_loss,0.53152
valid_loss,0.49874
