In [None]:
import warnings
import os
from IPython.display import Audio, display
import numpy as np
import librosa
from librosa.display import specshow
import pandas as pd
import matplotlib.pyplot as plt
from scipy.fftpack import fft, dct, ifft
from sklearn.model_selection import train_test_split
from scipy.stats import norm
from scipy.special import logsumexp
from collections import defaultdict, namedtuple
import random

import torch
import torch.nn as nn

from torch.utils.data import Dataset
from torchvision.datasets.folder import default_loader
from torch.utils.data import DataLoader

from tensorboardX import SummaryWriter

# warnings.simplefilter('error')
%matplotlib inline


np.random.seed(123)

In [None]:
# --- Data -------------------------------------------------------------------
DATA_PATH = './wav/'    # dataset root, walked as <DATA_PATH>/<user>/<video>/<file>
SAMPLE_RATE = 16000     # sampling rate (Hz) every recording is loaded at
NUM_FRAMES = 63000      # length, in audio samples, of each segment cut from a recording
TRAIN_SPLIT = 0.8       # fraction of each user's videos assigned to the training set

# --- Model / optimisation ---------------------------------------------------
INPUT_SIZE = 39         # features per frame: (12 MFCCs + energy) x (static, delta, delta-delta)
NUM_LAYERS = 1          # number of stacked LSTM layers
BATCH_SIZE = 32
N_EPOCHS = 1

# --- Reproducibility --------------------------------------------------------
# Seed every RNG the notebook relies on: `random` drives the train/test split
# and pair sampling, `torch` drives weight initialisation and DataLoader
# shuffling (numpy is seeded in the import cell).
SEED = 123
random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
def extract_mfcc(sound, sampling_rate=SAMPLE_RATE, shift=32., L=128., mel_coefs=120, mfcc_coefs=12, alpha=0.9, eps=1e-9):
    """Convert a waveform into a per-frame feature matrix.

    The features are the MFCCs, one extra "energy" row (sum of squared MFCCs
    per frame), and the first- and second-order deltas of those rows.

    Args:
        sound: 1-D audio signal passed to ``librosa.feature.mfcc``.
        sampling_rate: sampling rate of ``sound``.
        mfcc_coefs: number of MFCCs to compute.
        shift, L, mel_coefs, alpha, eps: accepted for interface compatibility
            but not used by this implementation.

    Returns:
        ``float32`` array of shape ``(frames, 3 * (mfcc_coefs + 1))``.
    """
    coefs = librosa.feature.mfcc(y=sound, sr=sampling_rate, n_mfcc=mfcc_coefs)

    # Append the per-frame energy as one additional feature row.
    energy = np.sum(coefs ** 2, axis=0).reshape(1, -1)
    base = np.vstack((coefs, energy))

    # First- and second-order deltas over a 3-frame window.
    deltas = [librosa.feature.delta(base, order=k, width=3) for k in (1, 2)]

    # Stack as (features, frames), then transpose to (frames, features).
    stacked = np.vstack([base] + deltas)
    return stacked.T.astype(np.float32)


def show(audio):
    """Wrap ``audio`` in an IPython audio player that plays at ``SAMPLE_RATE``."""
    player = Audio(data=audio, rate=SAMPLE_RATE)
    return player

In [None]:
def get_data(data_path=DATA_PATH):
    """Load every recording under ``data_path`` and split it into train/test features.

    The directory layout is ``<data_path>/<user>/<video>/<file>``. For each
    user the videos are shuffled and the first ``TRAIN_SPLIT`` fraction goes to
    the training set, the rest to the test set, so segments of one video never
    end up on both sides. Each recording is cut into non-overlapping segments
    of ``NUM_FRAMES`` samples (a trailing remainder is dropped) and every
    segment is converted with ``extract_mfcc``.

    Args:
        data_path: root directory of the dataset.

    Returns:
        ``(train_data, test_data)``: two dicts mapping an integer user index to
        the list of feature matrices of that user's segments.
    """
    train_data = {}
    test_data = {}

    # os.listdir returns entries in arbitrary order, so listings are sorted:
    # otherwise the seeded shuffle below, and the index given to each user,
    # would differ between machines and runs. Non-directory entries (e.g.
    # stray metadata files) are skipped instead of crashing the nested listdir.
    user_ids = sorted(
        name for name in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, name))
    )
    for uid, user_id in enumerate(user_ids):
        user_dir = os.path.join(data_path, user_id)
        videos = sorted(
            name for name in os.listdir(user_dir)
            if os.path.isdir(os.path.join(user_dir, name))
        )
        random.shuffle(videos)
        split = round(TRAIN_SPLIT * len(videos))
        train_data[uid] = []
        test_data[uid] = []
        for vid, video_id in enumerate(videos):
            # The whole video goes to one side of the split.
            target = train_data[uid] if vid < split else test_data[uid]
            video_dir = os.path.join(user_dir, video_id)
            for file in sorted(os.listdir(video_dir)):
                full_path = os.path.join(video_dir, file)
                record = librosa.load(full_path, sr=SAMPLE_RATE)[0]
                for start in range(0, len(record) - NUM_FRAMES + 1, NUM_FRAMES):
                    target.append(extract_mfcc(record[start:start + NUM_FRAMES]))
    return train_data, test_data

In [None]:
class Records(Dataset):
    """Dataset of record pairs labelled same-user (1.0) or different-user (0.0).

    For every record of every user, up to ``pos`` positive pairs (another
    record of the same user) and up to ``neg`` negative pairs (a record of a
    different user) are sampled once, at construction time, with the ``random``
    module.

    Args:
        records: mapping ``user -> list of records``. Keys may be any hashable
            values; they do not have to be ``0..n-1``.
        pos: positive pairs sampled per record. None are generated for a user
            with fewer than two records, since no distinct partner exists.
        neg: negative pairs sampled per record. None are generated when no
            other user has any record.
    """

    def __init__(self, records, pos=5, neg=5):
        self.indices = []
        self.target = []
        self.data = records

        # Only users that actually have records can act as a negative partner.
        non_empty_users = [user for user, recs in records.items() if len(recs) > 0]

        for u, data in records.items():
            other_users = [user for user in non_empty_users if user != u]
            for i in range(len(data)):
                # Rejection sampling would spin forever on a single-record
                # user, so draw directly from the len(data) - 1 other indices.
                if len(data) > 1:
                    for _ in range(pos):
                        pos_x = random.randrange(len(data) - 1)
                        if pos_x >= i:
                            pos_x += 1  # skip over the anchor's own index
                        self.indices.append((u, i, u, pos_x))
                        self.target.append(1.0)
                # Likewise, with no other non-empty user there is nothing to
                # sample from, so negatives are skipped instead of looping.
                if other_users:
                    for _ in range(neg):
                        neg_u = random.choice(other_users)
                        neg_x = random.randrange(len(records[neg_u]))
                        self.indices.append((u, i, neg_u, neg_x))
                        self.target.append(0.0)

    def __getitem__(self, index):
        """Return ``((anchor_record, partner_record), label)`` for pair ``index``."""
        u, x, other_u, other_x = self.indices[index]
        return (self.data[u][x], self.data[other_u][other_x]), self.target[index]

    def __len__(self):
        """Return the number of sampled pairs."""
        return len(self.indices)

In [None]:
# Extract features once, then wrap each split in a pair dataset and a batched loader.
train_raw, test_raw = get_data()

train_data = Records(train_raw)
train_loader = DataLoader(
    dataset=train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,   # reshuffle the training pairs every epoch
    num_workers=4,
)

test_data = Records(test_raw)
test_loader = DataLoader(
    dataset=test_data,
    batch_size=BATCH_SIZE,
    shuffle=False,  # keep evaluation order fixed
    num_workers=4,
)

In [None]:
class SimpleModel(nn.Module):
    """Siamese LSTM that scores whether two feature sequences match.

    Both inputs are encoded by the same LSTM; the embedding of each input is
    the LSTM output at its last time step. The cosine similarity of the two
    embeddings is passed through a learned 1-D affine layer to produce a logit.

    Args:
        input_size: number of features per frame.
        num_layers: number of stacked LSTM layers.
        lstm_units: LSTM hidden size, i.e. the embedding dimension.
    """

    def __init__(self, input_size=INPUT_SIZE, num_layers=NUM_LAYERS, lstm_units=128):
        # super(self.__class__, self) recurses infinitely once this class is
        # subclassed; the zero-argument form is always correct.
        super().__init__()

        # batch_first=True is required: inputs arrive as (batch, frames, features).
        # With the default layout the LSTM would recur over the batch axis and
        # mix information between unrelated samples.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=lstm_units,
            num_layers=num_layers,
            batch_first=True,
        )
        # Learned scale and bias that turn a cosine similarity into a logit.
        self.sim = nn.Linear(in_features=1, out_features=1)

    def forward(self, x, y):
        """Return one similarity logit per pair.

        Args:
            x, y: tensors of size (batch, frames, input_size).

        Returns:
            Tensor of size (batch,) holding unnormalised logits.
        """
        x_h, _ = self.lstm(x)
        y_h, _ = self.lstm(y)

        # Output at the last time step of each sequence: (batch, lstm_units).
        x_emb = x_h[:, -1]
        y_emb = y_h[:, -1]

        cosine = nn.functional.cosine_similarity(x_emb, y_emb)  # (batch,)
        return self.sim(cosine.unsqueeze(1)).squeeze(1)

In [None]:
def _batch_loss(model, criterion, records, labels, device):
    """Move one batch of record pairs to ``device`` and return its loss tensor."""
    records_a, records_b = records
    records_a = records_a.to(device)
    records_b = records_b.to(device)
    labels = labels.to(device).float()
    return criterion(model(records_a, records_b), labels)


def train(model, optimizer, criterion, train_loader, test_loader, device, n_epochs=N_EPOCHS, log_dir='./logs'):
    """Train ``model`` and evaluate it after every epoch.

    The mean per-batch train and test losses of each epoch are logged to
    TensorBoard under ``simple/train`` and ``simple/test``.

    Args:
        model: module called as ``model(records_a, records_b)``.
        optimizer: optimizer over the model parameters.
        criterion: loss called as ``criterion(outputs, labels)``.
        train_loader: iterable of ``((records_a, records_b), labels)`` batches
            used for optimisation.
        test_loader: iterable of batches in the same format, used for
            evaluation only.
        device: device the batches are moved to.
        n_epochs: number of passes over ``train_loader``.
        log_dir: directory the TensorBoard event files are written to.

    Returns:
        ``(train_losses, test_losses)``: lists with one mean loss per epoch.
        An entry is NaN if the corresponding loader produced no batches.
    """
    writer = SummaryWriter(log_dir)
    train_losses = []
    test_losses = []
    try:
        for epoch in range(n_epochs):
            model.train()
            train_loss = 0.0
            n_batches = 0
            for records, labels in train_loader:
                loss = _batch_loss(model, criterion, records, labels, device)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                n_batches += 1
            train_loss = train_loss / n_batches if n_batches else float('nan')

            model.eval()
            test_loss = 0.0
            n_batches = 0
            with torch.no_grad():
                for records, labels in test_loader:
                    test_loss += _batch_loss(model, criterion, records, labels, device).item()
                    n_batches += 1
            # Average the *test* loss over the number of *test* batches.
            test_loss = test_loss / n_batches if n_batches else float('nan')

            writer.add_scalar('simple/train', train_loss, epoch)
            writer.add_scalar('simple/test', test_loss, epoch)
            train_losses.append(train_loss)
            test_losses.append(test_loss)
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()
    return train_losses, test_losses

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = SimpleModel()
model = model.to(device)

# The model outputs raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
# Use the configured epoch count rather than a literal, so the config cell stays the single place to tune it.
train(model, optimizer, criterion, train_loader, test_loader, device, n_epochs=N_EPOCHS)