In [53]:
import torch, os, sys, json, librosa
import numpy as np
import pandas as pd
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from pathlib import Path
from tqdm import tqdm
from matplotlib import pyplot as plt
from itertools import islice

from torch.utils.data import Dataset, DataLoader, Subset
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
import torch.nn as nn


mfcc_transform = T.MFCC(
    sample_rate=22050,
    n_mfcc=39,
    melkwargs={
      'n_fft': 2048,
      'n_mels': 256,
      'hop_length': 512,
      'mel_scale': 'htk',
    }
)
from sklearn.model_selection import train_test_split

In [54]:
data = []
json_folder = '/home/mayank/MTP/begin_again/Error-Driven-ASR-Personalization/MCV_accent/jsons/'
jsons = [f.name for f in os.scandir(json_folder) if 'json' in f.name and f.name.split('.')[0] not in ['unlabelled', 'other']]
counts = [sum(1 for line in open('jsons/'+f)) for f in jsons]
labels = [f.split('.json')[0] for f in jsons]
labels2id = dict([(y,x) for x, y in enumerate(labels)])
id2labels = dict([(x,y) for x, y in enumerate(labels)])
num_classes = len(labels)
print(labels2id)
print(counts)

{'philippines': 0, 'wales': 1, 'scotland': 2, 'hongkong': 3, 'malaysia': 4, 'indian': 5, 'australia': 6, 'us': 7, 'southatlandtic': 8, 'england': 9, 'canada': 10, 'ireland': 11, 'newzealand': 12, 'bermuda': 13, 'singapore': 14, 'african': 15}
[3217, 452, 12580, 1613, 1057, 59797, 31943, 224790, 33, 64328, 34529, 6262, 4080, 248, 2556, 4197]


In [51]:
class accentDataset(Dataset):
    
    def __init__(self, path):
        
        jsons = [f.name for f in os.scandir(json_folder) if 'json' in f.name and f.name.split('.')[0] not in ['unlabelled', 'other']]
        features, labels = [], []
        
        for accent in jsons:
            print("loading", accent)
            json_path = json_folder + accent
            json_file = open(json_path)
#             json_item_list = [line for line in json_file]
            json_item_list = list(islice(json_file, 1000))
            print(len(json_item_list), "samples")
            json_item_list = [json.loads(line.strip()) for line in json_item_list]
            
            for sample in tqdm(json_item_list):
                if librosa.get_duration(filename=sample['audio_filepath']) > 30:
                    continue
                try:
                    waveform, sample_rate = torchaudio.load(sample['audio_filepath'])
                    features.append(mfcc_transform(waveform).mean(2).detach().numpy())
                    labels.append(sample['accent'])
                except Exception as e:
                    print(str(sample['audio_filepath']), e)
                    pass
            print("finished")
        self.X = np.concatenate(features, axis=0)
#         self.Y = np.eye(num_classes, dtype='uint8')[labels]
        self.Y = np.array([labels2id[sample] for sample in labels])
        
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return [self.X[idx], self.Y[idx]]


In [None]:
dataset = accentDataset(json_folder)
np.save('dataset.pkl', dataset, allow_pickle =False)

loading philippines.json
1000 samples


100%|██████████| 1000/1000 [09:48<00:00,  1.70it/s]


finished
loading wales.json
452 samples


100%|██████████| 452/452 [04:26<00:00,  1.70it/s]


finished
loading scotland.json
1000 samples


100%|██████████| 1000/1000 [09:47<00:00,  1.70it/s]


finished
loading hongkong.json
1000 samples


100%|██████████| 1000/1000 [09:53<00:00,  1.69it/s]


finished
loading malaysia.json
1000 samples


100%|██████████| 1000/1000 [09:51<00:00,  1.69it/s]


finished
loading indian.json
1000 samples


100%|██████████| 1000/1000 [09:54<00:00,  1.68it/s]


finished
loading australia.json
1000 samples


100%|██████████| 1000/1000 [09:58<00:00,  1.67it/s]


finished
loading us.json
1000 samples


100%|██████████| 1000/1000 [09:59<00:00,  1.67it/s]


finished
loading southatlandtic.json
33 samples


100%|██████████| 33/33 [00:19<00:00,  1.67it/s]


finished
loading england.json
1000 samples


 66%|██████▌   | 655/1000 [06:29<03:19,  1.73it/s]

In [None]:
TEST_SIZE = 0.3
BATCH_SIZE = 64
SEED = 42

dataset = np.load('dataset.pkl')
# generate indices: instead of the actual data we pass in integers instead
train_indices, test_indices, _, _ = train_test_split(
    range(len(dataset)),
    dataset.Y,
    stratify=dataset.Y,
    test_size=TEST_SIZE,
    random_state=SEED
)

# generate subset based on indices
train_split = Subset(dataset, train_indices)
test_split = Subset(dataset, test_indices)

# create batches
train_batches = DataLoader(train_split, batch_size=BATCH_SIZE, shuffle=True)
test_batches = DataLoader(test_split, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
print("dataset type:{}, size:{}".format(type(dataset), len(dataset)))
print("train_split type:{}, size:{}".format(type(train_split), len(train_split)))
print("test_split type:{}, size:{}".format(type(test_split), len(test_split)))

In [None]:
num_classes = len(labels)
feature_size = 39

class classifierHead(nn.Module):

    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(feature_size, feature_size)
        self.dropout = nn.Dropout(0.2)
        self.out_proj = nn.Linear(feature_size, num_classes)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

In [None]:
learning_rate, epochs = 0.001, 50
model = classifierHead()
print(model)
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
print(device)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr = learning_rate)

In [None]:
%%time
# keeping-track-of-losses 
train_losses = []
valid_losses = []

for epoch in range(1, epochs + 1):
    # keep-track-of-training-and-validation-loss
    train_loss = 0.0
    valid_loss = 0.0
    
    # training-the-model
    model.train()
    for data, target in train_batches:
        # move-tensors-to-GPU 
        data = data.to(device)
        target = target.to(device)
        
        # clear-the-gradients-of-all-optimized-variables
        optimizer.zero_grad()
        # forward-pass: compute-predicted-outputs-by-passing-inputs-to-the-model
        output = model(data)
        # calculate-the-batch-loss
        loss = criterion(output, target)
        # backward-pass: compute-gradient-of-the-loss-wrt-model-parameters
        loss.backward()
        # perform-a-ingle-optimization-step (parameter-update)
        optimizer.step()
        # update-training-loss
        train_loss += loss.item() * data.size(0)
        
    # validate-the-model
    model.eval()
    for data, target in test_batches:
        
        data = data.to(device)
        target = target.to(device)
        
        output = model(data)
        
        loss = criterion(output, target)
        
        # update-average-validation-loss 
        valid_loss += loss.item() * data.size(0)
    
    # calculate-average-losses
    train_loss = train_loss/len(train_batches.sampler)
    valid_loss = valid_loss/len(test_batches.sampler)
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
        
    # print-training/validation-statistics 
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch, train_loss, valid_loss))

In [None]:
# test-the-model
model.eval()  # it-disables-dropout
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_batches:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
          
    print('Test Accuracy of the model: {:.2f} %'.format(100 * correct / total))

# Save 
torch.save(model.state_dict(), 'model.ckpt')

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

plt.plot(train_losses, label='Training loss')
plt.plot(valid_losses, label='Validation loss')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(frameon=False)