In [1]:
import collections
import numpy as np
import pickle
import time
import librosa
import os
import torch
import torch.nn as nn
from torch.cuda.amp.autocast_mode import autocast
from torch.cuda.amp.grad_scaler import GradScaler
from PIL import Image

  example_input = torch.tensor([[-3, -2, -1], [0, 1, 2]])


In [2]:
dpack = 'xgroup4'

with open(dpack, 'rb') as f:
    audio_data = pickle.load(f)

counts = collections.Counter([x[1] for x in audio_data])
total = np.sum([x for x in counts.values()])
audio_data = [(torch.tensor([(np.float32(x[0]) - np.mean(x[0])) / 80], dtype=torch.float32), torch.tensor(x[1], dtype=torch.int64)) for x in audio_data]
lblwgts = [counts[x] / total for x in range(19)]
lblwgts = [((((1/19) / x) + 1/19) / 2) + 0.3678 for x in lblwgts]

In [2]:

class ReStep(nn.Module):

    def __init__(self, ftrs, up_ftrs):
        super(ReStep, self).__init__()
        self.icnv = nn.Conv2d(ftrs, ftrs, 3, 1, 1)
        self.idcnv = nn.Conv2d(ftrs, ftrs, 1)
        self.irescal = nn.Conv2d(ftrs, ftrs, 1, 2)
        self.iupscal = nn.Conv2d(ftrs, up_ftrs, 3, 1, 1)
        self.inrm = nn.BatchNorm2d(ftrs)
        self.imxpool = nn.MaxPool2d(2, 2)
        self.iact = nn.ELU()
        self.drop = nn.Dropout(0.16)

    def forward(self, x):
        res = x
        x = self.iact(self.inrm(self.icnv(x)))
        x = self.iact(self.inrm(self.icnv(x)))
        x = self.imxpool(x)
        #x = self.drop(x)
        x = self.iact(torch.add(x, self.inrm(self.irescal(res))))
        res = x
        x = self.iact(self.icnv(x))
        x = self.iact(self.idcnv(x))
        x = self.iupscal(self.inrm(torch.add(x, res)))
        return x

class FeatureExtractor(nn.Module):

    def __init__(self, step_1_ftrs: int, step_2_ftrs: int, step_3_ftrs: int, step_4_ftrs: int):
        torch.manual_seed(1024)
        torch.set_default_tensor_type(torch.cuda.FloatTensor)
        super(FeatureExtractor, self).__init__()
        self.final_ftrs = step_4_ftrs
        self.ecnv1 = nn.Conv2d(1, step_1_ftrs, 3, 1, 1)
        self.ecnv2 = nn.Conv2d(step_1_ftrs, step_1_ftrs, 3, 1, 1)
        self.fcnv = nn.Conv2d(self.final_ftrs, self.final_ftrs, 3, 1, 1)
        self.act = nn.ELU()
        self.nrm = nn.BatchNorm2d(self.final_ftrs)
        self.fmxpool = nn.AdaptiveAvgPool2d(4)
        self.drop = nn.Dropout(0.32)
        self.flin = nn.Linear(self.final_ftrs * 16, self.final_ftrs * 2)
        self.flin2 = nn.Linear(self.final_ftrs * 2, 19)
        self.resloop = [ReStep(step_1_ftrs, step_2_ftrs), ReStep(step_2_ftrs, step_3_ftrs), ReStep(step_3_ftrs, step_4_ftrs), ReStep(step_4_ftrs, step_4_ftrs)]

    def forward(self, x):
        x = self.act(self.ecnv1(x))
        x = self.act(self.ecnv2(x))
        for loop in self.resloop:
            x = loop(x)
        x = self.act(self.nrm(self.fcnv(x)))
        x = self.fmxpool(x)
        x = self.drop(x)
        x = x.view(-1, self.final_ftrs * 16)
        x = self.flin2(self.act(self.flin(x)))
        return x

class Trainer:

    def __init__(self, model: nn.Module, optimizer: torch.optim, loss_function: nn.CrossEntropyLoss, learning_rate: float, weight_decay: float, label_weights):
        self.gpu = torch.device('cuda')
        self.model = model.cuda()
        self.optimizer = optimizer(av_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        #self.loss_function = loss_function(weight=torch.tensor(label_weights, dtype=torch.float32))
        self.loss_function = loss_function()
        self.scaler = GradScaler(enabled=True)

    def _split_data(self, adata, ratio, splits):
        np.random.shuffle(adata)
        vdata = []
        for x in counts.items():
            gcounter = 0
            i = 0
            while gcounter < int(x[1] * ratio):
                if adata[i][1] != x[0]:
                    vdata.append(adata.pop(i))
                    gcounter += 1
                i += 1
        adata = [adata[(x - 1) * int(len(adata) / splits) : x * int(len(adata) / splits)] for x in range(1, splits + 1)]
        adata = [[x, vdata] for x in adata]
        return adata

    def _train(self, dataset):
        total_loss = 0
        for x in dataset:
            with autocast(enabled=True):
                data, labels = x[0].to(self.gpu), x[1].to(self.gpu)
                results = self.model(data)
                loss = self.loss_function(results, labels)
            self.scaler.scale(loss).backward()
            self.scaler.step(self.optimizer)
            self.scaler.update()
            total_loss += loss.item()
            self.optimizer.zero_grad()
        return total_loss / len(dataset)

    def _validate(self, dataset):
        total_loss = 0
        accuracy = 0
        with torch.no_grad():
            for x in dataset:
                correct = 0
                data, labels = x[0].to(self.gpu), x[1].to(self.gpu)
                results = self.model(data)
                for i, x in enumerate([x.argmax() for x in results]):
                    if x == labels[i]:
                        correct += 1
                accuracy += correct / 64
                total_loss += self.loss_function(results, labels).item()
        return total_loss / len(dataset), accuracy / len(dataset)

    def train_model(self, dataset, target_epochs, ratio, model_name, val, splits):
        self.cycle = round(time.time())
        best_loss = 1.35
        samples = len(dataset)
        dataset = self._split_data(dataset, ratio, splits)
        for _ in range(target_epochs):
            for i, group in enumerate(dataset):
                self.model.train(True)
                np.random.shuffle(group[0])
                tdata = iter(torch.utils.data.DataLoader(group[0], batch_size=64))
                train_loss = self._train(tdata)
                if not val:
                    print(f'Status at batch {int((i+1) * int(samples * (1 - ratio)) / splits / 64)},\tTLoss: {round(float(train_loss), 5)} TAccuracy {round(2.7182818**-float(train_loss) * 100, 5)}%')
                if val:
                    self.model.train(False)
                    np.random.shuffle(group[1])
                    vdata = iter(torch.utils.data.DataLoader(group[1], batch_size=64))
                    val_loss, val_acc = self._validate(vdata)
                    print(f'Status at batch {int((i+1) * int(samples * (1 - ratio)) / splits / 64)},\tTLoss: {round(float(train_loss), 5)}\tTAccuracy {round(2.7182818**-float(train_loss) * 100, 5)}%\tVLoss: {round(float(val_loss), 5)}\tVAccuracy {round(float(val_acc) * 100, 5)}%')
                    if val_loss < best_loss:
                        best_loss = val_loss
                        torch.save(self.model.state_dict(), f'models/{model_name}')
        self.model = self.model.cpu()
        del dataset
        torch.cuda.empty_cache()


In [3]:
av_model = FeatureExtractor(32, 64, 128, 256)
av_model.load_state_dict(torch.load('models/mscls_03', map_location="cuda"))
#print(np.sum(sum(x.numel() for x in av_model.parameters() if x.requires_grad)))


#trainer = Trainer(av_model, torch.optim.NAdam, nn.CrossEntropyLoss, 3e-4, 1e-5, lblwgts)

<All keys matched successfully>

In [5]:
trainer.train_model(audio_data, 3, 0.12, 'mscls_034', True, 2)

Status at batch 466,	TLoss: 1.10919	TAccuracy 32.98271%	VLoss: 1.00775	VAccuracy 65.73486%
Status at batch 933,	TLoss: 1.09787	TAccuracy 33.35821%	VLoss: 1.0034	VAccuracy 65.42969%
Status at batch 466,	TLoss: 0.98737	TAccuracy 37.25546%	VLoss: 0.99709	VAccuracy 66.28418%
Status at batch 933,	TLoss: 0.98702	TAccuracy 37.26867%	VLoss: 1.00539	VAccuracy 66.18652%
Status at batch 466,	TLoss: 0.90375	TAccuracy 40.50479%	VLoss: 0.99288	VAccuracy 66.57715%
Status at batch 933,	TLoss: 0.91596	TAccuracy 40.01337%	VLoss: 1.00781	VAccuracy 65.55176%


In [8]:
#torch.save(av_model.state_dict(), 'models/mscls_033')

In [4]:

def predict(filename: str, model: nn.Module, fmax, samples=4):
    with open(f'{filename}', 'rb') as f:
        sraw, sr = librosa.load(f, sr=None)
    scale = sr / 22050
    model.cuda()
    msgram = librosa.power_to_db(librosa.feature.melspectrogram(S=np.abs(librosa.stft(sraw, n_fft=int(2048 * scale)))**2, sr=sr, fmax=fmax), ref=np.max)
    portions = []
    if msgram.shape[1] < 128:
        msgram = np.float32(Image.fromarray(np.uint8(msgram)).resize((128, 128)))
    for i in range(round(msgram.shape[1] * 0.1),  round(msgram.shape[1] *0.9) - 5, round((msgram.shape[1] * 0.8) / samples)):
        portions.append((np.uint8(np.abs(msgram[:, i:i+128])) * 2))
    portions = [torch.tensor([[(x - np.mean(x)) / 80]], dtype=torch.float32) for x in portions]
    results = []
    with torch.no_grad():
        for x in portions:
            x.to('cuda:0')
            results.append(torch.nn.functional.softmax(*model(x), dim=0).tolist())
    model.cpu()
    results = [x for x in results if np.max(x) > np.max(results) * 0.666]
    scores = [round(sum(x), 4) for x in  np.array(results).T]
    guess = scores.index(np.max(scores))
    return guess

def group_predict(adata, model):
    predictions = []
    with torch.no_grad():
        for i, dset in enumerate(adata):
            portions = [torch.tensor([[(np.float32(y) - np.mean(y)) / 80]], dtype=torch.float32) for y in dset[0]]
            results = []
            for z in portions:
                z.to('cuda:0')
                results.append(torch.nn.functional.softmax(*model(z), dim=0).tolist())
            #results = [w for w in results if np.max(w) > np.max(results) * 0.3678]
            scores = [round(sum(w), 4) for w in  np.array(results).T]
            predictions.append((dset[1], scores.index(np.max(scores))))
            if i % 512 == 0 and i != 0:
                print(i)
    model.cpu()
    return predictions


In [None]:
predictions = [predict(f'testing/{x}', av_model, 4096, 5) for x in list(os.listdir(r'C:\Users\BBA\Coding\Audio\Classification\testing'))]
real_l = [14, 14, 3, 3, 3, 6, 6, 7, 7, 0, 0, 0, 10, 10, 10, 2, 2, 2, 8, 8, 8, 5, 5, 5, 4, 4, 4, 1, 1, 1, 9, 9, 12, 12, 15, 15, 16, 16, 17, 17, 13, 13, 11, 11, 18, 18]
real_g = 0
for i, x in enumerate(predictions):
    if x == real_l[i]:
        real_g += 1
    print(real_l[i], x)
print(real_g)

In [5]:
dpack = 'tgroup1'
with open(dpack, 'rb') as f:
    audio_data = pickle.load(f)

predictions = group_predict(audio_data, av_model)

512
1024
1536
2048
2560
3072
3584
4096
4608


In [None]:
real_l = [14, 14, 3, 3, 3, 6, 6, 7, 7, 0, 0, 0, 10, 10, 10, 2, 2, 2, 8, 8, 8, 5, 5, 5, 4, 4, 4, 1, 1, 1, 9, 9, 12, 12, 15, 15, 16, 16, 17, 17, 13, 13, 11, 11, 18, 18]
score = 0
for i, x in enumerate(predictions):
    print(x, real_l[i])
    if x[1] == real_l[i]:
        score += 1
print(score)

In [6]:
with open('bba_preds.csv', 'wt') as f:
    f.writelines([f'{x[0].lstrip("0").rstrip(".ogg")},{x[1]}\n' for x in predictions])