In [18]:
import os
import random
import pathlib
import time
from array import array

import numpy as np
import pandas as pd
import scipy
import scipy.fft


import librosa
import librosa.display

import IPython.display as ipd

from matplotlib import pyplot as plt
import seaborn as sns

#import sqlite3
import json

import torch as torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from tqdm.notebook import tqdm

from datasets import load_dataset, load_metric

In [10]:
# Report the CUDA environment. The device-specific queries below raise on a
# machine without a usable GPU, so they only run when CUDA is available; this
# keeps the notebook runnable with the CPU fallback chosen for `device`.
print('torch.cuda.is_available()', torch.cuda.is_available())
print('torch.cuda.device_count()', torch.cuda.device_count())
if torch.cuda.is_available():
    print('torch.cuda.current_device()', torch.cuda.current_device())
    print('torch.cuda.device(0)', torch.cuda.device(0))
    print('torch.cuda.get_device_name(0)', torch.cuda.get_device_name(0))
    print('torch.cuda.get_device_capability()', torch.cuda.get_device_capability())
print('torch.cuda.is_initialized()', torch.cuda.is_initialized())

torch.cuda.is_available() True
torch.cuda.device_count() 1
torch.cuda.current_device() 0
torch.cuda.device(0) <torch.cuda.device object at 0x000001BB0F0B4200>
torch.cuda.get_device_name(0) NVIDIA GeForce RTX 4070 Ti
torch.cuda.get_device_capability() (8, 9)
torch.cuda.is_initialized() True


In [11]:
# Smoke test: allocate a tensor on the GPU when there is one, otherwise on the
# CPU (a bare `.cuda()` call raises on machines without CUDA).
torch.zeros(1).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

tensor([0.], device='cuda:0')

In [12]:
# Load the "ks" configuration of the "superb" dataset and expose its three splits.
# Note: these three names are rebound further down to SoundDataset wrappers
# built from the same splits.
dataset = load_dataset("superb", "ks")

train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [7]:
# Spectrogram settings shared by every dataset split.
FRAME_LENGTH = 2048               # STFT window size (passed as n_fft), in samples
HOP_LENGTH = FRAME_LENGTH // 4    # 512 samples between successive STFT frames
TIME_CUT_SIZE = 4 * 44            # 176 frames: about 4 seconds at roughly 44 frames per second

In [8]:
class ConvModel(torch.nn.Module):
  """Three-stage CNN classifier over single-channel spectrogram images.

  Each stage is an unpadded 3x3 convolution, a ReLU and a 2x2 max-pool. The
  pooled feature map is flattened and projected to `n_classes` scores.

  `forward` returns raw logits. `torch.nn.CrossEntropyLoss` applies log-softmax
  itself, so feeding it already-softmaxed values squashes the gradients. Use
  `predict_proba` when class probabilities are needed; `argmax` gives the same
  label for logits and probabilities.

  Args:
    n_freq_bins: height of the input (default 1025).
    n_frames: width of the input (default 176).
    n_classes: number of output classes (default 12).

  Raises:
    ValueError: if the input is too small to survive the three conv/pool stages.
  """

  def __init__(self, n_freq_bins=1025, n_frames=176, n_classes=12):
    super().__init__()

    self.conv_1 = torch.nn.Conv2d(kernel_size=(3, 3), in_channels=1, out_channels=3)
    self.maxpool_1 = torch.nn.MaxPool2d(kernel_size=(2, 2))

    self.conv_2 = torch.nn.Conv2d(kernel_size=(3, 3), in_channels=3, out_channels=8)
    self.maxpool_2 = torch.nn.MaxPool2d(kernel_size=(2, 2))

    self.conv_3 = torch.nn.Conv2d(kernel_size=(3, 3), in_channels=8, out_channels=3)
    self.maxpool_3 = torch.nn.MaxPool2d(kernel_size=(2, 2))

    self.relu = torch.nn.ReLU()
    self.flatten = torch.nn.Flatten()

    # Flattened size = channels of conv_3 * pooled height * pooled width.
    # With the defaults: 3 * 126 * 20 = 7560.
    pooled_height = self._pooled_size(n_freq_bins)
    pooled_width = self._pooled_size(n_frames)
    if pooled_height <= 0 or pooled_width <= 0:
      raise ValueError(
          f"input of size ({n_freq_bins}, {n_frames}) is too small for three conv/pool stages")
    flat_features = self.conv_3.out_channels * pooled_height * pooled_width

    self.lin_pred = torch.nn.Linear(in_features=flat_features, out_features=n_classes)
    self.softmax = torch.nn.Softmax(dim=1)

  @staticmethod
  def _pooled_size(size, n_stages=3):
    """Return the side length left after `n_stages` of 3x3 conv + 2x2 max-pool."""
    for _ in range(n_stages):
      # An unpadded 3x3 conv removes 2 pixels; the 2x2 pool then floor-halves.
      size = (size - 2) // 2
    return size

  def forward(self, x):
    """Return unnormalised class scores (logits) of shape (batch, n_classes)."""
    out = self.maxpool_1(self.relu(self.conv_1(x)))

    out = self.maxpool_2(self.relu(self.conv_2(out)))

    out = self.maxpool_3(self.relu(self.conv_3(out)))

    out = self.flatten(out)

    return self.lin_pred(out)

  def predict_proba(self, x):
    """Return class probabilities (softmax over the logits of `forward`)."""
    return self.softmax(self.forward(x))

In [9]:
class SoundDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding fixed-width log-power spectrograms and labels.

    Args:
        dataset: mapping-like object with a "file" column (audio paths) and a
            "label" column of equal length.
        frame_length: STFT window size, passed to librosa as `n_fft`.
        hop_lenght: STFT hop length (the misspelt name is kept so existing
            callers keep working).
        time_cut_size: number of time frames every returned spectrogram has;
            shorter clips are zero-padded, longer ones are cropped.
    """

    def __init__(self, dataset, frame_length, hop_lenght, time_cut_size):
        self.frame_length = frame_length
        self.hop_lenght = hop_lenght
        self.dataset = dataset
        self.time_cut_size = time_cut_size
        # Read both columns once. Looking up dataset["file"][idx] per item
        # re-reads the entire column each time when `dataset` is a Hugging Face
        # Dataset, which makes iteration extremely slow.
        self.files = list(dataset["file"])
        self.labels = list(dataset["label"])

    @staticmethod
    def _fit_time_axis(spec, n_frames):
        """Crop or right-pad `spec` with zeros so that its last axis has `n_frames` entries."""
        current = spec.shape[2]
        if current >= n_frames:
            return spec[:, :, :n_frames]
        # NOTE(review): the padding value is 0 on the dB scale, which is
        # presumably not the quietest value in the clip; confirm this is intended.
        padded = spec.new_zeros((spec.shape[0], spec.shape[1], n_frames))
        padded[:, :, :current] = spec
        return padded

    def __getitem__(self, idx):
        """Load clip `idx` and return `(spectrogram, label)`.

        The spectrogram is a tensor of shape (1, n_bins, time_cut_size) holding
        the STFT power converted with `librosa.power_to_db`.
        """
        wav_array, _ = librosa.load(self.files[idx], mono=True)
        stft = librosa.stft(wav_array, n_fft=self.frame_length, hop_length=self.hop_lenght)
        log_power = librosa.power_to_db(np.abs(stft) ** 2)
        # Add a leading channel axis so the sample looks like a one-channel image.
        spec = torch.from_numpy(log_power).unsqueeze(0)

        return self._fit_time_axis(spec, self.time_cut_size), self.labels[idx]

    def __len__(self):
        """Return the number of clips."""
        return len(self.files)
        

In [10]:
# Wrap each raw split in a SoundDataset that shares the same STFT settings.
train_dataset, validation_dataset, test_dataset = (
    SoundDataset(dataset[split_name], FRAME_LENGTH, HOP_LENGTH, TIME_CUT_SIZE)
    for split_name in ("train", "validation", "test")
)

In [16]:
# Sanity check on one training example: the second element returned by the
# dataset is the class label of the clip.
res, index = train_dataset[5]
res.size()

torch.Size([1, 1025, 176])

In [None]:
# Hyperparameters.
epsilon = 10**(-150)  # NOTE(review): not used in this cell; drop it if no later cell needs it
lr = 10**(-5)
batch_size = 4
n_epochs = 100

model = ConvModel()
model = model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# CrossEntropyLoss applies log-softmax internally, so it expects unnormalised
# logits from the model.
loss_fn = torch.nn.CrossEntropyLoss()

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


def run_epoch(model, dataloader, loss_fn, optimizer=None):
    """Run one pass over `dataloader` and return its metrics.

    Args:
        model: network to evaluate or train.
        dataloader: yields `(inputs, targets)` batches.
        loss_fn: callable computing the loss from `(predictions, targets)`.
        optimizer: when given, the model is trained with it; when None, the
            pass is evaluation only and no gradients are computed.

    Returns:
        `(mean_loss, accuracy, predicted_labels, true_labels)`, where the mean
        loss is averaged over batches and both label lists are plain lists.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    predicted_labels = []
    true_labels = []

    with torch.set_grad_enabled(training):
        for X, target in tqdm(dataloader, leave=False):
            X = X.to(device).float()
            target = target.to(device)

            if training:
                # Gradients accumulate across backward() calls, so reset them
                # before every batch.
                optimizer.zero_grad()

            preds = model(X)
            # preds  - [batch_size, n_classes]
            # target - [batch_size], e.g. [1, 5, 3, 9]
            loss_value = loss_fn(preds, target)

            if training:
                loss_value.backward()
                optimizer.step()

            # detach: stop tracking gradients; cpu: move to host memory;
            # tolist: convert to a plain Python list.
            predicted_labels += preds.argmax(dim=1).detach().cpu().tolist()
            true_labels += target.detach().cpu().tolist()
            total_loss += loss_value.item()

    if not true_labels:
        raise ValueError("dataloader yielded no samples")

    mean_loss = total_loss / len(dataloader)  # len(dataloader) is the number of batches
    n_correct = sum(p == t for p, t in zip(predicted_labels, true_labels))
    return mean_loss, n_correct / len(true_labels), predicted_labels, true_labels


# NOTE(review): the per-epoch evaluation uses the test split while
# `validation_dataloader` is never used; consider monitoring on validation instead.
for i in tqdm(range(n_epochs)):
    train_loss, accuracy_score_value, train_labels, train_true_labels = run_epoch(
        model, train_dataloader, loss_fn, optimizer)
    print('TRAIN: epoch = ', i, 'loss = ', train_loss, 'accuracy = ', accuracy_score_value)

    test_loss, accuracy_score_value, test_labels, test_true_labels = run_epoch(
        model, test_dataloader, loss_fn)
    print( 'TEST: epoch = ', i , 'loss = ', test_loss , 'accuracy = ', accuracy_score_value)


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/16182 [00:00<?, ?it/s]