In [1]:
import os
import json

## Obteniendo los índices de segmentos válidos por audio

In [2]:
def is_valid_segment(line):
    """Return True when a JSON-encoded segment line passes every validity check.

    A segment is accepted when it lists exactly one speaker, that speaker's
    id is 'A' or 'B', and it has exactly one entry under 'ivectors' and
    exactly one entry under 'xvectors'.

    Args:
        line: Text holding one JSON object with the keys 'speakers',
            'ivectors' and 'xvectors'.

    Returns:
        bool: Whether the segment satisfies all the conditions above.
    """
    record = json.loads(line)
    speakers = record['speakers']
    # Guard clauses: each later key is only read once the earlier checks pass.
    if len(speakers) != 1:
        return False
    if speakers[0]['speaker_id'] not in ('A', 'B'):
        return False
    if len(record['ivectors']) != 1:
        return False
    return len(record['xvectors']) == 1

# Scan the directory of per-recording files (one JSON object per line) and,
# for each file, remember which line numbers hold valid segments.
directory = 'exp/json'
filenames = sorted(
    filename
    for filename in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, filename))
)

# recordings maps recording_id -> dict with the keys
# 'recording_id', 'filepath', 'indexes' and 'indexes_length'.
recordings = {}
for filename in filenames:
    filepath = os.path.join(directory, filename)
    # The context manager closes the file even when a line fails to parse.
    with open(filepath) as segments_file:
        indexes = [index for index, line in enumerate(segments_file) if is_valid_segment(line)]
    # The recording id is whatever precedes the first dot of the file name.
    recording_id = filename.split('.')[0]
    recordings[recording_id] = {
        'recording_id': recording_id,
        'filepath': filepath,
        'indexes': indexes,
        'indexes_length': len(indexes),
    }

## Obteniendo los índices de locutores por audio

In [3]:
# Maximum number of segment line numbers remembered per speaker and recording.
speaker_segments_limit = 2
for recording_id in recordings:
    recording = recordings[recording_id]
    # Set membership keeps the per-line validity check constant-time.
    valid_indexes = set(recording['indexes'])
    # speaker_id -> first line numbers (at most speaker_segments_limit) of that speaker.
    speakers_indexes = {}
    # The context manager closes the file even when a line fails to parse.
    with open(recording['filepath']) as segments_file:
        for idx, line in enumerate(segments_file):
            if idx not in valid_indexes:
                continue
            speaker_id = json.loads(line)['speakers'][0]['speaker_id']
            kept = speakers_indexes.setdefault(speaker_id, [])
            if len(kept) < speaker_segments_limit:
                kept.append(idx)
    recording['speakers_indexes'] = speakers_indexes

## Obteniendo la longitud de los datos

In [4]:
import itertools
import numpy

# Number of speaker-model slots paired with every segment.
models_container_size = 5
# Running total of dataset items over all recordings.
recordings_length = 0
# Inclusive (first_index, last_index, recording_id) ranges, in dataset order.
recordings_map = []

for recording_id in recordings:
    recording = recordings[recording_id]
    speakers_indexes = recording['speakers_indexes']
    speakers_ids = list(speakers_indexes.keys())
    # Every arrangement of the speakers over the slots, with '0' marking an
    # empty slot. The set drops the duplicates caused by the repeated '0'
    # padding; sorting fixes the order, because plain set iteration order
    # depends on string hash randomization and would otherwise change the
    # index -> sample mapping between interpreter sessions.
    padded_ids = speakers_ids + ['0'] * models_container_size
    permutations = sorted(set(itertools.permutations(padded_ids, models_container_size)))
    recording['permutations'] = permutations
    permutations_length = 0
    # Inclusive (first_offset, last_offset, permutation_position) ranges.
    permutations_map = []
    for idx, permutation in enumerate(permutations):
        # A permutation yields one item per combination of stored segment
        # indexes of its speakers; the all-empty permutation yields one item.
        length = 1
        for speaker_id in permutation:
            if speaker_id in speakers_indexes:
                length *= len(speakers_indexes[speaker_id])
        permutations_map.append((permutations_length, permutations_length + length - 1, idx))
        permutations_length += length
    recording['permutations_length'] = permutations_length
    recording['permutations_map'] = permutations_map
    # Every valid segment is paired with every permutation item.
    recording_length = permutations_length * recording['indexes_length']
    recording['recording_length'] = recording_length
    recordings_map.append((recordings_length, recordings_length + recording_length - 1, recording_id))
    recordings_length += recording_length

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split

class Recordings_dataset(Dataset):
    """Map-style dataset pairing each valid segment with speaker-model slots.

    A flat index is resolved in three steps: the recording (through
    ``recordings_map``), the segment inside that recording, and the
    permutation plus the concrete model segments that fill its slots.

    Args:
        recordings: Dict recording_id -> recording dict. ``__getitem__`` reads
            the keys 'filepath', 'indexes', 'speakers_indexes',
            'permutations', 'permutations_map' and 'permutations_length'.
        recordings_length: Total number of items; returned by ``len()``.
        recordings_map: Inclusive (first_index, last_index, recording_id)
            ranges covering the flat index space.
        mode: Key of the vector list read from each segment
            (default 'ivectors').
    """

    def __init__(self, recordings, recordings_length, recordings_map, mode = 'ivectors'):
        self.recordings = recordings
        self.recordings_length = recordings_length
        self.recordings_map = recordings_map
        self.mode = mode

    def __len__(self):
        """Return the total number of items across all recordings."""
        return self.recordings_length

    @staticmethod
    def _find_range(ranges, position):
        """Return the first (start, end, key) triple with start <= position <= end.

        Raises:
            IndexError: If no range contains ``position``.
        """
        for entry in ranges:
            if entry[0] <= position <= entry[1]:
                return entry
        raise IndexError('index {} is outside every mapped range'.format(position))

    @staticmethod
    def _pick_model_segments(permutation, speakers_indexes, offset):
        """Decode ``offset`` into one stored segment index per slot.

        Empty slots ('0') yield -1. Each occupied slot takes one digit of
        ``offset``, whose base is the number of segment indexes stored for
        that speaker; the leftmost occupied slot is the most significant.
        """
        model_ids = []
        for position, speaker_id in enumerate(permutation):
            if speaker_id == '0':
                model_ids.append(-1)
                continue
            # Number of combinations contributed by the slots to the right.
            tail = 1
            for other in permutation[position + 1:]:
                if other != '0':
                    tail *= len(speakers_indexes[other])
            choice, offset = divmod(offset, tail)
            model_ids.append(speakers_indexes[speaker_id][choice])
        return model_ids

    def __getitem__(self, idx):
        """Build the (x, y) pair for flat index ``idx``.

        Returns:
            tuple: ``x`` is the segment vector followed by one vector per slot
            (zeros for an empty slot); ``y`` holds 1.0 for every slot whose
            speaker id equals the segment's speaker id, else 0.0.

        Raises:
            IndexError: If ``idx`` is outside every range of
                ``recordings_map`` or a needed line is missing from the file.
        """
        # Accept integer-like indices (e.g. 0-d tensors or numpy integers).
        idx = int(idx)

        first, _, recording_id = self._find_range(self.recordings_map, idx)
        recording = self.recordings[recording_id]

        # Items are grouped by segment: all permutation items of the first
        # valid segment come first, then those of the second, and so on.
        segment_position, offset = divmod(idx - first, recording['permutations_length'])
        segment_id = recording['indexes'][segment_position]

        perm_first, _, permutation_position = self._find_range(recording['permutations_map'], offset)
        permutation = recording['permutations'][permutation_position]
        model_ids = self._pick_model_segments(permutation, recording['speakers_indexes'], offset - perm_first)

        # Parse only the lines that are needed, each one once, and stop
        # reading as soon as all of them have been seen.
        wanted = set(model_ids)
        wanted.discard(-1)
        wanted.add(segment_id)
        parsed = {}
        with open(recording['filepath']) as segments_file:
            for number, line in enumerate(segments_file):
                if number in wanted:
                    parsed[number] = json.loads(line)
                    if len(parsed) == len(wanted):
                        break
        missing = wanted.difference(parsed)
        if missing:
            raise IndexError('lines {} not found in {}'.format(sorted(missing), recording['filepath']))

        segment = parsed[segment_id]
        segment_vector = numpy.asarray(segment[self.mode][0]['value'])
        model_vectors = [
            numpy.zeros(len(segment_vector)) if model_id == -1
            else numpy.asarray(parsed[model_id][self.mode][0]['value'])
            for model_id in model_ids
        ]
        segment_speaker = segment['speakers'][0]['speaker_id']

        x = numpy.concatenate([segment_vector] + model_vectors)
        y = numpy.asarray([speaker_id == segment_speaker for speaker_id in permutation], dtype = float)

        return x, y

In [6]:
# Wrap the indexed recordings in the dataset and report how many items it has.
recordings_dataset = Recordings_dataset(recordings, recordings_length, recordings_map)
print(len(recordings_dataset))

# 70% of the items go to training, the remainder to testing.
train_length = int(len(recordings_dataset) * 0.7)
test_length = len(recordings_dataset) - train_length

# Seed the global RNG so the random index split is the same on every run.
torch.manual_seed(0)
train_dataset, test_dataset = random_split(recordings_dataset, [train_length, test_length])

train_dataloader = DataLoader(train_dataset, batch_size = 10, shuffle=True, num_workers = 4)
test_dataloader = DataLoader(test_dataset, batch_size = 1, shuffle=False, num_workers = 1)

3094915


In [7]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Fully connected network scoring each speaker-model slot for a segment.

    The input is the segment vector followed by one vector per slot; the
    output holds one sigmoid activation (between 0 and 1) per slot.

    Args:
        vector_size: Length of each individual vector (default 128, the value
            that used to be hard-coded).
        container_size: Number of slots. ``None`` (default) falls back to the
            module-level ``models_container_size``.
        hidden_size: Width of every hidden layer (default 512).
    """

    def __init__(self, vector_size = 128, container_size = None, hidden_size = 512):
        super().__init__()
        if container_size is None:
            container_size = models_container_size
        # Layers keep the attribute names fc0..fc7 so state-dict keys are unchanged.
        self.fc0 = nn.Linear(vector_size * (container_size + 1), hidden_size)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, hidden_size)
        self.fc6 = nn.Linear(hidden_size, hidden_size)
        self.fc7 = nn.Linear(hidden_size, container_size)

    def forward(self, x):
        """Apply seven ReLU layers followed by a sigmoid output layer."""
        hidden_layers = (self.fc0, self.fc1, self.fc2, self.fc3, self.fc4, self.fc5, self.fc6)
        for layer in hidden_layers:
            x = F.relu(layer(x))
        # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid.
        return torch.sigmoid(self.fc7(x))
    
# Instantiate the network, then move its parameters to the GPU.
net = Net()
net = net.cuda()

In [None]:
import torch.optim as optim

optimizer = optim.Adam(net.parameters(), lr = 0.01)

# The network ends in a sigmoid and every target entry is 0.0 or 1.0, so
# binary cross-entropy is applied element-wise.
criterion = nn.BCELoss()

epochs = 3
# Training stops once the last mini-batch loss of an epoch falls below this.
# NOTE(review): a single mini-batch loss is a noisy stopping signal; an epoch
# average may be preferable -- confirm before changing.
loss_threshold = 0.3

net.train()
for epoch in range(epochs):
    # Stays None when the loader yields no batch, so the check below is safe.
    last_batch_loss = None
    # Named inputs/targets so the builtin input() is not shadowed in the kernel.
    for inputs, targets in train_dataloader:
        inputs = inputs.float().to('cuda', non_blocking=True)
        targets = targets.float().to('cuda', non_blocking=True)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        # Print a plain float instead of the tensor repr.
        last_batch_loss = loss.item()
        print('epoch {} loss {:.4f}'.format(epoch, last_batch_loss), end = '\r')
    if last_batch_loss is not None and last_batch_loss < loss_threshold:
        break

tensor(0.4098, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)

In [None]:
# Run the test samples through the network without tracking gradients and
# print each target next to the network output.
net.eval()
with torch.no_grad():
    # Named inputs/targets so the builtin input() is not shadowed in the kernel.
    for inputs, targets in test_dataloader:
        inputs = inputs.to('cuda', non_blocking=True)
        outputs = net(inputs.float())
        print('target:', targets)
        print('output:', outputs)