In [1]:
import os
import json

def is_valid_segment(segment):
    """Tell whether a segment is usable.

    A segment passes when its 'speakers', 'ivectors' and 'xvectors' entries
    each contain exactly one element and the single speaker's 'speaker_id'
    is either 'A' or 'B'.

    Args:
        segment: mapping with 'speakers', 'ivectors' and 'xvectors' entries.

    Returns:
        bool: True if every check passes, False otherwise.
    """
    # Checks run in this order and stop at the first failure, so later keys
    # are never looked up once an earlier check has failed.
    for field in ('speakers', 'ivectors', 'xvectors'):
        if len(segment[field]) != 1:
            return False
    return segment['speakers'][0]['speaker_id'] in ('A', 'B')

# Directory scanned for input files; every line of every file is parsed as JSON.
directory = '../exp/json'
# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and therefore the key order of recordings_segments) reproducible across runs.
filenames = sorted(filename for filename in os.listdir(directory) if os.path.isfile(os.path.join(directory, filename)))

# Maps recording id -> list of parsed segments that pass is_valid_segment.
recordings_segments = {}
recordings_length = len(filenames)
recordings_count = 0
for filename in filenames:
    # The recording id is the part of the file name before the first dot.
    recording_id = filename.split('.')[0]
    filepath = os.path.join(directory, filename)
    # The context manager closes the file even if a line fails to parse.
    with open(filepath, 'r') as file:
        # Stream the file line by line, skip blank lines (json.loads would
        # raise on them), and keep only valid segments in a single pass.
        recordings_segments[recording_id] = [
            segment
            for segment in (json.loads(line) for line in file if line.strip())
            if is_valid_segment(segment)
        ]
    recordings_count += 1
    # '\r' rewinds the cursor so the progress counter overwrites itself.
    print('Loading data: ' + str(recordings_count) + '/' + str(recordings_length), end = '\r')

Loading data: 249/249

## Balancing the dataset

In [2]:
from functools import reduce

def speakers_get_indexes(accumulator, speaker_tuple):
    """Reducer step that groups indexes by speaker id.

    Args:
        accumulator: dict mapping speaker id -> list of indexes seen so far.
            It is modified in place.
        speaker_tuple: a (speaker_id, index) pair to record.

    Returns:
        The same accumulator object, with index appended to the list stored
        under speaker_id (the list is created on first sight of the id).
    """
    speaker_id, index = speaker_tuple
    accumulator.setdefault(speaker_id, []).append(index)
    return accumulator

# IMPORTANT: a recording is kept only when its least-represented speaker has
# strictly more than this many segments.
MIN_SEGMENTS_PER_SPEAKER = 20

# Maps recording id -> segments trimmed so every speaker contributes the same
# number of segments. Recordings that fail the checks below are left out.
recordings_segments_cut = {}
for recording_id, recording_segments in recordings_segments.items():
    # Group segment positions by the id of the segment's single speaker.
    speakers_indexes = reduce(
        speakers_get_indexes,
        ((segment['speakers'][0]['speaker_id'], index) for index, segment in enumerate(recording_segments)),
        {},
    )
    # Need at least two speakers to balance. This also covers recordings with
    # no segments at all, which previously raised IndexError.
    if len(speakers_indexes) < 2:
        continue
    speakers_lengths_min = min(len(indexes) for indexes in speakers_indexes.values())
    if speakers_lengths_min <= MIN_SEGMENTS_PER_SPEAKER:
        continue
    # Keep each speaker's first speakers_lengths_min segments. A set gives
    # constant-time membership tests in the comprehension below.
    recording_indexes = set()
    for indexes in speakers_indexes.values():
        recording_indexes.update(indexes[:speakers_lengths_min])
    # Rebuild the list in original segment order.
    recordings_segments_cut[recording_id] = [segment for index, segment in enumerate(recording_segments) if index in recording_indexes]