In [9]:
import numpy as np
import IPython.display
import librosa
import librosa.display
import matplotlib.pyplot as plt
import os
import shutil
from random import shuffle
import keras
import traceback
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from keras.utils import np_utils
import pickle
import json
from sklearn.mixture import GaussianMixture
from multiprocessing import Process
import traceback
%matplotlib inline

In [2]:
def rgb2gray(rgb):
    return np.dot(rgb[...,:3], [0.299, 0.587, 0.114])

def read_png(path):
    img = mpimg.imread(path)
    return rgb2gray(img)

### Feature File Manager

In [3]:
class SpectrogramFeatureFileManager(object):
    def __init__(self, seconds=10, skip_augment=False):
        self.seconds = seconds
        self.skip_augment = skip_augment

    def get_file_credentials(self, path):
        values = path.split('#')[-1].split('.')[0]
        sr, winlen, winstep = map(int, values.split('='))
        return sr, winlen, winstep
    
    def _load_file(self, path):
        return read_png(path)

    def load(self, path):
        """ Return list of objects, ectracted from given file """
        if self.skip_augment and u'#AUG#' in path:
            return []

        try:
            file_data = self._load_file(path)
        except:
            return []

        sr, winlen, winstep = self.get_file_credentials(path)
        if winlen < winstep:
            winlen, winstep = winstep, winlen

        one_sec_count = sr / winlen
        chunk_size = one_sec_count * self.seconds

        height, width = file_data.shape
        
        # print file_data.shape, width, sr, winlen, winstep
        # print one_sec_count, chunk_size
        
        if width < chunk_size:
            return []
        
        result = []
        
        chunks = (width + chunk_size - 1) / chunk_size
        for chunk in xrange(chunks):
            start = (chunk * chunk_size)
            end = min(file_data.shape[1], (chunk + 1) * chunk_size)
            
            if end - start < chunk_size:
                break

            result.append(file_data[:, start:end])

        return result

In [4]:
# class MFCCFeatureFileManager(SpectrogramFeatureFileManager):
#     def get_file_credentials(self, path):
#         values = path.split('#')[-1].split('.')[0]
#         sr, winlen, winstep, _, _, _, _ = map(int, values.split('='))
#         return sr, winlen, winstep
    
#     def _load_file(self, path):
#         return np.loadtxt(path)

### Dataset manager

In [4]:
class FolderIterator(object):
    PATH_TEMPLATE = '/home/kolegor/Study/Master/data/features/{feature}/{language}/{dataset}/'

    def __init__(self, language, dataset, feature, uid, max_count=None):
        self.language = language
        self.dataset = dataset
        self.uid = uid
        self.max_count = max_count

        self.path = self.PATH_TEMPLATE.format(feature=feature, language=language, dataset=dataset)
    
    def get_data(self):
        x = [os.path.join(self.path, filename) for filename in os.listdir(self.path)]
        y = [self.uid for _ in xrange(len(x))]
        
        shuffle(x)
        shuffle(y)
        
        if self.max_count:
            x = x[:self.max_count]
            y = y[:self.max_count]

        return x, y

In [5]:
class Dataset(object):
    def __init__(self, folder_iterators, need_shuffle=True, manager=None, max_count=None):
        self.folders = folder_iterators
        self.manager = manager
        
        self.size_by_language = dict()
        
        self.all_x = []
        self.all_y = []
        
        for folder in self.folders:
            x, y = folder.get_data()
            if len(x) != len(y):
                raise Exception('What the hell?')
            
            self.all_x.extend(x)
            self.all_y.extend(y)
            self.size_by_language[folder.language] = len(x)
        
        if shuffle:
            combined = zip(self.all_x, self.all_y)
            shuffle(combined)
            self.all_x, self.all_y = zip(*combined)
        
        if max_count is not None:
            self.all_x = self.all_x[:max_count]
            self.all_y = self.all_y[:max_count]

        self.classes = len(set(self.all_y))
        self.size = len(self.all_x)

    def one_sample(self):
        qwe = []
        while not qwe:
            qwe = self.manager.load(self.all_x[0])
            if qwe:
                qwe = qwe[0]
                break
        return qwe


def dataset_iterator(dataset, chunk_size, as_np=False, extend=True, y_categorical=True, iters=10):
    for _ in xrange(iters):
        cur_chunk_x = []
        cur_chunk_y = []

        for filepath, uid in zip(dataset.all_x, dataset.all_y):
            loaded = dataset.manager.load(filepath)

            cur_chunk_x.extend(loaded)
            cur_chunk_y.extend([uid for _ in xrange(len(loaded))])

            if len(cur_chunk_x) >= chunk_size:
                to_return_x = cur_chunk_x[:chunk_size]
                to_return_y = cur_chunk_y[:chunk_size]

                cur_chunk_x = cur_chunk_x[chunk_size:]
                cur_chunk_y = cur_chunk_y[chunk_size:]

                if as_np:
                    to_return_x = np.array(to_return_x)
                    to_return_y = np.array(to_return_y)

                if extend:
                    to_return_x = to_return_x.reshape(to_return_x.shape + (1,))

                if y_categorical:
                    to_return_y = np_utils.to_categorical(to_return_y, dataset.classes)

                yield to_return_x, to_return_y


def load_all_dataset(dataset, as_np=False, extend=True, y_categorical=True):
    all_x = []
    all_y = []
    
    iterator = dataset_iterator(dataset, 32, as_np=False, extend=False, y_categorical=False, iters=1)
    for x, y in iterator:
        all_x.extend(x)
        all_y.extend(y)
        
    all_x = np.array(all_x)
    all_y = np.array(all_y)
    
    if as_np:
        all_x = np.array(all_x)
        all_y = np.array(all_y)

    if extend:
        all_x = all_x.reshape(all_x.shape + (1,))

    if y_categorical:
        all_y = np_utils.to_categorical(all_y, dataset.classes)

    return all_x, all_y

In [6]:
def get_languages_dataset(feature, languages, max_per_class=(None, None, None)):
    lang2uid = dict((b, a) for a, b in enumerate(languages))
    train = Dataset([
        FolderIterator(language, 'train', feature, lang2uid[language], max_count=max_per_class[0])
        for language in languages
    ])
    test = Dataset([
        FolderIterator(language, 'test', feature, lang2uid[language], max_count=max_per_class[1])
        for language in languages
    ])
    dev = Dataset([
        FolderIterator(language, 'dev', feature, lang2uid[language], max_count=max_per_class[2])
        for language in languages
    ])
    return train, test, dev, lang2uid

In [7]:
def print_languages_info(train_ds, test_ds, dev_ds):
    print u'Languages: [{}]'.format(u' # '.join(train_ds.size_by_language.keys()))
    print u'Classes: {}'.format(train_ds.classes)
    print u'Sizes: (train/test/dev): {}/{}/{}'.format(train_ds.size, test_ds.size, dev_ds.size)

### Model build

In [8]:
class Experiment(object):
    def __init__(
        self, feature, languages, manager,
        batch_size=32, max_per_class=(None, None, None), uid=None, full_load=False
    ):
        self.batch_size = 32
        self.max_per_class = max_per_class
        self.uid = uid
        self.full_load = full_load

        self.train_ds = None
        self.test_ds = None
        self.dev_ds = None
        self.lang2uid = None
        
        self.train_ds, self.test_ds, self.dev_ds, self.lang2uid = get_languages_dataset(
            feature,
            languages,
            self.max_per_class
        )
        print_languages_info(self.train_ds, self.test_ds, self.dev_ds)
        
        self.train_ds.manager = manager
        self.test_ds.manager = manager
        self.dev_ds.manager = manager
        
        self.X_train, self.Y_train = None, None
        self.X_test, self.Y_test = None, None
        self.X_dev, self.Y_dev = None, None
        
        if full_load:
            self.X_train, self.Y_train = load_all_dataset(self.train_ds, as_np=True)
            self.X_test, self.Y_test = load_all_dataset(self.test_ds, as_np=True)
            self.X_dev, self.Y_dev = load_all_dataset(self.dev_ds, as_np=True)
            
            print u'Full load. Train: {}, Test: {}, Dev: {}'.format(
                len(self.X_train), len(self.X_test), len(self.X_dev)
            )
        
        self.model = None
        self.fit_result = None
        self.evaluate_result = None

In [10]:
class CNNExperiment(Experiment):    
    def init_model(self):
        if self.full_load:
            sample = self.X_train[0]
        else:
            sample = self.train_ds.one_sample()
        
        input_shape = (sample.shape[0], sample.shape[1], 1)
        print input_shape

        pool_size=(2, 2)

        model = Sequential()
        model.add(Conv2D(16, (7, 7), activation='relu', input_shape=input_shape))
        model.add(MaxPooling2D(pool_size=pool_size))
        model.add(BatchNormalization())
        model.add(Conv2D(32, (5, 5), activation='relu'))
        model.add(MaxPooling2D(pool_size=pool_size))
        model.add(BatchNormalization())
        model.add(Conv2D(64, (3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=pool_size))
        model.add(BatchNormalization())
        model.add(Conv2D(128, (3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=pool_size))
        model.add(BatchNormalization())
        model.add(Conv2D(128, (3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=pool_size))
        model.add(BatchNormalization())

        # print model.summary()
        # model.add(Conv2D(256, (3, 3), activation='relu'))
        # model.add(MaxPooling2D(pool_size=(2, 2)))

        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.train_ds.classes, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
        self.model = model
    
    def fit(self, epochs=4):
        if self.full_load:
            self.fit_result = self.model.fit(
                x=self.X_train,
                y=self.Y_train,
                epochs=epochs,
                batch_size=self.batch_size,
                validation_data=(self.X_dev, self.Y_dev),
            )
        else:
            self.fit_result = self.model.fit_generator(
                dataset_iterator(self.train_ds, self.batch_size, as_np=True, extend=True),
                steps_per_epoch=len(self.train_ds.all_x) / self.batch_size,
                epochs=epochs,
                validation_data=dataset_iterator(self.dev_ds, self.batch_size, as_np=True, extend=True),
                validation_steps=len(self.dev_ds.all_x) / self.batch_size,
            )
            
    
    def evaluate(self):
        if self.full_load:
            self.evaluate_result = self.model.evaluate(
                x=self.X_test,
                y=self.X_test,
            )
        else:
            self.evaluate_result = self.model.evaluate_generator(
                dataset_iterator(self.test_ds, self.batch_size, as_np=True, extend=True),
                steps=len(self.test_ds.all_x) / self.batch_size
            )
    
    def save(self, uid=None):
        use_uid = uid
        if use_uid is None:
            use_uid = self.uid
        if use_uid is None:
            raise Exception('UID ex empty')
        
        with open('/home/kolegor/result.{}.pickle'.format(use_uid), 'w') as outf:
            pickle.dump([self.fit_result.history, self.evaluate_result], outf)
        
        self.model.save('/home/kolegor/model.{}.pickle'.format(use_uid))

In [11]:
def run_experiment(exp, uid=None, cleanup=False):
    try:
        print u'\nINIT MODEL'
        exp.init_model()

        print u'\nFIT'
        exp.fit()

        print u'\nEVALUATE'
        exp.evaluate()

        print u'\nSAVE'
        exp.save(uid=uid)

        if cleanup:
            del exp.model
            del exp.train_ds
            del exp.test_ds
            del exp.dev_ds
            del exp.X_train, exp.Y_train, exp.X_test, exp.Y_test, exp.X_dev, exp.Y_dev
    except:
        print traceback.format_exc()

In [12]:
# exp0_manager = SpectrogramFeatureFileManager(seconds=5, skip_augment=False)
# exp0_languages = ['sa_afrikaans', 'nepali']
# exp0 = CNNExperiment('spectrogram', exp0_languages, exp0_manager, max_per_class=(200, 100, 500), full_load=True)
# exp0.uid = u'cnn__spectr__{}__6000_2000_500__5sec__aug'.format(u' # '.join(exp0_languages))

In [13]:
exp1_manager = SpectrogramFeatureFileManager(seconds=9, skip_augment=False)
exp1_languages = ['ab_English', 'ab_Russian']
exp1 = CNNExperiment('spectrogram', exp1_languages, exp1_manager, max_per_class=(20000, 10000, 5000))
exp1.uid = u'cnn__spectr__{}__20000_10000_5000__9sec__aug'.format(u'#'.join(exp1_languages))

Languages: [ab_Russian # ab_English]
Classes: 2
Sizes: (train/test/dev): 26829/10563/2786


In [14]:
exp2_manager = SpectrogramFeatureFileManager(seconds=9, skip_augment=True)
exp2_languages = ['ab_English', 'ab_Russian']
exp2 = CNNExperiment('spectrogram', exp2_languages, exp2_manager, max_per_class=(20000, 10000, 5000))
exp2.uid = u'cnn__spectr__{}__20000_10000_5000__9sec__NO_aug'.format(u'#'.join(exp2_languages))

Languages: [ab_Russian # ab_English]
Classes: 2
Sizes: (train/test/dev): 26829/10563/2786


In [15]:
exp3_manager = SpectrogramFeatureFileManager(seconds=2, skip_augment=False)
exp3_languages = ['sa_afrikaans', 'khmer', 'nepali', 'sa_sesotho',]
exp3 = CNNExperiment('spectrogram', exp3_languages, exp3_manager, max_per_class=(7000, 2000, 500))
exp3.uid = u'cnn__spectr__{}__7000_2000_500__2sec__aug'.format(u'#'.join(exp3_languages))

Languages: [khmer # sa_afrikaans # nepali # sa_sesotho]
Classes: 4
Sizes: (train/test/dev): 22013/6290/2000


In [16]:
exp4_manager = SpectrogramFeatureFileManager(seconds=5, skip_augment=False)
exp4_languages = ['sa_afrikaans', 'khmer', 'nepali', 'sa_sesotho']
exp4 = CNNExperiment('spectrogram', exp4_languages, exp4_manager, max_per_class=(7000, 2000, 500))
exp4.uid = u'cnn__spectr__{}__7000_2000_500__5sec__aug'.format(u'#'.join(exp4_languages))

Languages: [khmer # sa_afrikaans # nepali # sa_sesotho]
Classes: 4
Sizes: (train/test/dev): 22013/6290/2000


In [17]:
exp5_manager = SpectrogramFeatureFileManager(seconds=3, skip_augment=True)
exp5_languages = ['sa_afrikaans', 'khmer', 'nepali', 'sa_sesotho']
exp5 = CNNExperiment('spectrogram', exp5_languages, exp5_manager, max_per_class=(7000, 2000, 500))
exp5.uid = u'cnn__spectr__{}__7000_2000_500__3sec__NO_aug'.format(u'#'.join(exp5_languages))

Languages: [khmer # sa_afrikaans # nepali # sa_sesotho]
Classes: 4
Sizes: (train/test/dev): 22013/6290/2000


In [18]:
exp6_manager = SpectrogramFeatureFileManager(seconds=5, skip_augment=False)
exp6_languages = ['ab_English', 'ab_Russian', 'ab_Hebrew', 'ab_German', 'ab_Poland', 'ab_Finnish']
exp6 = CNNExperiment('spectrogram', exp6_languages, exp6_manager, max_per_class=(6000, 2500, 1000))
exp6.uid = u'cnn__spectr__{}__5000_2500_1000__5sec__aug'.format(u'#'.join(exp6_languages))

Languages: [ab_Poland # ab_Finnish # ab_Hebrew # ab_English # ab_German # ab_Russian]
Classes: 6
Sizes: (train/test/dev): 36000/15000/6000


In [19]:
exp7_manager = SpectrogramFeatureFileManager(seconds=5, skip_augment=False)
exp7_languages = ['ab_English', 'ab_Russian', 'ab_Hebrew', 'ab_German', 'ab_Finnish']
exp7 = CNNExperiment('spectrogram', exp7_languages, exp7_manager, max_per_class=(6000, 2500, 1000))
exp7.uid = u'cnn__spectr__{}__5000_2500_1000__5sec__aug'.format(u'#'.join(exp7_languages))

Languages: [ab_Russian # ab_German # ab_Hebrew # ab_English # ab_Finnish]
Classes: 5
Sizes: (train/test/dev): 30000/12500/5000


In [20]:
exp8_manager = SpectrogramFeatureFileManager(seconds=7, skip_augment=False)
exp8_languages = ['ab_English', 'ab_Russian', 'ab_Hebrew', 'ab_German', 'ab_Finnish']
exp8 = CNNExperiment('spectrogram', exp8_languages, exp8_manager, max_per_class=(6000, 2500, 1000))
exp8.uid = u'cnn__spectr__{}__5000_2500_1000__7sec__aug'.format(u'#'.join(exp8_languages))

Languages: [ab_Russian # ab_German # ab_Hebrew # ab_English # ab_Finnish]
Classes: 5
Sizes: (train/test/dev): 30000/12500/5000


In [21]:
run_experiment(exp4, cleanup=True)
# run_experiment(exp2, cleanup=True)
# run_experiment(exp3, cleanup=True)
# run_experiment(exp6, cleanup=True)
# run_experiment(exp7, cleanup=True)
run_experiment(exp8, cleanup=True)


INIT MODEL
(128, 200, 1)

FIT
Epoch 1/4
Epoch 2/4
Epoch 3/4
  File "<ipython-input-11-281bb86d6846>", line 7, in run_experiment
    exp.fit()
  File "<ipython-input-10-550c02a52a86>", line 58, in fit
    validation_steps=len(self.dev_ds.all_x) / self.batch_size,
  File "/usr/local/lib/python2.7/dist-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 1415, in fit_generator
    initial_epoch=initial_epoch)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training_generator.py", line 230, in fit_generator
    workers=0)
  File "/usr/local/lib/python2.7/dist-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 1469, in evaluate_generator
    verbose=verbose)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training_generator.py", line 

In [22]:
path1 = '/home/kolegor/result.cnn__spectr__ab_English#ab_Russian#ab_Hebrew#ab_German#ab_Finnish__5000_2500_1000__3sec__aug.pickle'
path2 = '/home/kolegor/result.cnn__spectr__ab_English#ab_Russian#ab_Hebrew#ab_German#ab_Finnish__5000_2500_1000__5sec__aug.pickle'
path3 = '/home/kolegor/result.cnn__spectr__ab_English#ab_Russian#ab_Hebrew#ab_German#ab_Finnish__5000_2500_1000__7sec__aug.pickle'

In [23]:
use_path = path3
with open(use_path, 'r') as inf:
    qwe = pickle.load(inf)
qwe

[{'acc': [0.7474986659551761,
   0.9545424226254002,
   0.9713513874066169,
   0.9802561366061899],
  'loss': [0.6101389460070824,
   0.13508696147584104,
   0.08417417824103347,
   0.060734387023224014],
  'val_acc': [0.5612980769230769,
   0.6426282051282052,
   0.6225961538461539,
   0.6546474358974359],
  'val_loss': [2.521544595559438,
   2.344950178112739,
   2.7127186999871182,
   3.1981816553534608]},
 [3.457260944904425, 0.6766826923076923]]

'/home/kolegor/result.cnn__spectr__ab_English#ab_Russian#ab_Hebrew#ab_German#ab_Finnish__5000_2500_1000__5sec__aug.pickle'

[{'acc': [0.6573505869797225,
   0.9305296157950907,
   0.9543756670224119,
   0.9617796157950907],
  'loss': [0.8304459522612464,
   0.20158600124576215,
   0.13396440932730694,
   0.11191276282100891],
  'val_acc': [0.6614583333333334,
   0.6149839743589743,
   0.6508413461538461,
   0.6598557692307693],
  'val_loss': [2.3027218018586817,
   2.7794124357020245,
   2.3347269101784778,
   2.52298259587051]},
 [2.7109747937808817, 0.6872596153846153]]

### GMM Grid Search

In [43]:
def get_language_dataset_data(dataset, max_count, manager=None):    
    x = []
    
    for i, filepath in enumerate(dataset.all_x):
        if i % 500 == 0:
            print ' -', i, max_count, len(x)

        loaded = dataset.manager.load(filepath)
        x.extend(loaded)
        
        if len(x) > max_count:
            break
    
    x = x[:max_count]
    return np.vstack(x)


def load_gmm_data(language, dataset, max_count, manager):
    dataset = Dataset([FolderIterator(language, dataset, 'mfcc', -1, max_count=None)])
    dataset.manager = manager
    return get_language_dataset_data(dataset, max_count)

In [44]:
def create_all_gmm_models(languages, components):
    model_by_language = dict()
    for language in languages:
        gmm = GaussianMixture(n_components=components, init_params='random', tol=1e-3, max_iter=50)
        model_by_language[language] = gmm
    return model_by_language


def fit_all_gmm_models(model_by_language, components, manager, max_count):
    for i, language in enumerate(model_by_language):
        gmm = model_by_language[language]
        
        print ' - Load X {}/{}'.format(i + 1, len(model_by_language))
        X = load_gmm_data(language, 'train', max_count, manager)
        
        print ' - Fit {}/{}'.format(i + 1, len(model_by_language))
        gmm = gmm.fit(X)
        del X
        
        model_by_language[language] = gmm

In [45]:
def load_test_data(languages, max_count, manager):
    X, Y = [], []
    
    for language in languages:
        lang_x = load_gmm_data(language, 'test', max_count, manager)
        lang_y = [language for _ in xrange(len(lang_x))]
        X.extend(lang_x)
        Y.extend(lang_y)
    
    X = X[:max_count]
    Y = Y[:max_count]

    X = np.array(X)
    Y = np.array(Y)

    return X, Y


def run_gmm_test(model_by_language, languages, max_count, manager):
    print u'Loading TEST'
    X_test, Y_test = load_test_data(languages, max_count, manager)
    
    predicted_by_language = dict()
    for language, gmm in model_by_language.iteritems():
        predicted_by_language[language] = gmm.score_samples(X_test)
    
    return Y_test, predicted_by_language


def evaluate_predict_results(real, predicted_by_language):
    results = []  # list of tuples (real_lang, [sorted predited_langs])
    
    for i, real_lang in enumerate(real):
        predictions = [(pr[i], language) for language, pr in predicted_by_language.iteritems()]
        best_val, best_lang = sorted(predictions)[-1]
        results.append((real_lang, best_lang))
    
    correct = 0
    incorrect = 0
    
    for a, b in results:
        if a == b:
            correct += 1
        else:
            incorrect += 1
    
    return [results, correct, incorrect, len(real), float(correct) / len(real), float(incorrect) / len(real)]

In [46]:
class GMMExperiment(object):
    def __init__(self, languages, components, train_max_count, test_max_count, seconds=10, skip_augment=False):
        self.manager = MFCCFeatureFileManager(seconds=seconds, skip_augment=skip_augment)
        
        self.languages = languages
        self.components = components
        self.train_max_count = train_max_count
        self.test_max_count = test_max_count
        
        self.model_by_language = create_all_gmm_models(self.languages, self.components)
        self.test_result = None
        
        self.model_uid = u'gmm#{}#{}#{}#{}#{}#{}'.format(
            u'$'.join(self.languages), self.components, self.train_max_count, self.test_max_count,
            self.manager.seconds, self.manager.skip_augment
        )
    
    def fit(self):
        fit_all_gmm_models(self.model_by_language, self.components, self.manager, self.train_max_count)
    
    def evaluate(self):
        real, predicted_by_language = run_gmm_test(
            self.model_by_language,
            self.languages,
            self.test_max_count * len(self.languages),
            self.manager
        )
        evaluated_results = evaluate_predict_results(real, predicted_by_language)
        self.test_result = {'real': list(real), 'predicted': predicted_by_language, 'eval': evaluated_results}
        print evaluated_results[1:]

    def save(self):
        with open('/home/kolegor/{}.model.pickle'.format(self.model_uid), 'wb') as outf:
            pickle.dump(self.model_by_language, outf)
        with open('/home/kolegor/{}.results.pickle'.format(self.model_uid), 'wb') as outf:
            pickle.dump(self.test_result, outf)
    
    def cleanup(self):
        del self.model_by_language
        del self.test_result

In [47]:
# gexp0 = GMMExperiment(['sa_isiXhosa', 'sa_sesotho'], 64, 3000, 600, seconds=4, skip_augment=False)
# gexp1 = GMMExperiment(['sa_isiXhosa', 'sa_sesotho'], 256, 3000, 600, seconds=4, skip_augment=False)
# gexp2 = GMMExperiment(['sa_isiXhosa', 'sa_sesotho'], 1024, 3000, 600, seconds=4, skip_augment=False)
# gexp3 = GMMExperiment(['sa_isiXhosa', 'sa_sesotho'], 256, 3000, 600, seconds=4, skip_augment=False)
# gexp4 = GMMExperiment(['sa_isiXhosa', 'sa_sesotho'], 1024, 3000, 600, seconds=4, skip_augment=True)
# gexp5 = GMMExperiment(['sa_isiXhosa', 'sa_sesotho'], 256, 3000, 600, seconds=4, skip_augment=True)

In [48]:
many_languages = ['ab_English', 'ab_German', 'ab_Russian', 'khmer', 'nepali', 'sa_afrikaans', 'sa_sesotho']
gexp6 = GMMExperiment(many_languages, 256, 2000, 600, seconds=3, skip_augment=False)
# gexp7 = GMMExperiment(many_languages, 2048, 2000, 500, seconds=3, skip_augment=False)

In [49]:
big_languages = ['ab_English', 'ab_German', 'ab_Russian']
# gexp8 = GMMExperiment(big_languages, 64, 3000, 500, seconds=9, skip_augment=False)
# gexp9 = GMMExperiment(big_languages, 256, 3000, 500, seconds=9, skip_augment=False)
# gexp10 = GMMExperiment(big_languages, 1024, 3000, 500, seconds=9, skip_augment=False)
gexp11 = GMMExperiment(big_languages, 256, 3000, 700, seconds=9, skip_augment=False)
# gexp12 = GMMExperiment(big_languages, 1024, 3000, 500, seconds=9, skip_augment=True)
gexp13 = GMMExperiment(big_languages, 256, 3000, 700, seconds=9, skip_augment=True)

In [50]:
big_languages_2 = ['ab_English', 'ab_German', 'ab_Russian', 'ab_Hebrew', 'ab_Danish']
# gexp14 = GMMExperiment(big_languages_2, 64, 2000, 400, seconds=9, skip_augment=False)
# gexp15 = GMMExperiment(big_languages_2, 256, 2000, 400, seconds=9, skip_augment=False)
gexp16 = GMMExperiment(big_languages_2, 256, 2000, 600, seconds=9, skip_augment=False)

In [51]:
all_experiments = [
    gexp6, gexp11, gexp13, gexp16
]

In [52]:
def run_one_proc_experiments(exps, uid):
    for i, exp in enumerate(exps):
        print u'START. PROC {}. EXP {}/{}.'.format(uid, i + 1, len(exps))
        
        try:
            print u' - FIT ({}, {}/{})'.format(uid, i + 1, len(exps))
            exp.fit()
            print u' - EVAL ({}, {}/{})'.format(uid, i + 1, len(exps))
            exp.evaluate()
            print u' - SAVE ({}, {}/{})'.format(uid, i + 1, len(exps))
            exp.save()
            exp.cleanup()
        except:
            print traceback.format_exc()
            print u' * EXCEPTION. PROC {}. EXP {}/{}. SKIP'.format(uid, i + 1, len(exps))
            continue
    

def run_experiments_procs(processes=4):
    procs = []

    chunk_size = (len(all_experiments) + processes - 1) / processes
    for chunk in xrange(processes):
        chunk_exps = all_experiments[chunk * chunk_size:min(len(all_experiments), (chunk + 1) * chunk_size)]

        proc = Process(target=run_one_proc_experiments, args=(chunk_exps, chunk))
        procs.append(proc)
        proc.start()

    # complete the processes
    for proc in procs:
        proc.join()

In [53]:
run_experiments_procs(processes=1)

START. PROC 0. EXP 1/4.
 - FIT (0, 1/4)
 - Load X 1/7
 - 0 2000 0
 - 500 2000 1164
 - Fit 1/7




 - Load X 2/7
 - 0 2000 0
 - Fit 2/7
 - Load X 3/7
 - 0 2000 0
 - 500 2000 1173
 - Fit 3/7
 - Load X 4/7
 - 0 2000 0
 - 500 2000 1333
 - Fit 4/7
 - Load X 5/7
 - 0 2000 0
 - Fit 5/7
 - Load X 6/7
 - 0 2000 0
 - 500 2000 888
 - 1000 2000 1784
 - Fit 6/7
 - Load X 7/7
 - 0 2000 0
 - Fit 7/7
 - EVAL (0, 1/4)
Loading TEST
 - 0 4200 0
 - 500 4200 2464
 - 0 4200 0
 - 500 4200 2457
 - 0 4200 0
 - 500 4200 2497
 - 0 4200 0
 - 500 4200 1113
 - 1000 4200 2241
 - 1500 4200 3360
 - 0 4200 0
 - 500 4200 1241
 - 1000 4200 2480
 - 1500 4200 3748
 - 0 4200 0
 - 500 4200 964
 - 1000 4200 1902
 - 0 4200 0
 - 500 4200 1368
 - 1000 4200 2768
 - 1500 4200 4116
[673, 3527, 4200, 0.16023809523809524, 0.8397619047619047]
 - SAVE (0, 1/4)
START. PROC 0. EXP 2/4.
 - FIT (0, 2/4)
 - Load X 1/3
 - 0 3000 0
 - 500 3000 568
 - 1000 3000 1119
 - 1500 3000 1671
 - 2000 3000 2243
 - 2500 3000 2796
 - Fit 1/3
 - Load X 2/3
 - 0 3000 0
 - 500 3000 597
 - 1000 3000 1185
 - 1500 3000 1780
 - 2000 3000 2375
 - 2500 3000 29

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



 * EXCEPTION. PROC 0. EXP 3/4. SKIP
START. PROC 0. EXP 4/4.
 - FIT (0, 4/4)
 - Load X 1/5
 - 0 2000 0
Traceback (most recent call last):
  File "<ipython-input-52-cd7858913bff>", line 7, in run_one_proc_experiments
    exp.fit()
  File "<ipython-input-46-a70ba2927853>", line 19, in fit
    fit_all_gmm_models(self.model_by_language, self.components, self.manager, self.train_max_count)
  File "<ipython-input-44-f0c7fa2f26cb>", line 14, in fit_all_gmm_models
    X = load_gmm_data(language, 'train', max_count, manager)
  File "<ipython-input-43-f6f2f0551502>", line 21, in load_gmm_data
    return get_language_dataset_data(dataset, max_count)
  File "<ipython-input-43-f6f2f0551502>", line 8, in get_language_dataset_data
    loaded = dataset.manager.load(filepath)
  File "<ipython-input-5-29d13f746c78>", line 19, in load
    file_data = self._load_file(path)
  File "<ipython-input-6-ef8856755d51>", line 8, in _load_file
    return np.loadtxt(path)
  File "/usr/local/lib/python2.7/dist-packag

IndexError: string index out of range

## Experiment results

In [55]:
path1 = '/home/kolegor/result.cnn__spectr__ab_English#ab_Russian__20000_10000_5000__9sec__aug.pickle'
with open(path1, 'r') as inf:
    data1 = pickle.load(inf)
data1

[{'acc': [0.9464139344262295,
   0.9881147540983607,
   0.9914617486338798,
   0.9934084699453551],
  'loss': [0.1342539332273971,
   0.03442741199977832,
   0.025367211781744257,
   0.019217864393748806],
  'val_acc': [0.6178030303030303,
   0.6566287878787879,
   0.46912878787878787,
   0.6056818181818182],
  'val_loss': [3.205211460229122,
   3.1818838964809073,
   3.7943293333053587,
   4.1201699957703095]},
 [3.0749430269906015, 0.734469696969697]]

In [56]:
path2 = '/home/kolegor/result.cnn__spectr__ab_English#ab_Russian__20000_10000_5000__9sec__NO_aug.pickle'
with open(path2, 'r') as inf:
    data2 = pickle.load(inf)
data2

[{'acc': [0.9546106557377049,
   0.9942964480874317,
   0.9967213114754099,
   0.9974043715846994],
  'loss': [0.12049935693097813,
   0.01853571963003406,
   0.009686795414060782,
   0.008042469599610648],
  'val_acc': [0.506439393939394,
   0.5776515151515151,
   0.5547348484848484,
   0.5356060606060606],
  'val_loss': [3.128516874891339,
   3.791149545438362,
   4.508427002935699,
   4.903973020206799]},
 [4.127637905785532, 0.6269886363636363]]

In [57]:
path3 = '/home/kolegor/result.cnn__spectr__sa_afrikaans#khmer#nepali#sa_sesotho__7000_2000_500__2sec__aug.pickle'
with open(path3, 'r') as inf:
    data3 = pickle.load(inf)
data3

[{'acc': [0.7020105531295487,
   0.9098890101892285,
   0.9388191411935953,
   0.9513282387190685],
  'loss': [0.6875336158375379,
   0.23914713782883282,
   0.1645430876891057,
   0.12773855280922794],
  'val_acc': [0.7777217741935484,
   0.6875,
   0.8150201612903226,
   0.8568548387096774],
  'val_loss': [0.5415891235874545,
   0.8141414729818222,
   0.5098349149188688,
   0.42031823735563983]},
 [0.28748251347593506, 0.9033801020408163]]

In [None]:
def mean_gmm_results(real, predicted_by_language):
    results = []  # list of tuples (real_lang, predicted_lang)
    
    for i, real_lang in enumerate(real):
        predictions = [(pr[i], language) for language, pr in predicted_by_language.iteritems()]
        best_val, best_lang = sorted(predictions)[-1]
        results.append((real_lang, best_lang))
    
    correct = 0
    incorrect = 0
    
    for a, b in results:
        if a == b:
            correct += 1
        else:
            incorrect += 1
    
    return [results, correct, incorrect, len(real), float(correct) / len(real), float(incorrect) / len(real)]

In [63]:
path4 = '/home/kolegor/gmm#sa_isiXhosa$sa_sesotho#512#3000#600#4#True.results.pickle'
with open(path4, 'r') as inf:
    data4 = pickle.load(inf)
print len(data4['predicted']['sa_isiXhosa'])

1200


In [None]:
print data4['eval']
print mean_gmm_results(data4['real'], data4['predicted'])

#### 1

languages = ['ab_Portuguese', 'ab_Russian', 'ab_English', 'ab_French', 'ab_German']
seconds = 5, skip_augment = False

Epoch 1/4
1419/1419 [==============================] - 5687s 4s/step - loss: 0.4233 - acc: 0.8370 - val_loss: 3.0405 - val_acc: 0.5762
Epoch 2/4
1419/1419 [==============================] - 5266s 4s/step - loss: 0.1159 - acc: 0.9614 - val_loss: 3.1389 - val_acc: 0.5513
Epoch 3/4
1419/1419 [==============================] - 5178s 4s/step - loss: 0.0742 - acc: 0.9752 - val_loss: 2.5753 - val_acc: 0.6113
Epoch 4/4
1419/1419 [==============================] - 5163s 4s/step - loss: 0.0564 - acc: 0.9814 - val_loss: 3.3037 - val_acc: 0.5399


gexp0 = GMMExperiment(['sa_isiXhosa', 'sa_sesotho'], 64, 3000, 600, seconds=4, skip_augment=False)
gexp1 = GMMExperiment(['sa_isiXhosa', 'sa_sesotho'], 256, 3000, 600, seconds=4, skip_augment=False)
[743, 457, 1200, 0.6191666666666666, 0.38083333333333336]
[800, 400, 1200, 0.6666666666666666, 0.3333333333333333]

gexp3 = GMMExperiment(['sa_isiXhosa', 'sa_sesotho'], 512, 3000, 600, seconds=4, skip_augment=False)
[855, 345, 1200, 0.7125, 0.2875]

gexp5 = GMMExperiment(['sa_isiXhosa', 'sa_sesotho'], 512, 3000, 600, seconds=4, skip_augment=True)
[849, 351, 1200, 0.7075, 0.2925]

many_languages = ['ab_English', 'ab_German', 'ab_Russian', 'khmer', 'nepali', 'sa_afrikaans', 'sa_sesotho']
gexp6 = GMMExperiment(many_languages, 512, 2000, 500, seconds=3, skip_augment=False)
[666, 2834, 3500, 0.19028571428571428, 0.8097142857142857]

big_languages = ['ab_English', 'ab_German', 'ab_Russian']
gexp11 = GMMExperiment(big_languages, 512, 3000, 500, seconds=9, skip_augment=False)
[552, 948, 1500, 0.368, 0.632]

big_languages = ['ab_English', 'ab_German', 'ab_Russian']
gexp13 = GMMExperiment(big_languages, 512, 3000, 500, seconds=9, skip_augment=True)
[520, 980, 1500, 0.3466666666666667, 0.6533333333333333]

big_languages_2 = ['ab_English', 'ab_German', 'ab_Russian', 'ab_Hebrew', 'ab_Danish']
gexp16 = GMMExperiment(big_languages_2, 512, 2000, 400, seconds=9, skip_augment=False)
[408, 1592, 2000, 0.204, 0.796]