In [1]:
import mxnet as mx
import numpy as np

from sklearn import svm
from sklearn import metrics
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

import os
import sys
import math
import time
import pickle

# Input locations are resolved against the directory the notebook is started from.
proj_dir = os.getcwd()
lld_dir = os.path.join(proj_dir, 'emotiondetection/features_labels_lld/lld')
label_dir = os.path.join(proj_dir, 'emotiondetection/features_labels_lld/labels')

# File names under which the label (y) and feature (x) arrays are pickled.
pickle_train_x_list, pickle_train_y_list = 'pickle_train_x_list', 'pickle_train_y_list'
pickle_test_x_list, pickle_test_y_list = 'pickle_test_x_list', 'pickle_test_y_list'

In [2]:
train_y_dict = {}
with open(os.path.join(label_dir, 'train.txt'), 'r') as f:
    for line in f.readlines():
        line = line.strip()
        line_list = line.split(' ')
        train_y_dict[line_list[0]] = line_list[1]
print 'train_y_dict: ', len(train_y_dict)

test_y_dict = {}
with open(os.path.join(label_dir, 'test.txt'), 'r') as f:
    for line in f.readlines():
        line_list = line.strip().split(' ')
        test_y_dict[line_list[0]] = line_list[1]
print 'test_y_dict: ', len(test_y_dict)

train_y_dict:  9959
test_y_dict:  8257


In [3]:
train_x_dict = {}
for file_name in os.listdir(os.path.join(lld_dir, 'train')):
    with open(os.path.join(lld_dir, 'train', file_name), 'r') as f:
        key = file_name.split('.')[0]
        train_x_dict[key] = [float(line.strip()) for line in f.readlines()]
print 'train_x_dict: ', len(train_x_dict)

test_x_dict = {}
for file_name in os.listdir(os.path.join(lld_dir, 'test')):
    with open(os.path.join(lld_dir, 'test', file_name), 'r') as f:
        key = file_name.split('.')[0]
        test_x_dict[key] = [float(line.strip()) for line in f.readlines()]
print 'test_x_dict: ', len(test_x_dict)

train_x_dict:  9959
test_x_dict:  8257


In [4]:
# Class labels in index order; the position of a label is its integer class id.
CLASS_LABELS = ('A', 'E', 'N', 'P', 'R')


def label_to_int(lb):
    """Return the integer class id (0-4) of the label string ``lb``.

    Raises:
        ValueError: if ``lb`` is not one of ``CLASS_LABELS``.
    """
    try:
        return CLASS_LABELS.index(lb)
    except ValueError:
        # Re-raise with the offending value and the accepted set spelled out.
        raise ValueError('unknown class label %r; expected one of %s'
                         % (lb, ', '.join(CLASS_LABELS)))

In [5]:
def _require_same_keys(x_dict, y_dict, split_name):
    """Raise ValueError unless ``x_dict`` and ``y_dict`` have identical key sets.

    Features and labels are paired below purely by their position after
    sorting on the key, so any difference in keys would misalign them.
    """
    x_keys, y_keys = set(x_dict), set(y_dict)
    if x_keys != y_keys:
        raise ValueError(
            '%s: feature/label keys differ (%d only in features, %d only in labels)'
            % (split_name, len(x_keys - y_keys), len(y_keys - x_keys)))


_require_same_keys(train_x_dict, train_y_dict, 'train')
_require_same_keys(test_x_dict, test_y_dict, 'test')

# Sorting the (key, value) pairs orders both sequences by key; keys are unique,
# so the values themselves are never compared.
train_y_items = train_y_dict.items()
train_y_list = [label_to_int(label) for key, label in sorted(train_y_items)]

train_x_items = train_x_dict.items()
train_x_list = [feats for key, feats in sorted(train_x_items)]

test_y_items = test_y_dict.items()
test_y_list = [label_to_int(label) for key, label in sorted(test_y_items)]

test_x_items = test_x_dict.items()
test_x_list = [feats for key, feats in sorted(test_x_items)]

In [6]:
# Turn the Python lists into NumPy arrays: label vectors first, then features.
train_y_list, test_y_list = np.array(train_y_list), np.array(test_y_list)
train_x_list, test_x_list = np.array(train_x_list), np.array(test_x_list)

In [7]:
print train_y_list.shape, train_x_list.shape
print test_y_list.shape, test_x_list.shape

(9959,) (9959, 384)
(8257,) (8257, 384)


In [8]:
# Cache the four arrays on disk, one pickle file each. The files are opened
# in binary mode inside a `with` block so every handle is flushed and closed.
for _path, _array in (
        (pickle_train_y_list, train_y_list),
        (pickle_train_x_list, train_x_list),
        (pickle_test_y_list, test_y_list),
        (pickle_test_x_list, test_x_list)):
    with open(_path, 'wb') as _f:
        pickle.dump(_array, _f)

In [9]:
# File names under which the MFCC feature arrays are pickled.
pickle_train_mfcc_x_list, pickle_test_mfcc_x_list = (
    'pickle_train_mfcc_x_list', 'pickle_test_mfcc_x_list')

# MFCC CSV directory, resolved against the current working directory.
proj_dir = os.getcwd()
mfcc_dir = os.path.join(proj_dir, 'emotiondetection/mfcc_csv')

In [10]:
train_mfcc_x_dict = {}
for file_name in os.listdir(os.path.join(mfcc_dir, 'train')):
    with open(os.path.join(mfcc_dir, 'train', file_name), 'r') as f:
        key = file_name.split('.')[0]
        train_mfcc_x_dict[key] = np.loadtxt(f, delimiter=',')
print 'train_mfcc_x_dict: ', len(train_mfcc_x_dict)

test_mfcc_x_dict = {}
for file_name in os.listdir(os.path.join(mfcc_dir, 'test')):
    with open(os.path.join(mfcc_dir, 'test', file_name), 'r') as f:
        key = file_name.split('.')[0]
        test_mfcc_x_dict[key] = np.loadtxt(f, delimiter=',')
print 'test_x_dict: ', len(test_mfcc_x_dict)

train_mfcc_x_dict:  9959
test_x_dict:  8257


In [11]:
# Order the MFCC arrays by key. Keys are unique, so sorting the (key, array)
# pairs never has to compare the arrays themselves.
# NOTE(review): pairing these with the label arrays assumes the MFCC keys are
# the same set as the label keys; that is not checked here.
train_mfcc_x_items = train_mfcc_x_dict.items()
test_mfcc_x_items = test_mfcc_x_dict.items()

train_mfcc_x_list = [mfcc for key, mfcc in sorted(train_mfcc_x_items)]
test_mfcc_x_list = [mfcc for key, mfcc in sorted(test_mfcc_x_items)]

In [12]:
def _as_object_array(arrays):
    """Pack a sequence of arrays into a 1-D object array, one slot per array.

    Filling the slots one by one keeps the result 1-D whatever the shapes of
    the individual arrays are; it does not rely on ``np.array`` guessing that
    a ragged input should become an object array.
    """
    packed = np.empty(len(arrays), dtype=object)
    for idx, arr in enumerate(arrays):
        packed[idx] = arr
    return packed


train_mfcc_x_list = _as_object_array(train_mfcc_x_list)

test_mfcc_x_list = _as_object_array(test_mfcc_x_list)

In [13]:
print train_y_list.shape, train_mfcc_x_list.shape
print test_y_list.shape, test_mfcc_x_list.shape

(9959,) (9959,)
(8257,) (8257,)


In [14]:
# Cache the MFCC arrays on disk. Binary mode plus `with` ensures each file is
# flushed and closed once its pickle has been written.
with open(pickle_train_mfcc_x_list, 'wb') as f:
    pickle.dump(train_mfcc_x_list, f)

with open(pickle_test_mfcc_x_list, 'wb') as f:
    pickle.dump(test_mfcc_x_list, f)

In [18]:
for i in range(5):
    print train_mfcc_x_list[i].shape

(104, 39)
(120, 39)
(237, 39)
(127, 39)
(101, 39)


# Down-sample the over-represented classes in the training data

In [2]:
train_y_list = pickle.load(open(pickle_train_y_list))
train_x_list = pickle.load(open(pickle_train_x_list))

test_y_list = pickle.load(open(pickle_test_y_list))
test_x_list = pickle.load(open(pickle_test_x_list))

# ENN down-sampling training data
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN
enn = EditedNearestNeighbours()
# enn = SMOTEENN()

train_x_list_enn, train_y_list_enn = enn.fit_sample(train_x_list, train_y_list)

# split training data to 5 different classes
train_x_lists = [[] for i in range(5)]
train_y_lists = [[] for i in range(5)]
for i in xrange(len(train_y_list_enn)):
    train_x_lists[int(train_y_list_enn[i])].append(train_x_list_enn[i])
    train_y_lists[int(train_y_list_enn[i])].append(train_y_list_enn[i])
for i in xrange(5):
    print 'number of class', i, ':', len(train_x_lists[i])

train_x_lists_enn = train_x_lists
train_y_lists_enn = train_y_lists



number of class 0 : 3
number of class 1 : 59
number of class 2 : 1373
number of class 3 : 674
number of class 4 : 2


In [3]:
train_y_list = pickle.load(open(pickle_train_y_list))
train_x_list = pickle.load(open(pickle_train_x_list))

test_y_list = pickle.load(open(pickle_test_y_list))
test_x_list = pickle.load(open(pickle_test_x_list))

# split training data to 5 different classes
train_x_lists = [[] for i in range(5)]
train_y_lists = [[] for i in range(5)]
for i in xrange(len(train_y_list)):
    train_x_lists[int(train_y_list[i])].append(train_x_list[i])
    train_y_lists[int(train_y_list[i])].append(train_y_list[i])
for i in xrange(5):
    print 'number of class', i, ':', len(train_x_lists[i]), len(train_y_lists[i])
print '------------------'

train_x_lists[2] = train_x_lists_enn[2]
train_y_lists[2] = train_y_lists_enn[2]

train_x_list = np.vstack(train_x_lists)
train_y_list = np.concatenate(train_y_lists)

# split training data to 5 different classes
train_x_lists = [[] for i in range(5)]
for i in xrange(len(train_y_list)):
    train_x_lists[int(train_y_list[i])].append(train_x_list[i])
for i in xrange(5):
    print 'number of class', i, ':', len(train_x_lists[i])

number of class 0 : 881 881
number of class 1 : 2093 2093
number of class 2 : 5590 5590
number of class 3 : 674 674
number of class 4 : 721 721
------------------
number of class 0 : 881
number of class 1 : 2093
number of class 2 : 1373
number of class 3 : 674
number of class 4 : 721


In [4]:
# File names for the reduced training set.
pickle_train_x_small_list = 'pickle_train_x_small_list'
pickle_train_y_small_list = 'pickle_train_y_small_list'

# Write each array in binary mode; the `with` block closes the file so the
# pickle is fully flushed to disk.
with open(pickle_train_x_small_list, 'wb') as f:
    pickle.dump(train_x_list, f)
with open(pickle_train_y_small_list, 'wb') as f:
    pickle.dump(train_y_list, f)