In [1]:
# Force CPU-only execution. Both variables are set before the deep-learning
# backend is imported in the next cell.
import os

os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "",          # empty list -> no CUDA device is exposed
})

In [1]:
import keras
import os
import numpy as np
from keras.models import load_model
from matplotlib import pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Project root and the data directory below it (feature files are read from there).
project_dir = '/home/rabreu/projeto_multimodal/'
data_dir = '{}data/'.format(project_dir)

# AudioSet CSV files live under <project_dir>/segments/.
segments_dir = '{}segments/'.format(project_dir)
audioset_indices_csv = segments_dir + 'subset_class_labels_indices.csv'

## Training split: taken from the unbalanced AudioSet set and re-balanced by the author.
## NOTE(review): the original comment said 600 occurrences per class, while the
## file and directory names say "2000" -- confirm which cap was actually used.
train_sub_dir = '2000unbalanced_train'
audioset_train_csv = segments_dir + '2000max_subset_unbalanced_train_segments.csv'
audio_train_sub_dir = train_sub_dir + '/audio'
video_train_sub_dir = train_sub_dir + '/video'

## Evaluation split.
eval_sub_dir = 'eval'
audioset_eval_csv = segments_dir + 'subset_eval_segments.csv'
audio_eval_sub_dir = eval_sub_dir + '/audio'
video_eval_sub_dir = eval_sub_dir + '/video'

### Carregando dados de avaliação

In [3]:

## Audio: features and labels are stored side by side as "_x.npy" / "_y.npy".
save_dir = data_dir + audio_eval_sub_dir + "/2000features/cnn/"
feature_file = os.path.join(save_dir, '_x.npy')
labels_file = os.path.join(save_dir, '_y.npy')
audio_eval_x = np.load(feature_file)
## The labels are loaded only once: per the original note they are the same
## for the audio and the video features.
eval_y = np.load(labels_file)
print(audio_eval_x.shape)

# Append a trailing singleton axis to the audio features
# (presumably the channel axis the audio CNN expects -- TODO confirm).
audio_eval_x = audio_eval_x[..., np.newaxis]


## Video: only the feature file is read here.
save_dir = data_dir + video_eval_sub_dir + "/features/"
feature_file = os.path.join(save_dir, '_x.npy')
video_eval_x = np.load(feature_file)
print(video_eval_x.shape)

## Map pixel values into the 0..1 range
## (assumes the stored values span 0..255 -- TODO confirm).
video_eval_x = video_eval_x.astype('float32') / 255

(532, 60, 420)
(532, 32, 32, 32, 3)


### Carregando modelos e predizendo o conjunto de avaliação

In [4]:
# Restore the two previously saved single-modality models from disk
# (paths are relative to the notebook's working directory).
audio_model_path = 'modelos_salvos/2000audio_cnn.h5'
video_model_path = 'modelos_salvos/2000video_cnn.h5'

audio_model = load_model(audio_model_path)
video_model = load_model(video_model_path)

In [5]:
from utils import top_3_accuracy

# The saved multimodal model references `top_3_accuracy` by name, so the
# function is handed to the loader through `custom_objects`.
multimodal_custom_objects = {'top_3_accuracy': top_3_accuracy}
multimodal_model = load_model(
    'modelos_salvos/multimodal.h5',
    custom_objects=multimodal_custom_objects,
)

In [6]:
from sklearn.metrics import hamming_loss

# Score cut-off at or above which a label counts as predicted.
PREDICTION_THRESHOLD = 0.5


def binarize_predictions(scores, threshold=PREDICTION_THRESHOLD):
    """Turn per-label scores into hard 0/1 decisions.

    Parameters
    ----------
    scores : array-like
        Per-label scores as returned by ``model.predict``.
    threshold : float, optional
        Scores greater than or equal to this value become 1, all others 0.

    Returns
    -------
    numpy.ndarray
        New array with the same shape and dtype as ``scores`` holding only
        0 and 1. ``scores`` itself is not modified.
    """
    scores = np.asarray(scores)
    return (scores >= threshold).astype(scores.dtype)


# Predict the evaluation set with each single-modality model and binarize.
audio_predictions = binarize_predictions(audio_model.predict([audio_eval_x]))
video_predictions = binarize_predictions(video_model.predict([video_eval_x]))

# Hamming loss of each model against the shared evaluation labels.
print("Audio Hamming Loss:",hamming_loss(eval_y,audio_predictions))
print("Video Hamming Loss:",hamming_loss(eval_y,video_predictions))

('Audio Hamming Loss:', 0.1458109559613319)
('Video Hamming Loss:', 0.17185821697099893)


### Somando os valores de cada predição

Será que as duas redes individuais combinadas se saem melhor?

Vamos salvar os vetores de predição de cada uma das redes separadas (esses valores saíram dos notebooks Jupyter de vídeo e de áudio).

Depois somamos as duas predições (limitando a 1 os valores acima de 1) e calculamos as métricas para ver se o resultado é melhor que o da rede multimodal.

In [7]:
from sklearn.metrics import  f1_score

# Late fusion: add the two binarized prediction arrays and cap the sum at 1,
# so a label is set when either model predicted it.
mix_audio_video_pred = np.clip(audio_predictions + video_predictions, 0, 1)
print("Hamming Loss:",hamming_loss(eval_y,mix_audio_video_pred))
print(" F1 Score:",f1_score(eval_y, mix_audio_video_pred,average='macro'))

# Exact match: rows whose difference to the ground truth is zero everywhere.
predicted_diference = eval_y - mix_audio_video_pred
exact_match_rows = ~predicted_diference.any(axis=1)
correcly_predicted = np.flatnonzero(exact_match_rows)
exact_match_total = correcly_predicted.shape[0]
print("Total of Exact match:",exact_match_total)
# float() keeps the ratio a true division even under Python 2.
print("Accuracy of Exact match:", float(exact_match_total)/eval_y.shape[0])



('Hamming Loss:', 0.1683673469387755)
(' F1 Score:', 0.57436493541268763)
('Total of Exact match:', 160)
('Accuracy of Exact match:', 0.3007518796992481)


In [8]:
from utils import multilabel_confusion_matrix

# Third positional argument of the project helper; presumably the number of
# label columns (the recorded output lists rows 0..6) -- TODO confirm in utils.
num_labels = 7
multilabel_confusion_matrix(eval_y, mix_audio_video_pred, num_labels)

Multilabel Confusion Matrix
  TP,   FP,     TN,     FN, 
0 128	112	247	45
1 26	18	461	27
2 65	55	385	27
3 54	130	340	8
4 37	28	444	23
5 33	46	431	22
6 115	39	331	47
Σ 458	428	2639	199

F1 Score: 0.593649
Recall: 0.697108
Precision: 0.516930
Hamming Loss: 0.168367
