In [1]:
################################
#### IMPORTS AND INICIALIZATIONS
################################

import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from matplotlib import pyplot as plt
import math
from keras import layers as l
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, Reshape, Conv2DTranspose, BatchNormalization, Dropout, Flatten, concatenate
from keras.models import Model
from keras.utils import plot_model, get_file
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt
import re

data = 'RAVEN/TrialRightWrong/Trial'

# All the files we need
files = ['/DEI_trial_by_trial_Right.xlsx', 
         '/DEI_trial_by_trial_Wrong.xlsx', 
         '/esec_trial_by_trial_Right.xlsx',
         '/esec_trial_by_trial_Wrong.xlsx']
genreFile = 'RAVEN/Informação_género.txt'

# Initialize a dict for aggregate the values for training and testing
values = { 'TRAINING': {}, 'TESTING': {} }

Using TensorFlow backend.


In [2]:
####################################
#### EXTRACT DATA AND PRE-PROCESSING
####################################

for file in files:
    
    # For each file
    f = pd.ExcelFile(data+file)

    # For each Train
    for sheet in f.sheet_names:
        
        df = pd.read_excel(f,  sheet_name=sheet)
        
        category = 'TRAINING' if 'TRAINING' in sheet \
                                else 'TESTING' \
                                    if 'TESTING' in sheet \
                                        else None  
        
        # Get all values for each person
        # In the 'unnamed:22' column is where person id is
        # Notice that if a person isn't in the 'right files', so he is in the 'wrong files' 
        if category and df.shape[0] > 0:
            dataValues = values[category]

            for index, row in df[df.columns].iterrows():
                # cell = [genre, course, value_each_characteristic, correct(1)/incorret(0)]
                *rest,id = row 
                cell = [int("DEI" in id)] #[int(row[0]=="DEI"),sheet, row[1]]
                cell += rest
                cell += [1 if (file=='/DEI_trial_by_trial_Right.xlsx' or file=='/esec_trial_by_trial_Right.xlsx') else 0]
     
                if (id not in values):
                    dataValues[id] = [cell]
                else:
                    dataValues[id] += [cell] 
                    
# Extract the genre from genreFile
with open(genreFile, "r") as f:
    
    # For each file get the information about person_id, genre and the number of correct answers in the trials.
    for l in f:
        l = l.strip()
        if l=="" or '--' in l:continue
        id, genre, rest = l.split(' - ')
        
        correct = int(re.search(r'\d+', rest).group())
        if id[-2] == '_': id = id[:-1] + '0' + id[-1]
        for dataDict in [dataValues['TRAINING'], dataValues['TESTING']]: # Get dictionaries 'train and test'
            for i in range(len(dataDict[id])):
                dataDict[id][i][:0] = [int(genre=='Masculino')] # Add the genre in the begining of the cell  
                dataDict[id][i][-1:-1] = [correct] # Add the number of right answers 

    

# For each person ( PERSON : list([genre, course, characteristic_value, correct/incorrect ]))        
items = { 'TRAINING': [], 'TESTING': [] }
for dataType in ['TRAINING','TESTING']:
    for v in values[dataType].values():
        items[dataType] += v
items['TRAINING'] = np.matrix(items['TRAINING'])
items['TESTING'] = np.matrix(items['TESTING'])

Itrain = items['TRAINING']   # Changing missing values by the average
col_mean = np.nanmean(Itrain, axis=0)
inds = np.where(np.isnan(Itrain))
Itrain[inds] = np.take(col_mean, inds[1])

scaler = MinMaxScaler()
scaler.fit(Itrain[:,:-1])

Itest = items['TESTING']   # Changing missing values by the average
col_mean = np.nanmean(Itest, axis=0)
inds = np.where(np.isnan(Itest))
Itest[inds] = np.take(col_mean, inds[1])

data_trainX = scaler.transform(Itrain[:,:-1])
data_trainY = Itrain[:,-1]

data_testX = scaler.transform(Itest[:,:-1])
data_testY = Itest[:,-1] 

KeyError: 'TRAINING'

In [None]:
################################
########################### PCA
################################

pca = PCA(n_components=2)
pcaData = pca.fit_transform(data_trainX[:,2:-1])

plt.scatter(pcaData[:,0], pcaData[:,1])

plt.show()

In [None]:
################################
################ NEURAL NETWORK
################################

inputs = Input(shape=(data_trainX.shape[1],))
x = Dense(32, activation='relu')(inputs)
x = Dense(8, activation='relu')(x)
x = Dense(8, activation='relu')(x)
outputs = Dense(1, activation='sigmoid')(x)
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

model.fit(data_trainX, data_trainY, epochs=16, batch_size=4)
predicts = model.predict(data_testX)>0.5
print('accuracy:',(predicts==data_testY).sum()/predicts.shape[0])



In [None]:
################################
####### NORMALIZE AND AGGREGATE
################################

avg = []
course_genre = []
correct = []

group = dataX[:, 0]+dataX[:, 1]*2
cg, count = np.unique(group, return_counts=True)

for t,c in zip(cg,count):
    validClass = (group == t) #True if it belongs to group genre/course
    fadigue = validClass * data_trainX[:,2]

    correctAnswers = validClass.T * data_trainY
    percCorrectAnswers = correctAnswers.sum()/c
    correct.append(percCorrectAnswers*100)

    avg.append(fadigue.sum()/c)
    course_genre.append(t)

print('Course with Genre:',course_genre, ', Avg:', avg)


In [None]:
################################
#### PLOT INFO 
################################

# Fatigue Values for the each course and each genre
%matplotlib inline

x = ['ESEC AND FEMIN','ESEC AND MASC', 'DEI AND FEMIN', 'DEI AND MASC']
y = course_genre
plt.figure(figsize=(15,12))
plt.bar(x,y)
plt.title("Fatigue Values for the each course and each genre")
plt.xlabel('Each course with each genre')
plt.xticks(fontsize=9 , rotation=30)
plt.ylabel('Mean Fatigue Value')
plt.show()

# % of correct answers for the each course and each genre

plt.figure(figsize=(15,12))
y = correct
plt.bar(x,y)
plt.title("% of correct answers for the each course and each genre")
plt.xlabel('Each course with each genre')
plt.xticks(fontsize=9 , rotation=30)
plt.ylabel('% of correct answers')
plt.show()