<a href="https://colab.research.google.com/github/FG2511/ARE/blob/master/model1_cross_validation_con_analisi_errori.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
'''
@File name: model1_cross_validation.ipynb
@Created on 2018-12-20
@Authors: Federica Gerina, Francesca Moi, Silvia Maria Massa
@Description: Given a time-series dataset that contains minute-by-minute data 
about different kind of gases, collected by the uHoo air quality sensor, train
a NN that classifies if a minute belongs to the class "Pasto" (1) otherwise to
the class "Other" (0).
'''

!pip install liac-arff

import arff
import math
import numpy as np
from keras import optimizers
from sklearn.utils import compute_class_weight
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

import sys
sys.path.append('local_modules')

import mlp
import postprocessing_sw
import cooking_inst_mod
import utils

#fix random seed for reproducibility
seed = 5
np.random.seed(seed)

Collecting liac-arff
  Downloading https://files.pythonhosted.org/packages/e9/35/fbc9217cfa91d98888b43e1a19c03a50d716108c58494c558c65e308f372/liac-arff-2.4.0.tar.gz
Building wheels for collected packages: liac-arff
  Building wheel for liac-arff (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/d1/6a/e7/529dc54d76ecede4346164a09ae3168df358945612710f5203
Successfully built liac-arff
Installing collected packages: liac-arff
Successfully installed liac-arff-2.4.0


Using TensorFlow backend.


In [2]:
#@title CHOOSE

'''
@Description: MAIN
'''

#LOAD DATA
print("Loading data...")

dataset = '/root/data/uHooComplete_featureDataset_Past_Only.arff' #@param {type:"string"}

with open (dataset, encoding='utf-8') as f:
  dataDictionary = arff.load(f)

data = np.array(dataDictionary['data'])
print("DATASET LOADED")

#CONVERTING VALUES
print("\nConverting values...")
for i in data:
  if(i[-1] == 'Other'): i[-1] = 0
  elif(i[-1] == 'Pasto') : i[-1] = 1

dataset = data.astype('float32')
print("CONVERSION DONE")

#SPLIT INTO INPUT (X) AND OUTPUT (Y) VARIABLES
s = dataset.shape[-1]
X = dataset[:,0:s-1]
Y = dataset[:,s-1]

n_features = s-1

#OPTIMIZERS
adm = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

#LOSS
loss = 'binary_crossentropy'

#DEFINE K-FOLD CROSS-VALIDATION
fold = 10 #@param {type:"integer"}

kfold = KFold(n_splits=fold, shuffle=False, random_state=None)

cvscores = []
predictions = []

true_p =[]
true_n = []
precision = []
fscore= []

test_time =[]
real = []

dimSplit = math.floor(len(dataset[:,0])/10)
startIndex = 0
finishIndex = dimSplit-1

i = 1

for train, test in kfold.split(X, Y):
  
  print("\nFOLD: %d" %i)
  
  #COMPUTE CLASS WEIGHT
  labels = np.unique(Y[train])
  classWeight = compute_class_weight('balanced', labels, Y[train])
  classWeight = dict(zip(labels,classWeight))

  #GENERATE MODEL
  print("\nGenerate model...")

  model = mlp.generate_model_leaky(X[train].shape[-1], n_features)

  #COMPILE MODEL
  print("\nCompile model...")
  model.compile(loss = loss, optimizer = adm , metrics=['accuracy'])

  #FIT MODEL
  print("\nFit model...")
  epochs = 8 #@param{type:"integer"}
  history = model.fit(X[train], Y[train], epochs=epochs, batch_size = 128, shuffle = True, verbose=1, class_weight = classWeight)

  #EVALUATE MODEL
  print("\nEvaluate model...")
  scores_test = model.evaluate(X[test], Y[test], batch_size= 128, verbose = 0)
  print("Test loss: %.2f%%" % (scores_test[0] * 100))
  print("Test accuracy: %.2f%%" % (scores_test[1] * 100))
  
  cvscores.append(scores_test[1] * 100)
  
  #CALCULATE PREDICTIONS
  print("\nCalculate predictions...")
  pred = model.predict_classes(X[test], batch_size=128, verbose=0)
  flat_pred = [item for sublist in pred for item in sublist]
  predictions.append(flat_pred)
  real.append(Y[test])
  
  #STORE DATETIME
  time = []  
  for j in X[test]:
    time.append(int(j[-5]))  
  test_time.append(time)
  
  #CONFUSION MATRIX AND METRICS FOR MEAN EVALUATION
  tn, fp, fn, tp = confusion_matrix(Y[test], flat_pred).ravel()
  tnr = 100*tn/(tn+fp)
  tpr = 100*tp/(fn+tp)
  prec = 100*(tp/(tp+fp))
  F1 = 2*((prec*tpr)/(prec+tpr))
  
  true_p.append(tpr)
  true_n.append(tnr)
  precision.append(prec)
  fscore.append(F1)
  
  i+=1

print("\n\n")
print("MEAN ACCURACY: %.2f%% (STANDARD DEVIATION: +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
print("MEAN TRUE POSITIVE RATE: %.2f%% (STANDARD DEVIATION: +/- %.2f%%)" % (np.mean(true_p), np.std(true_p)))
print("MEAN TRUE NEGATIVE RATE: %.2f%% (STANDARD DEVIATION: +/- %.2f%%)" % (np.mean(true_n), np.std(true_n)))
print("MEAN PRECISION: %.2f%% (STANDARD DEVIATION: +/- %.2f%%)" % (np.mean(precision), np.std(precision)))
print("MEAN F1 SCORE: %.2f%% (STANDARD DEVIATION: +/- %.2f%%)" % (np.mean(fscore), np.std(fscore)))

#FLATTENING LISTS
flat_predictions = [item for sublist in predictions for item in sublist]
true_y = [arr.tolist() for arr in real]
flat_true_y = [item for sublist in true_y for item in sublist]
flat_time = [item for sublist in test_time for item in sublist]


#CONFUSION MATRIX AND METRICS BEFORE POST PROCESSING
print("\n\nCompute confusion matrix and metrics BEFORE POST PROCESSING...")
utils.compute_metrics(flat_true_y, flat_predictions)


Loading data...
DATASET LOADED

Converting values...
CONVERSION DONE
Instructions for updating:
Colocations handled automatically by placer.

FOLD: 1

Generate model...

Compile model...

Fit model...
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8

Evaluate model...
Test loss: 47.70%
Test accuracy: 71.16%

Calculate predictions...

FOLD: 2

Generate model...

Compile model...

Fit model...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8

Evaluate model...
Test loss: 37.48%
Test accuracy: 81.24%

Calculate predictions...

FOLD: 3

Generate model...

Compile model...

Fit model...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8

Evaluate model...
Test loss: 31.92%
Test accuracy: 87.63%

Calculate predictions...

FOLD: 4

Generate mo

In [3]:
#POST PROCESSING WITH SLIDING WINDOWS (MINUTE BY MINUTE)
new_pred =  postprocessing_sw.sliding_windows(flat_predictions,35)

#CONFUSION MATRIX AND METRICS AFTER POST PROCESSING
print("\n\nCompute confusion matrix and metrics AFTER POST PROCESSING...")
utils.compute_metrics(flat_true_y, new_pred)


SLIDING WINDOWS FUNCTION...


Compute confusion matrix and metrics AFTER POST PROCESSING...
TN 288369
FP 45798
FN 4969
TP 11355
ACCURACY: 85.52 %
TRUE NEGATIVE RATE (SPECIFICITY): 86.29 %
TRUE POSITIVE RATE (RECALL): 69.56 %
PRECISION: 19.87 %
F1 SCORE: 30.91 %


In [4]:
#COOKING INSTANCE MODALITY

#before post-processing
print("\nCOOKING INSTANCE MODALITY BEFORE POST PROCESSING")
cooking_inst_mod.get_precision_recall_f1(flat_predictions,flat_true_y)
#after post-processing
print("\nCOOKING INSTANCE MODALITY AFTER POST PROCESSING")
cooking_inst_mod.get_precision_recall_f1(new_pred,flat_true_y)


COOKING INSTANCE MODALITY BEFORE POST PROCESSING
N° pasti reali: 669
N° pasti predetti: 1951
TP: 470
FP: 1422
FN: 199
Recall: 70.25 %
Precision: 24.84 %
F1 score: 36.70 %

COOKING INSTANCE MODALITY AFTER POST PROCESSING
N° pasti reali: 669
N° pasti predetti: 954
TP: 444
FP: 556
FN: 225
Recall: 66.37 %
Precision: 44.40 %
F1 score: 53.21 %


In [0]:
import csv

def create_pred_csv(pred):
  
  #LOAD SENSOR DATA
  datasetSensor = '/root/data/uHooComplete_featureDataset_Reduced.arff'

  with open (datasetSensor, encoding='utf-8') as fs:
    dataSensor = arff.load(fs)

  dataS = np.array(dataSensor['data'])

  #CONVERTING VALUES
  for i in dataS:
    if(i[-1] == 'Other'): i[-1] = 0
    else : i[-1] = 1

  new_rows = []

  for i,z in zip(dataS, pred):
    new_dict = {}
    new_dict['Datetime'] = i[2]
    new_dict['Actual'] = i[-1]
    new_dict['Predicted'] = z

    new_rows.append(new_dict)


  keys = new_rows[0].keys()


  with open("/root/data/out_pred.csv", "w", newline='') as o:
    w = csv.DictWriter(o, keys)
    w.writeheader()
    w.writerows(new_rows)

In [0]:
create_pred_csv(new_pred)

In [0]:
import csv 

fieldnames = []

with open ("/root/data/out_pred.csv", "r", newline="") as f_in:
  reader = csv.reader(f_in)
  headers = next(reader)
  for h in headers:
      if h not in fieldnames:
          fieldnames.append(h)
          
with open ("/root/data/misclassified_pasto.csv", "w", newline="") as out_p, open("/root/data/misclassified_other.csv", "w", newline="") as out_o:
  writer_1 = csv.writer(out_p)
  writer_1.writerow(fieldnames)

  writer_2 = csv.writer(out_o)
  writer_2.writerow(fieldnames) 
  
with open("/root/data/misclassified_pasto.csv", "a", newline='') as out_f_1 , open("/root/data/misclassified_other.csv", "a", newline='') as out_f_2:
  writer_1 = csv.DictWriter(out_f_1, fieldnames=fieldnames)
  writer_2 = csv.DictWriter(out_f_2, fieldnames=fieldnames)

  with open("/root/data/out_pred.csv", "r", newline="") as f_in:
      reader = csv.DictReader(f_in)

      for row in reader:
          predicted = int(row.get('Predicted'))
          actual = int(row.get('Actual'))
          minute = float(row.get('Datetime'))
          #print(predicted, actual, minute)
          
          if(actual == 1 and predicted == 0):
              writer_1.writerow(row)
          elif(actual == 0 and predicted==1):
              writer_2.writerow(row)
              #print("here")
              

In [0]:
pos_mis_pasto = []
pos_mis_other = []

with open("/root/data/misclassified_pasto.csv", "r", newline='') as f1:
  r1 = csv.DictReader(f1)

  for row in r1:
      minute = float(row.get('Datetime'))

      if minute in range (0,360): #dalle 00:00 alle 05:00
          pos_mis_pasto.append(['Errore tra le 00:00 e le 05:00', minute])
      elif minute in range (360,601): #dalle 06:00 alle 10:00
          pos_mis_pasto.append(['Errore tra le 06:00 e le 10:00 - COLAZIONE', minute])
      elif minute in range (601,720): #dalle 10:01 alle 11:59
          pos_mis_pasto.append(['Errore tra le 10:01 e le 11:59', minute])
      elif minute in range (720,901): #dalle 12:00 alle 15:00
          pos_mis_pasto.append(['Errore tra le 12:00 e le 15:00 - PRANZO', minute])
      elif minute in range (901,1140): #dalle 15:01 alle 18:59
          pos_mis_pasto.append(['Errore tra le 15:01 e le 18:59', minute])
      elif minute in range (1140,1321): #dalle 19:00 alle 22:00
          pos_mis_pasto.append(['Errore tra le 19:00 e le 22:00 - CENA', minute])
      elif minute in range (1321,1440): #dalle 22:01 alle 23:59
          pos_mis_pasto.append(['Errore tra le 22:01 e le 23:59', minute])
          

with open("/root/data/misclassified_other.csv", "r", newline='') as f2:
  r2 = csv.DictReader(f2)

  for row in r2:
      minute = float(row.get('Datetime'))
      
      if minute in range (0,360): #dalle 00:00 alle 05:59
          pos_mis_other.append(['Errore tra le 00:00 e le 05:59', minute])
      elif minute in range (360,601): #dalle 06:00 alle 10:00
          pos_mis_other.append(['Errore tra le 06:00 e le 10:00 - COLAZIONE', minute])
      elif minute in range (601,720): #dalle 10:01 alle 11:59
          pos_mis_other.append(['Errore tra le 10:01 e le 11:59', minute])
      elif minute in range (720,901): #dalle 12:00 alle 15:00
          pos_mis_other.append(['Errore tra le 12:00 e le 15:00 - PRANZO', minute])
      elif minute in range (901,1140): #dalle 15:01 alle 18:59
          pos_mis_other.append(['Errore tra le 15:01 e le 18:59', minute])
      elif minute in range (1140,1321): #dalle 19:00 alle 22:00
          pos_mis_other.append(['Errore tra le 19:00 e le 22:00 - CENA', minute])
      elif minute in range (1321,1440): #dalle 22:01 alle 23:59
          pos_mis_other.append(['Errore tra le 22:01 e le 23:59', minute])
            

c_1 = 0
c_2 = 0
c_3 = 0
c_4 = 0
c_5 = 0
c_6 = 0
c_7 = 0

for i in pos_mis_pasto:
    if (i[0] == 'Errore tra le 00:00 e le 05:59'):
        c_1 += 1
    elif (i[0] == 'Errore tra le 06:00 e le 10:00 - COLAZIONE'):
        c_2 += 1
    elif (i[0] == 'Errore tra le 10:01 e le 11:59'):
        c_3 += 1
    elif (i[0] == 'Errore tra le 12:00 e le 15:00 - PRANZO'):
        c_4 += 1
    elif (i[0] == 'Errore tra le 15:01 e le 18:59'):
        c_5 += 1
    elif (i[0] == 'Errore tra le 19:00 e le 22:00 - CENA'):
        c_6 += 1
    elif (i[0] == 'Errore tra le 22:01 e le 23:59'):
        c_7 += 1

with open("/root/data/misclassified_pasto.txt", "w", newline='') as out_1:
    out_1.write("ACTUAL CLASS: PASTO \t PREDICTED CLASS: OTHER\r\n\n")
    out_1.write("MISCLASSIFIED PASTO DALLE 06:00 ALLE 10:00 - VICINO ALLA COLAZIONE: %d\r\n" %c_2)
    out_1.write("MISCLASSIFIED PASTO DALLE 12:00 ALLE 15:00 - VICINO AL PRANZO: %d\r\n" %c_4)
    out_1.write("MISCLASSIFIED PASTO DALLE 19:00 ALLE 22:00 - VICINO ALLA CENA: %d\r\n\n" %c_6)
    out_1.write("MISCLASSIFIED PASTO DALLE 00:00 ALLE 05:59: %d\r\n" %c_1)
    out_1.write("MISCLASSIFIED PASTO DALLE 10:01 ALLE 11:59: %d\r\n" %c_3)
    out_1.write("MISCLASSIFIED PASTO DALLE 15:01 ALLE 18:59: %d\r\n" %c_5)
    out_1.write("MISCLASSIFIED PASTO DALLE 22:01 ALLE 23:59: %d\r\n\n" %c_7)
    out_1.write("TOTAL MISCLASSIFIED PASTO: %d\r\n" %(c_1+c_2+c_3+c_4+c_5+c_6+c_7))


c_1 = 0
c_2 = 0
c_3 = 0
c_4 = 0
c_5 = 0
c_6 = 0
c_7 = 0

for i in pos_mis_other:
    if (i[0] == 'Errore tra le 00:00 e le 05:59'):
        c_1 += 1
    elif (i[0] == 'Errore tra le 06:00 e le 10:00 - COLAZIONE'):
        c_2 += 1
    elif (i[0] == 'Errore tra le 10:01 e le 11:59'):
        c_3 += 1
    elif (i[0] == 'Errore tra le 12:00 e le 15:00 - PRANZO'):
        c_4 += 1
    elif (i[0] == 'Errore tra le 15:01 e le 18:59'):
        c_5 += 1
    elif (i[0] == 'Errore tra le 19:00 e le 22:00 - CENA'):
        c_6 += 1
    elif (i[0] == 'Errore tra le 22:01 e le 23:59'):
        c_7 += 1

with open("/root/data/misclassified_other.txt", "w", newline='') as out_2:
    out_2.write("ACTUAL CLASS: OTHER \t PREDICTED CLASS: PASTO\r\n\n")
    out_2.write("MISCLASSIFIED OTHER DALLE 06:00 ALLE 10:00 - VICINO ALLA COLAZIONE: %d\r\n" %c_2)
    out_2.write("MISCLASSIFIED OTHER DALLE 12:00 ALLE 15:00 - VICINO AL PRANZO: %d\r\n" %c_4)
    out_2.write("MISCLASSIFIED OTHER DALLE 19:00 ALLE 22:00 - VICINO ALLA CENA: %d\r\n\n" %c_6)
    out_2.write("MISCLASSIFIED OTHER DALLE 00:00 ALLE 05:59: %d\r\n" %c_1)
    out_2.write("MISCLASSIFIED OTHER DALLE 10:01 ALLE 11:59: %d\r\n" %c_3)
    out_2.write("MISCLASSIFIED OTHER DALLE 15:01 ALLE 18:59: %d\r\n" %c_5)
    out_2.write("MISCLASSIFIED OTHER DALLE 22:01 ALLE 23:59: %d\r\n\n" %c_7)
    out_2.write("TOTAL MISCLASSIFIED OTHER: %d\r\n" %(c_1+c_2+c_3+c_4+c_5+c_6+c_7))