<a href="https://colab.research.google.com/github/FG2511/ARE/blob/master/model1_cross_validation_postProcessing_FEDE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
'''
@File name: model1_cross_validation.ipynb
@Created on 2018-12-20
@Authors: Federica Gerina, Francesca Moi, Silvia Maria Massa
@Description: Given a time-series dataset that contains minute-by-minute data 
about different kind of gases, collected by the uHoo air quality sensor, train
a NN that classifies if a minute belongs to the class "Pasto" (1) otherwise to
the class "Other" (0).
'''

!pip install liac-arff

import arff
import math

import numpy as np

from keras import optimizers
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense, Dropout, LeakyReLU, BatchNormalization, Activation
from keras.callbacks import EarlyStopping

from sklearn.utils import compute_class_weight
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

import sys
sys.path.append('local_modules')

import postprocessing_sw
#import postprocessing_sw_HL
import plotting
#import postprocessing_CO2



Using TensorFlow backend.


In [0]:
#fix random seed for reproducibility
seed = 5
np.random.seed(seed)

In [0]:
'''
@Description: generate a multilayer perceptron with LeakyRelu as activation
function.
@param: 
  - shape : int, the shape of the input
  - n_features: int, the number of features given
'''

#MODELLO 1
#REGOLA: input/2, input, 2*input, 1
#layers TUTTE LE FEATURE: 57, 113, 226, 1 
#layers TIME CO2 TEMP: 21, 41, 82, 1 
#layers TIME CO2 TEMP PM25/TVOC: 30, 59, 118, 1 
#layers TIME CO2 TEMP PM25 TVOC: 39, 77, 154, 1 

def generate_model_leaky(shape, n_features):
  
  units_1 = int(n_features/2)
  units_2 = n_features
  units_3 = n_features*2
  
  model = Sequential()
  model.add(BatchNormalization())
  
  model.add(Dense(units_1, input_dim=shape, kernel_initializer='random_uniform',  use_bias = False))
  model.add(BatchNormalization())
  model.add(LeakyReLU(alpha = 0.2))
  model.add(Dropout(0.5))
  
  model.add(Dense(units_2, kernel_initializer='random_uniform',  use_bias = False))
  model.add(BatchNormalization())
  model.add(LeakyReLU(alpha = 0.2))
  model.add(Dropout(0.5))
  
  model.add(Dense(units_3, kernel_initializer='random_uniform',  use_bias = False))
  model.add(BatchNormalization())
  model.add(LeakyReLU(alpha = 0.2))
  model.add(Dropout(0.5))
  
  model.add(Dense(1, activation='sigmoid'))
  #print(model.summary())

  return model


'''
@Description: generate a multilayer perceptron with Relu as activation
function.
@param: 
  - shape : int, the shape of the input
  - n_features: int, the number of features given
'''

#MODELLO 2
#REGOLA: a= input, b= a*2/3+c, c= b*2/3+1
#layers TUTTE LE FEATURE: 113, 229, 153, 1
#layers TIME CO2 TEMP: 41, 85, 57, 1 
#layers TIME CO2 TEMP PM25/TVOC: 59, 121, 81, 1 
#layers TIME CO2 TEMP PM25 TVOC: 77, 157, 105, 1 

def generate_model(shape, n_features):
  
  a = np.array([[1,0,0],[-(2/3),1,-1],[0,-(2/3),1]])
  b = np.array([n_features,0,1])
  x = np.linalg.solve(a, b)
  
  units_1 = int(x[0])
  units_2 = int(x[1])
  units_3 = int(x[2])
  
  model = Sequential()
 
  model.add(Dense(units_1, input_dim=shape, kernel_initializer='random_uniform', use_bias = False))
  model.add(BatchNormalization())
  model.add(Activation('relu'))
  model.add(Dropout(0,5))
  
  model.add(Dense(units_2, kernel_initializer='random_uniform', use_bias = False))
  model.add(BatchNormalization())
  model.add(Activation('relu'))
  model.add(Dropout(0,5))
  
  model.add(Dense(units_3, kernel_initializer='random_uniform', use_bias = False))
  model.add(BatchNormalization())
  model.add(Activation('relu'))
  model.add(Dropout(0,5))
  
  model.add(Dense(1, activation='sigmoid'))
  #print(model.summary())

  return model

In [4]:
#@title SCEGLI I PARAMETRI

'''
@Description: MAIN
'''

#LOAD DATA
print("Loading data...")

dataset = '/root/data/uHooComplete_featureDataset.arff' #@param {type:"string"}

with open (dataset, encoding='utf-8') as f:
  dataDictionary = arff.load(f)

data = np.array(dataDictionary['data'])
print("DATASET LOADED")

#CONVERTING VALUES
print("\nConverting values...")
for i in data:
  if(i[-1] == 'Other'): i[-1] = 0
  elif(i[-1] == 'Pasto') : i[-1] = 1

dataset = data.astype('float32')
print("CONVERSION DONE")

#SPLIT INTO INPUT (X) AND OUTPUT (Y) VARIABLES
s = dataset.shape[-1]
X = dataset[:,0:s-1]
Y = dataset[:,s-1]

n_features = s-1

#OPTIMIZERS
adm = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

#LOSS
loss = 'binary_crossentropy'

#MODEL
modello = 1 #@param {type:"integer"}

#DEFINE K-FOLD CROSS-VALIDATION
fold = 10 #@param {type:"integer"}

kfold = KFold(n_splits=fold, shuffle=False, random_state=None)

cvscores = []
predictions = []
true_p =[]
true_n = []
test_time =[]
real = []

dimSplit = math.floor(len(dataset[:,0])/10)
startIndex = 0
finishIndex = dimSplit-1

i = 1

for train, test in kfold.split(X, Y):
  
  print("\nFOLD: %d" %i)
  
  #COMPUTE CLASS WEIGHT
  labels = np.unique(Y[train])
  classWeight = compute_class_weight('balanced', labels, Y[train])
  classWeight = dict(zip(labels,classWeight))

  #GENERATE MODEL
  print("\nGenerate model...")

  if modello==1 :
    model = generate_model_leaky(X[train].shape[-1], n_features)
  elif modello ==2:
    model = generate_model(X[train].shape[-1], n_features)

  #COMPILE MODEL
  print("\nCompile model...")
  model.compile(loss = loss, optimizer = adm , metrics=['accuracy'])

  #FIT MODEL
  print("\nFit model...")
  epoche = 7 #@param{type:"integer"}
  history = model.fit(X[train], Y[train], epochs=epoche, batch_size = 128, shuffle = True, verbose=1, class_weight = classWeight)

  #EVALUATE MODEL
  print("\nEvaluate model...")
  scores_test = model.evaluate(X[test], Y[test], batch_size= 128, verbose = 0)
  print("Test loss: %.2f%%" % (scores_test[0] * 100))
  print("Test accuracy: %.2f%%" % (scores_test[1] * 100))
  
  cvscores.append(scores_test[1] * 100)
  
  #CALCULATE PREDICTIONS
  print("\nCalculate predictions...")
  pred = model.predict_classes(X[test], batch_size=128, verbose=0)
  flat_pred = [item for sublist in pred for item in sublist]
  predictions.append(flat_pred)
  real.append(Y[test])
  
  #STORE DATETIME
  time = []  
  for j in X[test]:
    time.append(int(j[-5]))  
  test_time.append(time)
  
  #PLOTTING
  #plotting.plot_model_results(history)
  #plotting.plot_co2_temp_cross(flat_pred, dimSplit, startIndex, finishIndex, i)
  
  #CONFUSION MATRIX
  print("\nCompute confusion matrix...")
  y_true = Y[test]
  tn, fp, fn, tp = confusion_matrix(y_true, pred).ravel()
  (tn, fp, fn, tp)
  other = 100*tn/(tn+fp)
  pasto = 100*tp/(fn+tp)
  true_p.append(pasto)
  true_n.append(other)
  print("Other: %.2f %%" % other)
  print("Pasto: %.2f %%" % pasto)
  
  i+=1

print("MEAN ACCURACY: %.2f%% (STANDARD DEVIATION: +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
print("MEAN TRUE POSITIVE RATE: %.2f%% (STANDARD DEVIATION: +/- %.2f%%)" % (np.mean(true_p), np.std(true_p)))
print("MEAN TRUE NEGATIVE RATE: %.2f%% (STANDARD DEVIATION: +/- %.2f%%)" % (np.mean(true_n), np.std(true_n)))

#FLATTENING LISTS
flat_predictions = [item for sublist in predictions for item in sublist]
true_y = [arr.tolist() for arr in real]
flat_true_y = [item for sublist in true_y for item in sublist]
flat_time = [item for sublist in test_time for item in sublist]


Loading data...
DATASET LOADED

Converting values...
CONVERSION DONE
Instructions for updating:
Colocations handled automatically by placer.

FOLD: 1

Generate model...

Compile model...

Fit model...
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7

Evaluate model...
Test loss: 37.61%
Test accuracy: 81.71%

Calculate predictions...

Compute confusion matrix...
Other: 82.23 %
Pasto: 69.23 %

FOLD: 2

Generate model...

Compile model...

Fit model...
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7

Evaluate model...
Test loss: 31.98%
Test accuracy: 85.12%

Calculate predictions...

Compute confusion matrix...
Other: 85.65 %
Pasto: 77.45 %

FOLD: 3

Generate model...

Compile model...

Fit model...
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7

Evaluate model...

In [0]:
def get_postprocessing_CO2_cv(pred):

  #Carico solamente i valori della CO2 e del datetime dell'insieme di test

  #LOAD SENSOR DATA
  datasetSensor = '/root/data/uHooComplete_featureDataset_Reduced.arff'

  with open (datasetSensor, encoding='utf-8') as fs:
    dataSensor = arff.load(fs)

  dataS = np.array(dataSensor['data'])

  #CONVERTING VALUES
  for i in dataS:
    if(i[-1] == 'Other'): i[-1] = 0
    else : i[-1] = 1

  #TEST SENSOR DATA  
  datastr = dataS[:,1]
  datastr2 = dataS[:,2]
  co2 = datastr.astype('float32')
  time = datastr2.astype('float32')

  #POST PROCESSING
  pasto = []
  index =[]
  p = []

  for i in pred:
    p.append(i)
    
  for i in range(0, len(pred)):
    if p[i] == 1 and i+1 in range(0, len(pred)) and time[i+1]>=time[i]:    
      index.append(i)
      pasto.append(co2[i])

      if (i+1) in range(0, len(pred)) and p[i+1] == 0: 
        if len(pasto) > 4:
          pasto = np.asarray(pasto)
          minimum = np.amin(pasto)
          maximum = np.amax(pasto)
          pasto = pasto.tolist()
          index_max = pasto.index(maximum)
          index_min = pasto.index(minimum)

          if len(pasto)<= 10:
            
            for j in range(index[index_max], index[-1]):
              if(j<index[-1]):
                p[j+3] = 0
                
          else:

            for j in range(index[index_max], index[-1]):
              if(j<index[-1]):
                p[j+10] = 0
              
              
        pasto=[]
        index=[]
        
  return p

In [75]:
new_pred = postprocessing_sw.sliding_windows(flat_predictions,5)

#new_new_pred = postprocessing_CO2.get_postprocessing_CO2_cv(new_pred)
new_new_pred = get_postprocessing_CO2_cv(new_pred)


FUNZIONE SLIDING WINDOWS...
Input: 350491
RISULTATO...
350491


In [76]:
#CONFUSION MATRIX
print("\nCompute confusion matrix BEFORE POST PROCESSING...")
tn, fp, fn, tp = confusion_matrix(flat_true_y, flat_predictions).ravel()
print("TN", tn)
print("FP", fp)
print("FN", fn)
print("TP", tp)
other = 100*tn/(tn+fp)
pasto = 100*tp/(fn+tp)
print("Other corretti: %.2f %%" % other)
print("Pasto corretti: %.2f %%" % pasto)

#CONFUSION MATRIX
print("\nCompute confusion matrix AFTER POST PROCESSING SW...")
tn, fp, fn, tp = confusion_matrix(flat_true_y, new_new_pred).ravel()
print("TN", tn)
print("FP", fp)
print("FN", fn)
print("TP", tp)
other = 100*tn/(tn+fp)
pasto = 100*tp/(fn+tp)
print("Other corretti: %.2f %%" % other)
print("Pasto corretti: %.2f %%" % pasto)


Compute confusion matrix BEFORE POST PROCESSING...
TN 289750
FP 44417
FN 3558
TP 12766
Other corretti: 86.71 %
Pasto corretti: 78.20 %

Compute confusion matrix AFTER POST PROCESSING SW...
TN 301901
FP 32266
FN 5710
TP 10614
Other corretti: 90.34 %
Pasto corretti: 65.02 %


In [0]:
#plotting.plot_co2_temp_complete_dataset(flat_predictions,new_pred)

In [0]:
#POST PROCESSING WITH SLIDING WINDOWS
#new_pred_1 = postprocessing_sw.sliding_windows(new_new_pred,5)
#new_pred_2 = postprocessing_sw.sliding_windows_time_colazione(new_pred_1, flat_time,5)
#new_pred_3 = postprocessing_sw.sliding_windows_time_pranzocena(new_pred_2, flat_time, 21, 21)
#new_pred_4 = postprocessing_sw.sliding_windows(new_pred_3,11)
#new_pred_5 = postprocessing_sw.sliding_windows_time_other(new_pred_4, flat_time, 21)

#new_pred = new_pred_2

In [68]:
#CONFUSION MATRIX
print("\nCompute confusion matrix BEFORE POST PROCESSING...")
tn, fp, fn, tp = confusion_matrix(flat_true_y, flat_predictions).ravel()
print("TN", tn)
print("FP", fp)
print("FN", fn)
print("TP", tp)
other = 100*tn/(tn+fp)
pasto = 100*tp/(fn+tp)
print("Other corretti: %.2f %%" % other)
print("Pasto corretti: %.2f %%" % pasto)

#CONFUSION MATRIX
print("\nCompute confusion matrix AFTER POST PROCESSING CO2...")
tn, fp, fn, tp = confusion_matrix(flat_true_y, new_new_pred).ravel()
print("TN", tn)
print("FP", fp)
print("FN", fn)
print("TP", tp)
other = 100*tn/(tn+fp)
pasto = 100*tp/(fn+tp)
print("Other corretti: %.2f %%" % other)
print("Pasto corretti: %.2f %%" % pasto)

#CONFUSION MATRIX
print("\nCompute confusion matrix AFTER POST PROCESSING SW...")
tn, fp, fn, tp = confusion_matrix(flat_true_y, new_pred).ravel()
print("TN", tn)
print("FP", fp)
print("FN", fn)
print("TP", tp)
other = 100*tn/(tn+fp)
pasto = 100*tp/(fn+tp)
print("Other corretti: %.2f %%" % other)
print("Pasto corretti: %.2f %%" % pasto)


#PLOTTING
#plotting.plot_co2_temp_complete_dataset(flat_predictions,new_pred)
#plotting.plot_co2_temp_complete_dataset(flat_predictions,new_new_pred)


Compute confusion matrix BEFORE POST PROCESSING...
TN 289750
FP 44417
FN 3558
TP 12766
Other corretti: 86.71 %
Pasto corretti: 78.20 %

Compute confusion matrix AFTER POST PROCESSING CO2...
TN 301901
FP 32266
FN 5710
TP 10614
Other corretti: 90.34 %
Pasto corretti: 65.02 %

Compute confusion matrix AFTER POST PROCESSING SW...
TN 302191
FP 31976
FN 5725
TP 10599
Other corretti: 90.43 %
Pasto corretti: 64.93 %


In [93]:
import statistics

#LOAD SENSOR DATA
datasetSensor = '/root/data/uHooComplete_featureDataset_Reduced.arff'

with open (datasetSensor, encoding='utf-8') as fs:
  dataSensor = arff.load(fs)

dataS = np.array(dataSensor['data'])

#CONVERTING VALUES
for i in dataS:
  if(i[-1] == 'Other'): i[-1] = 0
  else : i[-1] = 1

#TEST SENSOR DATA  
dataset = dataS.astype('float32')

co2 = []
diff1 = []
diff2 = []
diff3 = []
diff4 = []


for i in dataset:
  
  if(int(i[-1])==1):
    co2.append(i[1])
  else:
    if(len(co2)>=5 and len(co2)<=10):

      co2 = np.asarray(co2)
      maximum = np.amax(co2)
      co2 = co2.tolist()
      index_max = co2.index(maximum)
      diff1.append(len(co2[index_max:-1]))
      
      co2_bf_max = co2[0:index_max+1]
      
      co2_bf_max = np.asarray(co2_bf_max)
      minimum = np.amin(co2_bf_max)
      co2_bf_max = co2_bf_max.tolist()
      index_min = co2_bf_max.index(minimum)
      diff2.append(len(co2_bf_max[0:index_min+1]))
        
      del co2[:]
      
    elif(len(co2)>10):
      
      co2 = np.asarray(co2)
      maximum = np.amax(co2)
      co2 = co2.tolist()
      index_max = co2.index(maximum)
      diff3.append(len(co2[index_max:-1]))
      
      co2_bf_max = co2[0:index_max+1]
      
      co2_bf_max = np.asarray(co2_bf_max)
      minimum = np.amin(co2_bf_max)
      co2_bf_max = co2_bf_max.tolist()
      index_min = co2_bf_max.index(minimum)
      diff4.append(len(co2_bf_max[0:index_min+1]))
        
      del co2[:]
      

      
print("STATISTICHE PER I PASTI < 10 MIN")
print(len(diff1))
#print(list(diff1))
print(statistics.mean(diff1))

print(len(diff2))
#print(list(diff2))
print(statistics.mean(diff2))

print("STATISTICHE PER I PASTI > 10 MIN")
print(len(diff3))
#print(list(diff1))
print(statistics.mean(diff3))

print(len(diff4))
#print(list(diff2))
print(statistics.mean(diff4))

      
  

STATISTICHE PER I PASTI < 10 MIN
153
1.7973856209150327
153
1.3790849673202614
STATISTICHE PER I PASTI > 10 MIN
514
11.926070038910506
514
3.301556420233463


In [115]:
import statistics

#LOAD SENSOR DATA
datasetSensor = '/root/data/uHooComplete_featureDataset_Reduced.arff'

with open (datasetSensor, encoding='utf-8') as fs:
  dataSensor = arff.load(fs)

dataS = np.array(dataSensor['data'])

#CONVERTING VALUES
for i in dataS:
  if(i[-1] == 'Other'): i[-1] = 0
  else : i[-1] = 1

#TEST SENSOR DATA  
dataset = dataS.astype('float32')

co2 = []
diff1 = []
diff2 = []
diff3 = []
diff4 = []


for i,p in zip(dataset, new_pred):
  
  if(int(p==1)):
    co2.append(i[1])
  else:
    if(len(co2)>=5 and len(co2)<=10):

      co2 = np.asarray(co2)
      maximum = np.amax(co2)
      co2 = co2.tolist()
      index_max = co2.index(maximum)
      diff1.append(len(co2[index_max:-1]))
      
      co2_bf_max = co2[0:index_max+1]
      
      co2_bf_max = np.asarray(co2_bf_max)
      minimum = np.amin(co2_bf_max)
      co2_bf_max = co2_bf_max.tolist()
      index_min = co2_bf_max.index(minimum)
      diff2.append(len(co2_bf_max[0:index_min+1]))
        
      del co2[:]
      
    elif(len(co2)>10):
      
      co2 = np.asarray(co2)
      maximum = np.amax(co2)
      co2 = co2.tolist()
      index_max = co2.index(maximum)
      diff3.append(len(co2[index_max:-1]))
      
      co2_bf_max = co2[0:index_max+1]
      
      co2_bf_max = np.asarray(co2_bf_max)
      minimum = np.amin(co2_bf_max)
      co2_bf_max = co2_bf_max.tolist()
      index_min = co2_bf_max.index(minimum)
      diff4.append(len(co2_bf_max[0:index_min+1]))
        
      del co2[:]
      

      
print("STATISTICHE PER I PASTI < 10 MIN")
print(len(diff1))
#print(list(diff1))
print(statistics.mean(diff1))

print(len(diff2))
#print(list(diff2))
print(statistics.mean(diff2))

print("STATISTICHE PER I PASTI > 10 MIN")
print(len(diff3))
#print(list(diff1))
print(statistics.mean(diff3))

print(len(diff4))
#print(list(diff2))
print(statistics.mean(diff4))

STATISTICHE PER I PASTI < 10 MIN
328
3.2347560975609757
328
1.7774390243902438
STATISTICHE PER I PASTI > 10 MIN
1268
18.593059936908517
1268
4.8454258675078865
