# Import Data

## Import the dataset

Import the dataset if you are in colab:

In [1]:
from google.colab import files
files.upload()
!ls

Saving completeDataset.csv to completeDataset.csv
completeDataset.csv  sample_data


Set the path to the csv file:

In [0]:
PATH_CSV = "completeDataset.csv"

## Function to import data

In [0]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

RANDOM_SHUFFLE_SEED = 0

#dataset
features_considered = ['IQ','pressure','wind_direction','wind_force','humidity','temperature']

DF = pd.read_csv(PATH_CSV, header=0, delimiter=';')
DF['date'] = pd.to_datetime(DF['date'],utc=True)


def importData(nb_prev_measures_for_predict):
    print("=====IMPORT=====")

    features = DF[features_considered]
    features.index = DF['date']

    dataset_test = features.values

    def higher_value(features,i):
        return[row[i] for row in dataset_test]

    max_pressure = max(higher_value(dataset_test,1))
    max_wind_force = max(higher_value(dataset_test,3))
    max_temperature = max(higher_value(dataset_test, 5))

    #normalize
    features['IQ'] = features['IQ'].apply(lambda x: x/10)
    features['pressure'] = features['pressure'].apply(lambda x: x/max_pressure)
    features['wind_force'] = features['wind_force'].apply(lambda x: x/max_wind_force)
    features['humidity'] = features['humidity'].apply(lambda x: x/100)
    features['temperature'] = features['temperature'].apply(lambda x: (x-273.15)/(max_temperature-273.15)) 

    #wind_direction to categorical
    features = pd.concat([features, pd.get_dummies(features['wind_direction'])], axis=1)
    features = features.drop(columns=["wind_direction"])

    x_train = []
    y_train = []
    countRow=0

    for indexRow, rowx in features.iterrows():
        # for each day we found with a value at 12:00
        if indexRow.hour == 12 and countRow >= nb_prev_measures_for_predict:
            try:
                # indexes for x (the range is inversed as our data are from the oldest to the newest)
                batchX = range(countRow, countRow - nb_prev_measures_for_predict, -1)
                # indexes for y
                batchY = [countRow+8,countRow+16,countRow+24]

                #application
                y_train.append(features.iloc[batchY]["IQ"].values)
                x_train.append(features.iloc[batchX].values)
            except:
                print("To long for ",indexRow)
        countRow+=1
    
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    
    x_train,y_train = shuffle(x_train,y_train, random_state=RANDOM_SHUFFLE_SEED)
    
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    y_train = y_train.reshape(y_train.shape[0],3,1)
    print("x_train :",x_train.shape)
    print("y_train :",y_train.shape)
    
    print("====END IMPORT====")
    return(x_train,y_train)

## Function to save the results

To save the results in a csv file:

In [4]:
!mkdir logs
!ls

completeDataset.csv  logs  sample_data


In [5]:
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [0]:
# code found here: https://stackoverflow.com/questions/42355122/can-i-export-a-tensorflow-summary-to-csv
import os
import numpy as np
import pandas as pd

import tensorflow as tf

from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


def tabulate_events(dpath):

    final_out = {}
    for dname in os.listdir(dpath):
        ea = EventAccumulator(os.path.join(dpath, dname)).Reload()
        tags = ea.Tags()['scalars']

        out = {}

        for tag in tags:
            tag_values=[]
            wall_time=[]
            steps=[]

            for event in ea.Scalars(tag):
                tag_values.append(event.value)
                wall_time.append(event.wall_time)
                steps.append(event.step)

            out[tag]=pd.DataFrame(data=dict(zip(steps,np.array([tag_values,wall_time]).transpose())), columns=steps,index=['value','wall_time'])

        if len(tags)>0:      
            df= pd.concat(out.values(),keys=out.keys())
            print("- Done")
        else:
            print('- Not scalers to write')

        final_out[dname] = df


    return final_out

def saveProgressCSV():
  path = "logs/"
  steps = tabulate_events(path)
  pd.concat(steps.values(),keys=steps.keys()).to_csv('all_result.csv')

In [7]:
print(os.getcwd())
print(os.listdir())

/content
['.config', 'logs', 'completeDataset.csv', 'sample_data']


# Test differents models

##Librairies:

In [8]:
from keras.optimizers import RMSprop,Adam
from keras.callbacks import TensorBoard,EarlyStopping
from time import time
from keras.models import Model
from keras.layers import LSTM, Dense, Input, GRU
from keras.models import save_model

Using TensorFlow backend.


## Parameters:

In [0]:
# PARAMS TO TEST
EPOCHS = 200
LR = 0.0005

NUMBER_OBSERVATIONS = [36,80]
GRU_HIDDEN_UNITS = [512,1024]
INTERMEDIATE_DENSE = [512,1024]
OPTIMIZER = {"RMS" : RMSprop(LR),"Adam":Adam(LR)}

## Main script

In [10]:
 for observations in NUMBER_OBSERVATIONS:
  x_train,y_train = importData(observations)
  trainLength = int(len(x_train)*0.8)
  x,x_val = x_train[:trainLength],x_train[trainLength:]
  y,y_val = y_train[:trainLength],y_train[trainLength:]
  for opti in OPTIMIZER:
    for hiddenUnit in GRU_HIDDEN_UNITS:
      for denseSize in INTERMEDIATE_DENSE:
        name = f"nobs{observations}-opti{opti}-lr{LR}-GRUhid{hiddenUnit}-dens{denseSize}-time{int(time())}"
        print(name)
        try:
          callbackName = str('logs/{}'.format(name))
          tensor_board = TensorBoard(callbackName)

          input_shape = (x_train.shape[-2],x_train.shape[-1])
          inp = Input(input_shape)
          d = GRU(hiddenUnit,input_shape=input_shape,name='LSTM_layer')(inp)
          if denseSize>0:
           d = Dense(denseSize,name="Intermediate_dense_layer")(d)
    
          outD1 = Dense(1,name="D1")(d)
          outD2 = Dense(1,name="D2")(d)
          outD3 = Dense(1,name="D3")(d)
    
          model = Model(inputs=[inp], outputs=[outD1, outD2, outD3])    
          model.compile(optimizer=OPTIMIZER[opti], loss={'D1': 'mse', 'D2': 'mse', 'D3': 'mse'}, metrics={'D1': 'mae', 'D2': 'mae', 'D3': 'mae'})
                    
          model.fit(x=x, y=[y[:,0],y[:,1],y[:,2]], validation_data=(x_val,[y_val[:,0],y_val[:,1],y_val[:,2]]),epochs=EPOCHS,callbacks = [tensor_board],verbose=0)

        except:
          print("Error: ",name)
      saveProgressCSV()

=====IMPORT=====


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

To long for  2020-02-27 12:00:00+00:00
To long for  2020-02-28 12:00:00+00:00
To long for  2020-02-29 12:00:00+00:00
x_train : (782, 36, 42)
y_train : (782, 3, 1)
====END IMPORT====
nobs36-optiRMS-lr0.0005-GRUhid512-dens512-time1585570821




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 625 samples, validate on 157 samples







Epoch 1/200
 - 4s - loss: 0.1451 - D1_loss: 0.0473 - D2_loss: 0.0544 - D3_loss: 0.0434 - D1_mean_absolute_error: 0.1602 - D2_mean_absolute_error: 0.1743 - D3_mean_absolute_error: 0.1544 - val_loss: 0.0767 - val_D1_loss: 0.0180 - val_D2_loss: 0.0200 - val_D3_loss: 0.0387 - val_D1_mean_absolute_error: 0.1047 - val_D2_mean_absolute_error: 0.1000 - val_D3_mean_absolute_error: 0.1751

Epoch 2/200
 - 1s - loss: 0.0721 - D1_loss: 0.0280 - D2_loss: 0.0212 - D3_loss: 0.0229 - D1_mean_absolute_error: 0.1301 - D2_mean_absolute_error: 0.1136 - D3_mean_absolute_error: 0.1181 - val_loss: 0.0525 - val_D1_loss: 0.02

KeyboardInterrupt: ignored

In [0]:
print("end!",time())
!ls logs/

In [0]:
from google.colab import files
files.download("all_result.csv")