In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
#import gc

import sys
sys.path.append('../volumes/')

import numpy as np
import pandas as pd


import matplotlib.pyplot as plt

from sklearn.metrics import r2_score
from sklearn.utils import shuffle

from time import time, strftime, gmtime

from matplotlib import pyplot

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from prepare_data import *
from model_training import *

# Hyperparameters

In [None]:
import pickle
    
class ModelParameters:
    """Container for the hyperparameters and output locations of one run.

    All settings are plain instance attributes so the object can be pickled
    and edited after construction. The derived path attributes (``PATH``,
    ``RESULTS_PATH``, ``FILECORE_PATH``, ``RESULTS_FILE``) are computed by
    :meth:`update_folders`; call it again after changing ``FILENAME`` or
    ``STORE_FOLDER``.

    Parameters
    ----------
    create_dirs : bool, default True
        When true, the model and results folders are created on construction.
    """

    def __init__(self, create_dirs = True):
        # Optimizer / loss settings (presumably consumed by create_model,
        # which is defined outside this file -- confirm there).
        self.LEARNING_RATE = 0.001
        self.DECAY = 0.0
        self.LOSS = 'mae'

        self.DEEPANDWIDE = True

        # Per-layer settings: each list has exactly NRLAYERS entries.
        self.NRLAYERS = 3
        self.CELLS = [256] * self.NRLAYERS
        self.BATCHNORM = [False] * self.NRLAYERS
        self.DROPOUT = [0.5] * (self.NRLAYERS)
        self.NORMALIZE = True
        self.GPUS = None

        # Fitting settings passed to model.fit by the training cells.
        self.EPOCHS = 5
        self.VERBOSE = 0
        self.BATCH_SIZE = 2048#512

        self.PCA = False

        # FILENAME is the file-name stem, STORE_FOLDER the sub-folder used
        # under ./models/ and ./results/ (see update_folders).
        self.FILENAME = '20200418'
        self.STORE_FOLDER = 'DeepAndWide'

        self.REDUCE_TRAIN = False
        self.LOAD_MODEL = False
        self.USE_ATR = True

        self.update_folders()

        if create_dirs:
            self.create_dirs()

    def update_folders(self):
        """Recompute every derived path from STORE_FOLDER and FILENAME."""
        self.PATH = './models/{}/'.format(self.STORE_FOLDER)
        self.RESULTS_PATH = './results/{}/'.format(self.STORE_FOLDER)
        # Both folders end with '/', so plain concatenation yields
        # '<folder>/<FILENAME>'; callers append their own suffix/extension.
        self.FILECORE_PATH = self.PATH + self.FILENAME
        self.RESULTS_FILE = self.RESULTS_PATH + self.FILENAME

    def create_dirs(self):
        """Create the model and results folders; a no-op if they exist."""
        # exist_ok avoids the check-then-create race of os.path.exists().
        for folder in (self.PATH, self.RESULTS_PATH):
            os.makedirs(folder, exist_ok=True)

# Run configuration shared by every cell below. Constructing it with the
# default create_dirs=True also creates the model and results folders.
hp = ModelParameters()

# Store the configuration next to the model files as '<FILECORE_PATH>_params.p'.
# The context manager guarantees the file is flushed and closed.
with open('{}_params.p'.format(hp.FILECORE_PATH), 'wb') as params_file:
    pickle.dump(hp, params_file)

# Prepare data

In [None]:
# Columns selected from the raw CSV (the loading cell does `df = df[columns]`),
# listed one per line in the order they are kept.
columns = [
    'tmc',
    'unix_ts',
    'date',
    'time',
    'hour',
    'dow',
    'tmc_linear',
    'tmc_dir',
    'miles',
    'frc',
    'f_system',
    'facil_type',
    'thru_lanes',
    'aadt',
    'aadt_single',
    'aadt_combi',
    'osm_highway',
    'osm_lanes',
    'speed',
    'ref_speed',
    'temp_f',
    'dew_f',
    'rel_humid',
    'viz_mi',
    'precip1hr_in',
    'gps_pt1_wc1',
    'gps_pt2_wc1',
    'gps_pt2_wc2',
    'gps_pt2_wc3',
    'tmc_has_gps_data',
    'count_type',
    'count_subtype',
    'count_location',
    'count_total',
    # 'atr_class_volume' is deliberately left out (disabled in the original list).
]

In [None]:
# --- Input file and cache locations -----------------------------------------
data_filename = "/hdd3/Volumes/MD2018/ML_data/20200417/MD-2018___CREATED_2020-04-13_142351___ML-INPUTS-COUNT-LOCATION-TMCs.csv"
data_skiprows = None

storefile = './tmpdata/2018.p'
# The Q2-Q4 branch below used `storefile_q234` without ever defining it, so
# enabling that branch raised NameError. The path here is a placeholder chosen
# to sit next to `storefile` -- adjust if a different location is expected.
storefile_q234 = './tmpdata/2018_q234.p'

# --- Switches (replace the former bare `if True:` / `if False:` toggles) -----
# True: rebuild df from the raw CSV and refresh the cache; False: load the cache.
REGENERATE_FROM_RAW = True
DELETE_WEIRD_COUNTS = True
ADD_HOURLY_AVERAGED_GPS = False
KEEP_Q1_ONLY = False
KEEP_Q2_Q4_ONLY = False

t = time()

if REGENERATE_FROM_RAW:
    print("Loading data...")
    df = pd.read_csv(data_filename, skiprows=data_skiprows)
    df = df[columns]

    if DELETE_WEIRD_COUNTS:
        print("Deleting weird CCSs counts...")
        # 3.0 is passed through unchanged; its meaning is defined by
        # delete_weird_counts (imported via the star imports) -- TODO confirm.
        df = delete_weird_counts(df, 3.0)

    if ADD_HOURLY_AVERAGED_GPS:
        print ('Adding Hourly Averaged GPS counts....')
        df = add_hourly_averaged_gps_counts(df)

    print("Preparing data...                                           ")
    df = change_values(df)
    print ('Before prepare_df', df.shape)
    df = prepare_df(df)
    print ('Data shape after prepare_df:', df.shape)

    # Make sure the cache folder exists so the dump cannot fail after the
    # expensive preparation above, and close the file deterministically.
    os.makedirs(os.path.dirname(storefile), exist_ok=True)
    with open(storefile, "wb") as cache_file:
        pickle.dump(df, cache_file)

else:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # this notebook produced itself.
    with open(storefile, "rb") as cache_file:
        df = pickle.load(cache_file)


# The two filters below rely on a `datetime` column, presumably added by
# prepare_df (it is not among `columns`) -- verify before enabling.
if KEEP_Q1_ONLY: # Q1 data
    print ('All data:', df.shape)
    df = df[df.datetime < '2018-04-01']
    print ('First Quarter:', df.shape)


if KEEP_Q2_Q4_ONLY: # Q2-4-data
    print ('All data:', df.shape)
    df = df[df.datetime >= '2018-04-01']
    print ('Quarters 2-4:', df.shape)
    os.makedirs(os.path.dirname(storefile_q234), exist_ok=True)
    with open(storefile_q234, "wb") as q234_file:
        pickle.dump(df, q234_file)

print ("Data prepared in {:.1f} seconds.".format(time() - t))


# Learning (data split previously)

In [None]:
# Stopwatch for this cell only.
t = time()

print ("Train test split...")
df_train, df_test = split_train_test_fixed(df)

# get_XY yields a (features, target) pair; only the training pair is shuffled,
# and both members are shuffled together so rows stay aligned.
train_X, train_y = shuffle(*get_XY(df_train))
test_X, test_y = get_XY(df_test)

print ('train_X shape:', train_X.shape)
print ('test_X shape:', test_X.shape)

elapsed_seconds = time() - t
print ("Data prepared in {:.1f} seconds.".format(elapsed_seconds))

In [None]:
# Train on the fixed train/test split, save the model and report test metrics.
# NOTE: this cell rebinds train_X/test_X to their normalized (or .values) form,
# so re-run the split cell above before running this cell a second time.
hp.VERBOSE = 1

t = time()

if hp.NORMALIZE:
    train_X, test_X = normalize(hp, train_X, test_X, save_scaler=True)
else:
    train_X = train_X.values
    test_X = test_X.values

sys.stdout.write('Preparing model...                 \r')

# Build file names from FILECORE_PATH ('<folder>/<FILENAME>'). The previous
# code formatted hp.PATH (a folder ending in '/'), which produced the hidden
# files '<folder>/.hdf5' and '<folder>/_best.hdf5' and was inconsistent with
# the params pickle and the all-stations cell.
filepath = '{}.hdf5'.format(hp.FILECORE_PATH)

model, _ = create_model(hp, train_X.shape[1])

filepath_best = '{}_best.hdf5'.format(hp.FILECORE_PATH)
checkpoint = ModelCheckpoint(filepath_best, monitor='loss', verbose=0, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

print ('EPOCHS: {}                                    '.format(hp.EPOCHS))
# summary() prints by itself and returns None, so it is not wrapped in print().
model.summary()
print ("-----")

sys.stdout.write('Fitting model ...                \r')

# The checkpoint callback was previously created but never handed to fit(),
# so the "best" model file was never written.
history = model.fit(train_X, train_y,
                    validation_data = (test_X, test_y),
                    batch_size = hp.BATCH_SIZE,
                    epochs=hp.EPOCHS,
                    callbacks=callbacks_list,
                    verbose=hp.VERBOSE)

# Per-epoch loss curves, reused by the summary cell below.
v_loss = history.history['val_loss']
t_loss = history.history['loss']


sys.stdout.write('Saving model...                \r')

model.save(filepath)

sys.stdout.write('Testing model...                \r')

pred = model.predict(test_X)

resdf, preddf = generate_results(df_test, pred, verbose=True)

plot_train_valid_test(hp, t_loss, v_loss)

print ('Done in {}'.format(strftime('%H:%M:%S', gmtime(time() - t))))

In [None]:
# Aggregate the r2 / mape / smape / emfr columns of resdf, first by mean and
# then by median, and print one summary line for each.
mean_stats = (
    np.mean(resdf.r2),
    np.mean(resdf.mape),
    np.mean(resdf.smape),
    np.mean(resdf.emfr),
)
print('Mean R2 {:.2f}, mean MAPE: {:.1f}%, mean SMAPE: {:.1f}%, mean EMFR: {:.3f}%'
      .format(*mean_stats))

median_stats = (
    np.median(resdf.r2),
    np.median(resdf.mape),
    np.median(resdf.smape),
    np.median(resdf.emfr),
)
print ('Median R2 {:.2f}, median MAPE: {:.1f}%, median SMAPE: {:.1f}%, median EMFR: {:.3f}%'
      .format(*median_stats))

# Helper summaries from the star imports, then the loss curves once more.
print_sumation(resdf)
plot_train_valid_test(hp, t_loss, v_loss)


In [None]:
# Display the five rows with the lowest r2 (ascending sort, default head size).
resdf.sort_values(by="r2").head(5)
#resdf.sort_values("mape", ascending=False)


## Learning - Cross validation

In [None]:
# Cross-validated training on the full prepared frame. train_model_cv comes
# from the star imports at the top of the notebook (not visible here).
# NOTE: this rebinds resdf/preddf, replacing the fixed-split results above.
resdf, preddf = train_model_cv(hp, df)

In [None]:
# Summarise the cross-validation results currently held in resdf.
print_sumation(resdf)

In [None]:
# Persist predictions and metrics together with the input file name and
# skiprows setting that produced them. Output location is decided inside
# save_results (presumably under hp.RESULTS_PATH -- confirm in that helper).
save_results(hp, preddf, resdf, data_filename, data_skiprows)

In [None]:
# Full results table ordered from lowest to highest r2 (ascending is the default).
resdf.sort_values(by="r2")

# Other stuff

## Testing

In [None]:
# Evaluate with test_model_cv (from the star imports; presumably reuses models
# saved by the cross-validation run -- confirm in model_training).
# NOTE: this rebinds resdf/preddf again.
resdf, preddf = test_model_cv(hp, df)

In [None]:
# Summarise the results returned by the testing cell above.
print_sumation(resdf)

## Saving results

In [None]:
# Same call as after cross-validation: stores whatever resdf/preddf currently
# hold, along with the data file name and skiprows used to build df.
save_results(hp, preddf, resdf, data_filename, data_skiprows)

## Train model using all stations

In [None]:
# Use a separate file-name stem for the model trained on all stations.
hp.FILENAME = 'NoTestData'
# Derived paths (FILECORE_PATH, RESULTS_FILE) are not updated automatically
# when FILENAME changes, so recompute them explicitly.
hp.update_folders()

In [None]:
# Features/target for the whole frame (no held-out test rows), shuffled
# together so rows stay aligned.
train_X, train_y = shuffle(*get_XY(df))

In [None]:
# Train a final model on every station (no validation/test data) and save it.
# NOTE: this cell rebinds train_X, so re-run the cell above before re-running it.
hp.VERBOSE = 1
t = time()

if hp.NORMALIZE:
    # The train/test cell calls normalize(hp, train_X, test_X, ...); the
    # previous call here omitted `hp`, shifting every argument by one.
    # The test slot is passed positionally as None.
    normalized = normalize(hp, train_X, None, save_scaler=True)
    # With a test set normalize returns a (train, test) pair; what it returns
    # for test=None is not visible here, so accept both shapes -- TODO confirm.
    train_X = normalized[0] if isinstance(normalized, tuple) else normalized
else:
    train_X = train_X.values

sys.stdout.write('Preparing model...                 \r')

filepath = '{}.hdf5'.format(hp.FILECORE_PATH)

model, _ = create_model(hp, train_X.shape[1])

print ('EPOCHS: {}                                    '.format(hp.EPOCHS))
# summary() prints by itself and returns None, so it is not wrapped in print().
model.summary()
print ("-----")

sys.stdout.write('Fitting model ...                \r')

history = model.fit(train_X, train_y,
                    batch_size = hp.BATCH_SIZE,
                    epochs=hp.EPOCHS,
                    verbose=hp.VERBOSE)

t_loss = history.history['loss']


sys.stdout.write('Saving model...                \r')

model.save(filepath)

sys.stdout.write('Testing model...                \r')

# There is no validation loss in this run, so the training curve is passed for
# both series. `hp` is now supplied first, matching the other call sites.
plot_train_valid_test(hp, t_loss, t_loss)

print ('Done in {}'.format(strftime('%H:%M:%S', gmtime(time() - t))))