# Neural networks 

## Notebook settings

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from scipy.stats import describe
import seaborn as sns
from pylab import rcParams
#from model import *
from prediction import *
from data_processing import *
from utils import *

%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


# merge flow and pressure data

In [4]:
def _read_meter_csv(csv_path, measure):
    """Load one per-meter readings CSV and give its columns canonical names.

    The ``webid`` column is dropped and the four remaining columns are renamed
    positionally, so the file is assumed to hold (meter id, timestamp, raw value,
    clean value) in that order -- TODO confirm against the CSV headers.
    """
    # NOTE(review): parse_dates=True without index_col only targets the index, so
    # `timestamp` is presumably still text at this point -- confirm.
    readings = pd.read_csv(csv_path, parse_dates=True).drop(["webid"], axis=1)
    readings.columns = ['fk_network_meter', "timestamp", measure + '_value_raw', measure + '_value_clean']
    return readings


f_data = _read_meter_csv('../data/flow_200.csv', 'flow')
p_data = _read_meter_csv('../data/pressure_200.csv', 'pressure')

# Keep only the (meter, timestamp) pairs present in both files, then discard the raw readings
# so that only the cleaned flow and pressure values remain.
data = pd.merge(f_data, p_data, how="inner", on=["fk_network_meter", "timestamp"])
data = data.drop(["flow_value_raw", "pressure_value_raw"], axis=1)

# Generating time features

In [5]:
def _one_hot(values, prefix, index):
    """One-hot encode `values` under `prefix` and label the rows with `index`."""
    dummies = pd.get_dummies(values, prefix=prefix)
    dummies.index = index
    return dummies


# Work on a copy so the merged `data` frame stays available to later cells.
df = data.copy()
logger_ID = data.fk_network_meter.unique()

# Sort whole rows by time and only then promote the timestamp to the index.
# Sorting just the timestamp Series and assigning it as the index would attach the
# sorted timestamps positionally to rows that are still in their original order.
# mergesort is stable, so rows sharing a timestamp keep their relative order.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp', kind='mergesort').set_index('timestamp')

# NOTE(review): sales_by_storeitem is defined outside this notebook; judging by the
# preview below it yields one row per timestamp with per-logger columns -- confirm.
df = sales_by_storeitem(df)

# Quarter-hour slot of the day: 4 slots per hour plus the 15-minute slot within the hour.
timeofday = (4 * df.index.hour + df.index.minute / 15).astype(int)
timeofday_df = _one_hot(timeofday, 'timeofday', df.index)
weekday_df = _one_hot(df.index.dayofweek, 'dayofweek', df.index)
month_df = _one_hot(df.index.month, 'month', df.index)

# Calendar dummies first, then the readings.
df_total = pd.concat([timeofday_df, weekday_df, month_df, df], axis=1)

In [6]:
# Preview the combined frame: calendar one-hot columns followed by the reading columns.
df_total.head()

Unnamed: 0_level_0,timeofday_0,timeofday_1,timeofday_2,timeofday_3,timeofday_4,timeofday_5,timeofday_6,timeofday_7,timeofday_8,timeofday_9,...,logger_DM04357_flow,logger_DM04357_pressure,logger_DM01051_flow,logger_DM01051_pressure,logger_DM00871_flow,logger_DM00871_pressure,logger_DM04345_flow,logger_DM04345_pressure,logger_DM04305_flow,logger_DM04305_pressure
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-02-13 13:00:00,0,0,0,0,0,0,0,0,0,0,...,2.34677,50.84835,6.451262,35.053207,0.0,57.715015,6.605722,73.87113,2.45882,52.808895
2019-02-13 13:15:00,0,0,0,0,0,0,0,0,0,0,...,2.938961,47.445274,10.84792,34.347572,0.0,57.419304,13.343393,68.17918,0.515221,53.373135
2019-02-13 13:30:00,0,0,0,0,0,0,0,0,0,0,...,2.434211,49.137295,14.203143,35.55082,0.0,56.46079,14.070147,66.267586,-0.15708,56.19499
2019-02-13 13:45:00,0,0,0,0,0,0,0,0,0,0,...,2.295981,51.29227,9.057214,34.867615,0.0,56.79729,5.564808,73.933685,1.223127,59.054226
2019-02-13 14:00:00,0,0,0,0,0,0,0,0,0,0,...,2.624801,51.598858,4.354248,34.627647,0.0,57.990337,13.885842,66.51978,0.862891,60.89716


# Generating lag features: the previous 4 readings (1 hour) are used as input features

In [11]:
def _is_lag_col(col):
    """True for columns whose name continues after the measure name ('_flow_' / '_pressure_')."""
    return '_flow_' in col or '_pressure_' in col


def _is_reading_col(col):
    """True for flow/pressure columns whose name does not continue after the measure name."""
    return ('_flow' in col and '_flow_' not in col) or ('_pressure' in col and '_pressure_' not in col)


# NOTE(review): stack_shifted_sales is defined outside this notebook; per the section
# heading it presumably appends copies of each logger's readings shifted by 1-4 steps.
# Shifting would leave NaNs in the earliest rows and no dropna is applied here --
# confirm that missing values are handled downstream.
df_total = stack_shifted_sales(df_total, logger_ID, days_deltas=[1, 2, 3, 4])

# Partition the columns into un-suffixed readings, suffixed (lag) readings, and the rest.
sales_cols = sorted(col for col in df_total.columns if _is_reading_col(col))
stacked_sales_cols = sorted(col for col in df_total.columns if _is_lag_col(col))

# Build the lookup set once instead of once per column.
reading_like = set(sales_cols) | set(stacked_sales_cols)
other_cols = [col for col in df_total.columns if col not in reading_like]

# Final layout: remaining columns (original order), then lag columns, then readings.
new_cols = other_cols + stacked_sales_cols + sales_cols
df_total = df_total.reindex(columns=new_cols)


In [13]:
# Preview the frame again to inspect the current column layout and values.
df_total.head()

Unnamed: 0_level_0,timeofday_0,timeofday_1,timeofday_2,timeofday_3,timeofday_4,timeofday_5,timeofday_6,timeofday_7,timeofday_8,timeofday_9,...,logger_ZM00503_flow,logger_ZM00503_pressure,logger_ZM02057_flow,logger_ZM02057_pressure,logger_ZM10672_flow,logger_ZM10672_pressure,logger_ZM30356_flow,logger_ZM30356_pressure,logger_ZM30424_flow,logger_ZM30424_pressure
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-02-13 13:00:00,0,0,0,0,0,0,0,0,0,0,...,0.228985,76.567986,1.909451,,-0.9989,-6.493,3.6033,40.1365,12.62,84.78
2019-02-13 13:15:00,0,0,0,0,0,0,0,0,0,0,...,0.208592,76.23226,2.945934,,-0.9989,-6.493,3.7153,40.6354,12.553333,86.93
2019-02-13 13:30:00,0,0,0,0,0,0,0,0,0,0,...,0.226009,77.1555,3.953406,,-0.9989,-6.493,3.6954,41.0089,12.913333,86.933334
2019-02-13 13:45:00,0,0,0,0,0,0,0,0,0,0,...,0.229095,76.60614,3.278242,,-0.9989,-6.493,3.6248,40.3124,12.366667,82.44
2019-02-13 14:00:00,0,0,0,0,0,0,0,0,0,0,...,0.210636,77.72013,1.64044,,-0.9989,-6.493,2.8439,28.9897,12.19,82.263336


# Scale features to the range [0, 1], separately for each logger column

In [14]:
# Min-max scale every non-calendar column into [0, 1].
# NOTE(review): MinMaxScaler is not imported explicitly in this notebook; assuming it is
# scikit-learn's, each column is rescaled independently by its own min and max -- confirm.
# NOTE(review): no NaN check is enforced before scaling -- confirm missing values are acceptable here.
scaler = MinMaxScaler(feature_range=(0, 1))

# Calendar dummies are already 0/1, so any column whose name mentions them is left alone.
calendar_tags = ('timeofday', 'dayofweek', 'month')
cols_to_scale = [
    name for name in df_total.columns
    if not any(tag in name for tag in calendar_tags)
]

# Fit on the whole frame, then write the scaled values back over the same columns.
scaled_cols = scaler.fit_transform(df_total[cols_to_scale])
df_total[cols_to_scale] = scaled_cols


In [15]:
def _pick_columns(predicate):
    """Return the df_total column names, in frame order, for which predicate(name) is true."""
    return [name for name in df_total.columns if predicate(name)]


# Input features are assembled from three groups, concatenated in this order:
# columns tagged '_past_', calendar columns, and un-suffixed pressure columns.
X_cols_stacked = _pick_columns(lambda name: '_past_' in name)
X_cols_caldata = _pick_columns(
    lambda name: any(tag in name for tag in ('timeofday_', 'dayofweek_', 'month_', 'year'))
)
X_cols_pressure = _pick_columns(lambda name: '_pressure' in name and '_pressure_' not in name)
X_cols = X_cols_stacked + X_cols_caldata + X_cols_pressure

In [16]:
# Inputs: the columns listed in X_cols.
X = df_total[X_cols]

# Targets: every remaining column, kept in frame order.
input_names = set(X_cols)
y_cols = [name for name in df_total.columns if name not in input_names]
y = df_total[y_cols]

In [17]:
# Preview the first rows of the target frame.
y.head()

Unnamed: 0_level_0,logger_DM00030_flow,logger_DM00031_flow,logger_DM00033_flow,logger_DM00034_flow,logger_DM00035_flow,logger_DM00036_flow,logger_DM00037_flow,logger_DM00038_flow,logger_DM00039_flow,logger_DM00040_flow,...,logger_DM06910_flow,logger_DM06911_flow,logger_DM06913_flow,logger_DM06914_flow,logger_DM06915_flow,logger_ZM00503_flow,logger_ZM02057_flow,logger_ZM10672_flow,logger_ZM30356_flow,logger_ZM30424_flow
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-02-13 13:00:00,0.054575,0.705654,0.070107,0.744657,0.621883,0.304961,0.50398,0.384394,0.320209,0.474264,...,0.47619,0.187575,0.49212,0.920278,0.235008,0.481555,0.141005,0.0,0.941865,0.946078
2019-02-13 13:15:00,0.563711,0.772992,0.319003,0.732086,0.064831,0.184365,0.039929,0.656501,0.209605,0.692552,...,0.404762,0.530118,0.401862,0.783797,0.782577,0.391316,0.353323,0.0,0.971076,0.941176
2019-02-13 13:30:00,0.071591,0.572191,0.47744,0.062061,0.054857,0.587832,0.621345,0.647015,0.19553,0.543728,...,0.47619,0.842114,0.55086,0.740529,0.347583,0.468389,0.559697,0.0,0.965886,0.967647
2019-02-13 13:45:00,0.667843,0.189274,0.502521,0.814451,0.846898,0.432684,0.035819,0.220672,0.233096,0.508611,...,0.428571,0.546422,0.088825,0.631575,0.302921,0.482041,0.421394,0.0,0.947473,0.927451
2019-02-13 14:00:00,0.66204,0.552657,0.550396,0.063449,0.275783,0.200376,0.393321,0.791378,0.186686,0.733357,...,0.52381,0.54196,0.25,0.557713,0.617615,0.400359,0.0859,0.0,0.743806,0.914461


In [18]:
# Report the (rows, columns) shape of the input and target frames.
print(X.shape, y.shape)

(2877, 1554) (2877, 161)


# Alternatively, use the one-step custom data-generation function

In [282]:
# One-call alternative to building the features step by step. This rebinds X, y, scaler,
# cols_to_scale and y_cols, replacing any values computed earlier in the session.
# NOTE(review): generate_data is not defined in this notebook (presumably it comes from a
# star import) -- confirm it reproduces the same feature pipeline.
X, y,  scaler,  cols_to_scale, y_cols = generate_data(data)