In [1]:
# Source: https://www.datacamp.com/community/tutorials/deep-learning-python

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import r2_score

Using TensorFlow backend.


In [48]:
# Read the dataset
dataset = pd.read_csv("merged_mmda_wwo_Taft Ave._2015.csv", skipinitialspace=True)

# Remove date time

dataset

Unnamed: 0,stationName,lineName,dt,statusN,statusS,tempC,tempF,windspeedMiles,windspeedKmph,winddirDegree,...,heatIndexC,heatIndexF,dewPointC,dewPointF,windChillC,windChillF,windGustMiles,windGustKmph,feelsLikeC,feelsLikeF
0,Taft Ave.,EDSA,9/1/2015 0:00,0.5,0.5,0.3000,0.305556,0.142857,0.113636,0.770538,...,0.344828,0.358491,0.692308,0.72,0.3000,0.305556,0.090909,0.097222,0.344828,0.358491
1,Taft Ave.,EDSA,9/1/2015 0:15,0.5,0.5,0.3000,0.305556,0.133929,0.113636,0.779037,...,0.344828,0.353774,0.692308,0.72,0.3000,0.305556,0.090909,0.097222,0.344828,0.353774
2,Taft Ave.,EDSA,9/1/2015 0:30,0.0,0.0,0.3000,0.305556,0.125000,0.113636,0.787535,...,0.344828,0.349057,0.692308,0.72,0.3000,0.305556,0.090909,0.097222,0.344828,0.349057
3,Taft Ave.,EDSA,9/1/2015 0:45,0.0,0.0,0.3000,0.305556,0.116071,0.113636,0.796034,...,0.344828,0.344340,0.692308,0.72,0.3000,0.305556,0.090909,0.097222,0.344828,0.344340
4,Taft Ave.,EDSA,9/1/2015 1:00,0.0,0.0,0.3000,0.305556,0.107143,0.113636,0.804533,...,0.344828,0.339623,0.692308,0.72,0.3000,0.305556,0.090909,0.097222,0.344828,0.339623
5,Taft Ave.,EDSA,9/1/2015 1:15,0.0,0.0,0.3000,0.305556,0.107143,0.107955,0.813739,...,0.344828,0.339623,0.692308,0.72,0.3000,0.305556,0.090909,0.097222,0.344828,0.339623
6,Taft Ave.,EDSA,9/1/2015 1:30,0.0,0.0,0.3000,0.305556,0.107143,0.102273,0.822946,...,0.344828,0.339623,0.692308,0.72,0.3000,0.305556,0.090909,0.097222,0.344828,0.339623
7,Taft Ave.,EDSA,9/1/2015 1:45,0.0,0.0,0.3000,0.305556,0.107143,0.096591,0.832153,...,0.344828,0.339623,0.692308,0.72,0.3000,0.305556,0.090909,0.097222,0.344828,0.339623
8,Taft Ave.,EDSA,9/1/2015 2:00,0.0,0.0,0.3000,0.305556,0.107143,0.090909,0.841360,...,0.344828,0.339623,0.692308,0.72,0.3000,0.305556,0.090909,0.097222,0.344828,0.339623
9,Taft Ave.,EDSA,9/1/2015 2:15,0.0,0.0,0.3000,0.305556,0.107143,0.090909,0.849858,...,0.336207,0.339623,0.711538,0.72,0.3000,0.305556,0.090909,0.097222,0.336207,0.339623


In [49]:
# Remove unused columms
cols_to_remove = list(range(0,4))
cols_to_remove = cols_to_remove + [6, 7, 17, 19, 21, 22, 25]

dataset.drop(dataset.columns[[cols_to_remove]], axis=1, inplace=True)

dataset

Unnamed: 0,statusS,tempC,windspeedKmph,winddirDegree,cond,precipMM,humidity,visibility,pressure,cloudcover,heatIndexC,dewPointC,windChillC,windGustKmph,feelsLikeC
0,0.5,0.3000,0.113636,0.770538,0.631579,0.0,0.869565,1.00000,0.724138,0.3300,0.344828,0.692308,0.3000,0.097222,0.344828
1,0.5,0.3000,0.113636,0.779037,0.631579,0.0,0.873188,1.00000,0.715517,0.3225,0.344828,0.692308,0.3000,0.097222,0.344828
2,0.0,0.3000,0.113636,0.787535,0.631579,0.0,0.876812,1.00000,0.706897,0.3150,0.344828,0.692308,0.3000,0.097222,0.344828
3,0.0,0.3000,0.113636,0.796034,0.631579,0.0,0.880435,1.00000,0.698276,0.3075,0.344828,0.692308,0.3000,0.097222,0.344828
4,0.0,0.3000,0.113636,0.804533,0.631579,0.0,0.884058,1.00000,0.689655,0.3000,0.344828,0.692308,0.3000,0.097222,0.344828
5,0.0,0.3000,0.107955,0.813739,0.710526,0.0,0.887681,0.96875,0.689655,0.2950,0.344828,0.692308,0.3000,0.097222,0.344828
6,0.0,0.3000,0.102273,0.822946,0.789474,0.0,0.891304,0.93750,0.689655,0.2900,0.344828,0.692308,0.3000,0.097222,0.344828
7,0.0,0.3000,0.096591,0.832153,0.868421,0.0,0.894928,0.90625,0.689655,0.2850,0.344828,0.692308,0.3000,0.097222,0.344828
8,0.0,0.3000,0.090909,0.841360,0.947368,0.0,0.898551,0.87500,0.689655,0.2800,0.344828,0.692308,0.3000,0.097222,0.344828
9,0.0,0.3000,0.090909,0.849858,0.947368,0.0,0.902174,0.87500,0.689655,0.2725,0.336207,0.711538,0.3000,0.097222,0.336207


# Correlate

In [50]:
corr = dataset.corr()
# cmap = mcolors.LinearSegmentedColormap.from_list("n",['#000066','#ffffff','#ff0000'])
# sns.heatmap(corr, 
#             xticklabels=corr.columns.values,
#             yticklabels=corr.columns.values, cmap=cmap)
# plt.show()
corr

Unnamed: 0,statusS,tempC,windspeedKmph,winddirDegree,cond,precipMM,humidity,visibility,pressure,cloudcover,heatIndexC,dewPointC,windChillC,windGustKmph,feelsLikeC
statusS,1.0,0.233793,-0.061697,0.147612,0.041832,-0.001916,-0.12938,-0.069184,0.012383,-0.035636,0.258625,0.246288,0.23414,-0.091736,0.258625
tempC,0.233793,1.0,0.057149,0.131597,0.516965,-0.094473,-0.866083,0.078984,-0.04826,-0.088222,0.962707,0.371528,0.999856,-0.106498,0.962707
windspeedKmph,-0.061697,0.057149,1.0,0.032419,0.071692,0.490596,-0.206757,-0.348905,-0.367229,0.250707,-0.030717,-0.265718,0.058165,0.949985,-0.030717
winddirDegree,0.147612,0.131597,0.032419,1.0,0.177716,0.260487,0.144877,-0.305434,-0.677325,0.358074,0.246062,0.550435,0.132156,0.093122,0.246062
cond,0.041832,0.516965,0.071692,0.177716,1.0,-0.044952,-0.451364,-0.047908,-0.122244,0.02437,0.497157,0.185154,0.517093,-0.005308,0.497157
precipMM,-0.001916,-0.094473,0.490596,0.260487,-0.044952,1.0,0.263911,-0.814142,-0.547106,0.518895,-0.021203,0.261165,-0.093506,0.618706,-0.021203
humidity,-0.12938,-0.866083,-0.206757,0.144877,-0.451364,0.263911,1.0,-0.284183,-0.174014,0.218607,-0.736196,0.10785,-0.865573,0.013607,-0.736196
visibility,-0.069184,0.078984,-0.348905,-0.305434,-0.047908,-0.814142,-0.284183,1.0,0.505505,-0.5626,-0.010396,-0.333743,0.07753,-0.497627,-0.010396
pressure,0.012383,-0.04826,-0.367229,-0.677325,-0.122244,-0.547106,-0.174014,0.505505,1.0,-0.463487,-0.141522,-0.414492,-0.049274,-0.460404,-0.141522
cloudcover,-0.035636,-0.088222,0.250707,0.358074,0.02437,0.518895,0.218607,-0.5626,-0.463487,1.0,-0.017338,0.20928,-0.08761,0.36764,-0.017338


# Split training and test dataset

In [139]:
# To-be Predicted variable
Y = dataset.statusS
Y = Y.round(5)

# Other data
# X = dataset.drop('statusS', axis=1) 
X = dataset
X.statusS = X.statusS.round(5)

# # Split the data up in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.67, random_state=42)

X

Unnamed: 0,statusS,tempC,windspeedKmph,winddirDegree,cond,precipMM,humidity,visibility,pressure,cloudcover,heatIndexC,dewPointC,windChillC,windGustKmph,feelsLikeC
0,0.5,0.3000,0.113636,0.770538,0.631579,0.0,0.869565,1.00000,0.724138,0.3300,0.344828,0.692308,0.3000,0.097222,0.344828
1,0.5,0.3000,0.113636,0.779037,0.631579,0.0,0.873188,1.00000,0.715517,0.3225,0.344828,0.692308,0.3000,0.097222,0.344828
2,0.0,0.3000,0.113636,0.787535,0.631579,0.0,0.876812,1.00000,0.706897,0.3150,0.344828,0.692308,0.3000,0.097222,0.344828
3,0.0,0.3000,0.113636,0.796034,0.631579,0.0,0.880435,1.00000,0.698276,0.3075,0.344828,0.692308,0.3000,0.097222,0.344828
4,0.0,0.3000,0.113636,0.804533,0.631579,0.0,0.884058,1.00000,0.689655,0.3000,0.344828,0.692308,0.3000,0.097222,0.344828
5,0.0,0.3000,0.107955,0.813739,0.710526,0.0,0.887681,0.96875,0.689655,0.2950,0.344828,0.692308,0.3000,0.097222,0.344828
6,0.0,0.3000,0.102273,0.822946,0.789474,0.0,0.891304,0.93750,0.689655,0.2900,0.344828,0.692308,0.3000,0.097222,0.344828
7,0.0,0.3000,0.096591,0.832153,0.868421,0.0,0.894928,0.90625,0.689655,0.2850,0.344828,0.692308,0.3000,0.097222,0.344828
8,0.0,0.3000,0.090909,0.841360,0.947368,0.0,0.898551,0.87500,0.689655,0.2800,0.344828,0.692308,0.3000,0.097222,0.344828
9,0.0,0.3000,0.090909,0.849858,0.947368,0.0,0.902174,0.87500,0.689655,0.2725,0.336207,0.711538,0.3000,0.097222,0.336207


# Normalize

In [140]:
# Scale the data with `StandardScaler`
X = StandardScaler().fit_transform(X)
X

array([[-0.17205682, -0.8340338 , -0.46651226, ..., -0.83433809,
        -0.4507312 , -0.76847605],
       [-0.17205682, -0.8340338 , -0.46651226, ..., -0.83433809,
        -0.4507312 , -0.76847605],
       [-1.53478423, -0.8340338 , -0.46651226, ..., -0.83433809,
        -0.4507312 , -0.76847605],
       ..., 
       [-0.17205682, -1.16326809,  0.01287055, ..., -1.16357348,
         0.17349007, -1.16888202],
       [-0.17205682, -1.16326809, -0.03070972, ..., -1.16357348,
         0.11674268, -1.16888202],
       [-0.17205682, -1.16326809, -0.07428997, ..., -1.16357348,
         0.05999529, -1.16888202]])

# Build the Model

In [141]:
# # Initialize the constructor
# model = Sequential()

# # Add an input layer 
# # Hidden units = 12
# # If you would allow more hidden units, your network will be able to learn more complex representations
# # but it will also be a more expensive operations that can be prone to overfitting.
# model.add(Dense(12, activation='relu', input_shape=(11,)))

# # Add one hidden layer 
# model.add(Dense(8, activation='relu'))

# # Add an output layer 
# model.add(Dense(1, activation='sigmoid'))


# Initialize the model
model = Sequential()

# Add input layer 
# input_dim = number of X.columns
model.add(Dense(64, input_dim=15, activation='relu'))

# Add one hidden layer 
model.add(Dense(8, activation='relu'))

# model.add(Dense(64, activation='relu'))
    
# Add output layer 
model.add(Dense(1))


# Model output shape
model.output_shape

# Model summary
model.summary()

# Model config
model.get_config()

# List all weight tensors 
model.get_weights()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_58 (Dense)             (None, 64)                1024      
_________________________________________________________________
dense_59 (Dense)             (None, 8)                 520       
_________________________________________________________________
dense_60 (Dense)             (None, 1)                 9         
Total params: 1,553
Trainable params: 1,553
Non-trainable params: 0
_________________________________________________________________


[array([[-0.2483802 ,  0.24217185, -0.19064045,  0.19831499,  0.1033107 ,
         -0.15804252,  0.07441825, -0.24360603, -0.18017492,  0.03427058,
          0.16839805,  0.18535548,  0.15771434, -0.00043583,  0.04886222,
         -0.17641032, -0.2289439 ,  0.08679214, -0.08043282, -0.07880062,
          0.16373551,  0.08473849, -0.150635  , -0.05847539, -0.09565338,
          0.19566324, -0.0470992 , -0.09462759, -0.20673567,  0.02722183,
         -0.0148147 ,  0.05523723, -0.1188295 , -0.11338383, -0.19522551,
          0.23450902, -0.08967936, -0.0047904 , -0.14766414,  0.04648   ,
          0.0927304 , -0.25498545, -0.24677758,  0.19280368,  0.16946012,
         -0.05696686,  0.21427277, -0.266765  , -0.25142708, -0.09301576,
         -0.09406146,  0.16380906, -0.09860487,  0.15338406, -0.07054229,
          0.19708925, -0.05418409, -0.24140161,  0.12478158,  0.24528506,
          0.18856159,  0.24695286,  0.03472158, -0.1495855 ],
        [ 0.07852119,  0.2620863 ,  0.08011684, -0

# Compile and Fit

In [142]:
model.compile(loss='mse',
              optimizer='adam',
              metrics=['mean_absolute_percentage_error'])
                   
model.fit(np.array(X_train), np.array(y_train),epochs=1000, batch_size=5, verbose=0)

<keras.callbacks.History at 0x12491b398d0>

# Predict Values

In [143]:
y_pred = model.predict(np.array(X_test))
print(y_pred[:5])
print(y_test[:5])

[[  1.00047910e+00]
 [  1.00031495e+00]
 [  3.91986687e-05]
 [  5.00632882e-01]
 [  1.00025356e+00]]
1142    1.0
1417    1.0
7012    0.0
1287    0.5
1701    1.0
Name: statusS, dtype: float64


# Evaluate Model

In [144]:
# Closer to zero are better
mse_value, mae_value = model.evaluate(np.array(X_test), np.array(y_test), verbose=0)

print('MSE = ' + '{0:f}'.format(mse_value))
print('MAE = ' + '{0:f}'.format(mae_value))

MSE = 0.000000
MAE = 8396.904322


In [145]:
# 1.0 = best
r2_score(y_test, y_pred)

0.99999838840605093