# Model 5 - Deep Learning

This model takes the information from the ETL module, and creates a deep learning algorithm that should predict call volumes for each zipcode based on weather. Let's see


In [1]:
import pandas as pd
import numpy as np

%matplotlib inline 
import matplotlib.pyplot as plt


#import keras
#from keras.layers import Dense
#from keras.models import Sequential
import sklearn
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.svm import SVC
print('Libraries imported.')


Libraries imported.


## Zipcodes

The zipcode data includes the cluster data that was generated in Model 4

In [2]:
df_zips = pd.read_csv("csvs/labeled_zips.csv", index_col=0, usecols=[1,12,13], dtype={1:'str',12:'int',13:'str'})
df_zips.shape

(211, 2)

## Feature Creation

The new feature was the cluster data generated in Model 4


In [3]:
df_311zip = pd.read_csv("csvs/top_ten_w_zips.csv", usecols=[3,5,6], dtype={0:'str',3:'str',5:'str',6:'str',})
df_311zip.rename(columns={'Date Only':'Date', 'Incident Zip':'Zip'}, inplace=True)

df_311zip.head()

Unnamed: 0,Complaint Type,Zip,Date
0,Noise - Residential,10023,2015-10-10
1,Noise - Residential,10023,2015-10-10
2,HEAT/HOT WATER,10023,2015-10-10
3,HEAT/HOT WATER,10023,2015-10-10
4,Noise - Residential,10023,2015-10-10


## Feature Extraction

Build a pivot table of the ratio of types of calls for each zipcode within the given timeframe

In [4]:
def getFeatureVector(normalize=True, minDate = '2018-05-28', maxDate='2019-05-28'):
    df = df_311zip#[df_311zip['Incident Zip'] == str(azip)]
    df = df[df['Date'] > minDate]
    df = df[df['Date'] <= maxDate]
    df['Count'] = 1
    n = df.shape[0]
    n = df.groupby('Zip').count()
    pt = pd.pivot_table(df,  index=['Zip'], columns=['Complaint Type'],
                  aggfunc=np.sum, fill_value=0)

    if (normalize):
        pt = pt.div(n, level=0)
    else:
        pt = pt.div(1.0, level=0)
        
    pt.Count.to_csv("temp.csv")
    df_feat2 = pd.read_csv("temp.csv", index_col=0)
    return df_zips.join(df_feat2, how="inner")
    
df_learn = getFeatureVector(False)
df_learn.head()

Unnamed: 0_level_0,kmeans_label,kmeans_color,Blocked Driveway,HEAT/HOT WATER,Illegal Parking,Noise - Residential,Noise - Street/Sidewalk,Request Large Bulky Item Collection,Street Condition,Street Light Condition,UNSANITARY CONDITION,Water System
Zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10001,2,blue,163.0,301.0,735.0,818.0,313.0,374.0,525.0,11.0,117.0,249.0
10002,0,green,136.0,1992.0,1653.0,2215.0,930.0,785.0,536.0,258.0,391.0,355.0
10003,0,green,51.0,1047.0,938.0,1232.0,733.0,1213.0,716.0,20.0,286.0,477.0
10004,2,blue,8.0,11.0,191.0,22.0,33.0,30.0,162.0,54.0,4.0,43.0
10005,2,blue,9.0,21.0,172.0,70.0,103.0,71.0,84.0,30.0,9.0,27.0


In [5]:
def scaleData(data):
    #normalize features
    scaler = MinMaxScaler(feature_range=(0, 1))
    return scaler.fit_transform(data)

In [6]:
feature_list = ['Blocked Driveway', 'HEAT/HOT WATER', 'Illegal Parking', 'Noise - Residential', 
                'Noise - Street/Sidewalk', 'Request Large Bulky Item Collection', 'Street Condition', 
                'Street Light Condition', 'UNSANITARY CONDITION', 'Water System']

#feature_list = ['HEAT/HOT WATER', 'Noise - Street/Sidewalk', 'Water System']

#feature_list = ['Blocked Driveway', 'HEAT/HOT WATER', 'Illegal Parking', 'Noise - Residential', 
#                'Noise - Street/Sidewalk', 'Street Condition', 
#                'Street Light Condition', 'UNSANITARY CONDITION', 'Water System']



In [26]:
from sklearn.neural_network import MLPClassifier

normalize = True
df_learn = getFeatureVector(normalize,minDate = '2018-05-28', maxDate='2019-05-28')

X_train = np.asarray(df_learn[feature_list])
y_train = np.asarray(df_learn[['kmeans_label']]).ravel()
X_train = scaleData(X_train)

df_learn = getFeatureVector(normalize, minDate = '2018-05-28', maxDate='2019-05-28')
X_test = np.asarray(df_learn[feature_list])
y_test = np.asarray(df_learn[['kmeans_label']]).ravel()
X_test = scaleData(X_test)

#X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)



Train set: (211, 10) (211,)
Test set: (211, 10) (211,)


In [27]:
#X = np.asarray(df_learn[feature_list])
#X = scaleData(X)
#y = np.asarray(df_learn[['kmeans_label']]).ravel()#
#
#X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
#print ('Train set:', X_train.shape,  y_train.shape)
#print ('Test set:', X_test.shape,  y_test.shape)



In [28]:
#X_train[:5]

### about MLP

MLP trains on two arrays: array X of size (n_samples, n_features), which holds the training samples represented as floating point feature vectors; and array y of size (n_samples,), which holds the target values (class labels) for the training sample

In [32]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(11), random_state=1)

In [33]:
clf.fit(X_train, y_train) 

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=11, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [34]:
#X_test = scaleData(X_test)

yhat = clf.predict(X_test)
{"ME":mean_absolute_error(y_test, yhat), "R2":r2_score(y_test, yhat), "MSE":mean_squared_error(y_test, yhat)}

{'ME': 0.004739336492890996,
 'R2': 0.9968201820483453,
 'MSE': 0.004739336492890996}

In [35]:
df_learn = getFeatureVector(normalize, minDate = '2018-04-28', maxDate='2019-04-28')
X_test = np.asarray(df_learn[feature_list])
y_test = np.asarray(df_learn[['kmeans_label']]).ravel()
X_test = scaleData(X_test)

yhat = clf.predict(X_test)
{"ME":mean_absolute_error(y_test, yhat), "R2":r2_score(y_test, yhat), "MSE":mean_squared_error(y_test, yhat)}

{'ME': 0.11428571428571428,
 'R2': 0.8346691020170369,
 'MSE': 0.23809523809523808}

In [35]:
df_learn = getFeatureVector(normalize, minDate = '2018-04-28', maxDate='2019-04-28')
X_test = np.asarray(df_learn[feature_list])
y_test = np.asarray(df_learn[['kmeans_label']]).ravel()
X_test = scaleData(X_test)

yhat = clf.predict(X_test)
{"ME":mean_absolute_error(y_test, yhat), "R2":r2_score(y_test, yhat), "MSE":mean_squared_error(y_test, yhat)}

{'ME': 0.11428571428571428,
 'R2': 0.8346691020170369,
 'MSE': 0.23809523809523808}