# Model 5 - Deep Learning

This model takes the information from the ETL module, and creates a deep learning algorithm that should predict call volumes for each zipcode based on weather. Let's see


In [1]:
import pandas as pd
import numpy as np

%matplotlib inline 
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.metrics import jaccard_similarity_score

from sklearn import preprocessing
from sklearn.svm import SVC
print('Libraries imported.')


Libraries imported.


In [2]:
def poly2(feature, degree=2, we=0):
    a = df_joined['Complaint Type']==feature
    
    if (we == 1): #weekday
        a = (df_joined['dow'].isin([0,1,2,3,4])) & a 
    if (we == 2): #weekend
        a = df_joined['dow'].isin([5,6]) & a

#    print (a.head())        

    m = pd.DataFrame()
    m['Temperature'] = df_joined['DailyAverageDryBulbTemperature'][a]
    m['Number of Calls'] = df_joined['Count'][a]

    m2 = pd.DataFrame(m.groupby('Temperature').mean())
    m2.reset_index(inplace=True)

    X = np.asarray(m2['Temperature'])
    y = np.asarray(m2['Number of Calls'])

    X = X[:, np.newaxis]
    y = y[:, np.newaxis]

    polynomial_features= PolynomialFeatures(degree=degree)
    x_poly = polynomial_features.fit_transform(X)

    model = LinearRegression()
    model.fit(x_poly, y)
    y_poly_pred = model.predict(x_poly)

    rmse = np.sqrt(mean_squared_error(y,y_poly_pred))
    r2 = r2_score(y,y_poly_pred)
    print("RSME = ",rmse, " r2 = ",r2)


    plt.scatter(X, y, s=10)
    # sort the values of x before line plot
    sort_axis = operator.itemgetter(0)
    sorted_zip = sorted(zip(X,y_poly_pred), key=sort_axis)
    X, y_poly_pred = zip(*sorted_zip)
    plt.plot(X, y_poly_pred, color='m')
    plt.show()

#poly2('HEAT/HOT WATER')

## Zipcodes

The zipcode data includes the cluster data that was generated in Model 4

In [3]:
df_zips = pd.read_csv("csvs/labeled_zips.csv", index_col=0, usecols=[1,12,13], dtype={1:'str',12:'int',13:'str'})
df_zips.head()

Unnamed: 0_level_0,kmeans_label,kmeans_color
Zip,Unnamed: 1_level_1,Unnamed: 2_level_1
10001,2,blue
10002,0,green
10003,0,green
10004,2,blue
10005,2,blue


## Feature Creation

The new feature was the cluster data generated in Model 4


In [4]:
df_311zip = pd.read_csv("csvs/top_ten_w_zips.csv", usecols=[3,5,6], dtype={0:'str',3:'str',5:'str',6:'str',})
df_311zip.rename(columns={'Date Only':'Date'}, inplace=True)

df_311zip.head()

Unnamed: 0,Complaint Type,Incident Zip,Date
0,Noise - Residential,10023,2015-10-10
1,Noise - Residential,10023,2015-10-10
2,HEAT/HOT WATER,10023,2015-10-10
3,HEAT/HOT WATER,10023,2015-10-10
4,Noise - Residential,10023,2015-10-10


In [23]:
def getFeatureVector(azip=10001, minDate = '2018-05-29', maxDate='2019-05-29'):
    df = df_311zip[df_311zip['Incident Zip'] == str(azip)]
    df = df[df['Date'] >= minDate]
    df = df[df['Date'] <= maxDate]
    df['Count'] = 1
    n = df.shape[0]
    pt = pd.pivot_table(df,  index=['Incident Zip'], columns=['Complaint Type'],
                  aggfunc=np.sum, fill_value=0)

    pt = pt.div(n)
    pt['label'] = df_zips.loc[azip].kmeans_label
    return pt
    #return df_zips.join(pt, on="Incident Zip")
    
getFeatureVector()

Unnamed: 0_level_0,Count,Count,Count,Count,Count,Count,Count,Count,Count,Count,label
Complaint Type,Blocked Driveway,HEAT/HOT WATER,Illegal Parking,Noise - Residential,Noise - Street/Sidewalk,Request Large Bulky Item Collection,Street Condition,Street Light Condition,UNSANITARY CONDITION,Water System,Unnamed: 11_level_1
Incident Zip,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
10001,0.045202,0.083472,0.203827,0.226844,0.0868,0.103716,0.145591,0.00305,0.032446,0.069052,2


In [47]:
df_learn = pd.DataFrame()

for index, row in df_zips.iterrows():
    df_learn = df_learn.append(getFeatureVector(index))
    
df_learn

Unnamed: 0_level_0,Count,Count,Count,Count,Count,Count,Count,Count,Count,Count,label
Complaint Type,Blocked Driveway,HEAT/HOT WATER,Illegal Parking,Noise - Residential,Noise - Street/Sidewalk,Request Large Bulky Item Collection,Street Condition,Street Light Condition,UNSANITARY CONDITION,Water System,Unnamed: 11_level_1
Incident Zip,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
10001,0.045202,0.083472,0.203827,0.226844,0.086800,0.103716,0.145591,0.003050,0.032446,0.069052,2
10002,0.014701,0.215328,0.178683,0.239434,0.100530,0.084856,0.057940,0.027889,0.042266,0.038374,0
10003,0.007597,0.155966,0.139729,0.183525,0.109191,0.180694,0.106659,0.002979,0.042604,0.071056,0
10004,0.014337,0.019713,0.342294,0.039427,0.059140,0.053763,0.290323,0.096774,0.007168,0.077061,2
10005,0.015101,0.035235,0.288591,0.117450,0.172819,0.119128,0.140940,0.050336,0.015101,0.045302,2
10006,0.027197,0.048117,0.351464,0.098326,0.108787,0.150628,0.106695,0.039749,0.035565,0.033473,2
10007,0.013344,0.070892,0.322769,0.106756,0.082569,0.096747,0.168474,0.059216,0.004170,0.075063,2
10009,0.019946,0.168575,0.081328,0.329945,0.111697,0.134217,0.046455,0.001158,0.061253,0.045425,0
10010,0.010238,0.131993,0.166728,0.204022,0.092870,0.135649,0.112980,0.001097,0.046435,0.097989,2
10011,0.020015,0.134961,0.112009,0.172053,0.089056,0.192618,0.105215,0.004774,0.064635,0.104664,2


In [50]:
df_learn = df_learn.fillna(0)

In [65]:
#print (df_learn.columns)
df_learn['Count']
         
#         , 'HEAT/HOT WATER', 'Illegal Parking', 'Noise - Residential', 'Noise - Street/Sidewalk', 'Request Large Bulky Item Collection', 'Street Condition', 'Street Light Condition', 'UNSANITARY CONDITION', 'Water System']])


Complaint Type,Blocked Driveway,HEAT/HOT WATER,Illegal Parking,Noise - Residential,Noise - Street/Sidewalk,Request Large Bulky Item Collection,Street Condition,Street Light Condition,UNSANITARY CONDITION,Water System
Incident Zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10001,0.045202,0.083472,0.203827,0.226844,0.086800,0.103716,0.145591,0.003050,0.032446,0.069052
10002,0.014701,0.215328,0.178683,0.239434,0.100530,0.084856,0.057940,0.027889,0.042266,0.038374
10003,0.007597,0.155966,0.139729,0.183525,0.109191,0.180694,0.106659,0.002979,0.042604,0.071056
10004,0.014337,0.019713,0.342294,0.039427,0.059140,0.053763,0.290323,0.096774,0.007168,0.077061
10005,0.015101,0.035235,0.288591,0.117450,0.172819,0.119128,0.140940,0.050336,0.015101,0.045302
10006,0.027197,0.048117,0.351464,0.098326,0.108787,0.150628,0.106695,0.039749,0.035565,0.033473
10007,0.013344,0.070892,0.322769,0.106756,0.082569,0.096747,0.168474,0.059216,0.004170,0.075063
10009,0.019946,0.168575,0.081328,0.329945,0.111697,0.134217,0.046455,0.001158,0.061253,0.045425
10010,0.010238,0.131993,0.166728,0.204022,0.092870,0.135649,0.112980,0.001097,0.046435,0.097989
10011,0.020015,0.134961,0.112009,0.172053,0.089056,0.192618,0.105215,0.004774,0.064635,0.104664


In [70]:
from sklearn.neural_network import MLPClassifier


#print(df_learn[['label']])

X = np.asarray(df_learn[['Count']])
#'Blocked Driveway', 'HEAT/HOT WATER', 'Illegal Parking', 'Noise - Residential', 'Noise - Street/Sidewalk', 'Request Large Bulky Item Collection', 'Street Condition', 'Street Light Condition', 'UNSANITARY CONDITION', 'Water System']])
y = np.asarray(df_learn[['label']])

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

#y_train = np.nan_to_num(y_train)
#df = pd.DataFrame(y_train)
#df[df.isin([np.nan, np.inf, -np.inf]).any(1)]
#y

Train set: (168, 10) (168, 1)
Test set: (43, 10) (43, 1)


### about MLP

MLP trains on two arrays: array X of size (n_samples, n_features), which holds the training samples represented as floating point feature vectors; and array y of size (n_samples,), which holds the target values (class labels) for the training sample

In [71]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10), random_state=1)

In [72]:
y.shape

(211, 1)

In [73]:
clf.fit(X_train, y_train) 

  y = column_or_1d(y, warn=True)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=10, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [74]:
yhat = clf.predict(X_test)

In [75]:
mean_absolute_error(y_test, yhat)

0.09302325581395349

In [76]:
r2_score(y_test, yhat)

0.833976833976834

In [77]:
mean_squared_error(y_test, yhat)

0.18604651162790697