# Cluster Analysis

In this cluster analysis I develop a call signature for each zipcode based on the ratio of call types for the year ending in May 2019. After adjusting K a few times I found k=6 provides some interesting results, indicating different areas of the city definitely have different call signatures. 

In [1]:
import pandas as pd
import numpy as np
import operator

import pylab as pl
import scipy.optimize as opt
from sklearn import preprocessing
%matplotlib inline 
import matplotlib.pyplot as plt

from sklearn.metrics import jaccard_similarity_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

import json # library to handle JSON files

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')


#df_joined = pd.read_csv("~/Projects/CourseraCapstone/joined.csv")

Libraries imported.


## Load the data

Zipcode and 311 data are loaded

In [9]:
zips = pd.read_csv("csvs/nyc-zip-code-latitude-and-longitude-2.csv", index_col = 0, usecols=[1,4,5], dtype={0:'int', 1:'int', 4:'float', 5:'float'})

df_ten = pd.read_csv("csvs/zip_by_311.csv", dtype={1: 'str', })
df_ten.head()

Unnamed: 0,Zip,Blocked Driveway,HEAT/HOT WATER,Illegal Parking,Noise - Residential,Noise - Street/Sidewalk,Request Large Bulky Item Collection,Street Condition,Street Light Condition,UNSANITARY CONDITION,Water System
0,10001,0.0,0.111332,0.203446,0.228628,0.104042,0.088138,0.153082,0.003313,0.017893,0.060305
1,10002,0.0139318885448916,0.271156,0.186533,0.241744,0.067595,0.081269,0.063467,0.025026,0.025542,0.023736
2,10003,0.0081444759206798,0.211402,0.172096,0.188031,0.088173,0.157932,0.098442,0.002833,0.03187,0.041076
3,10004,0.0170212765957446,0.029787,0.331915,0.055319,0.059574,0.038298,0.33617,0.076596,0.008511,0.046809
4,10005,0.016,0.068,0.32,0.092,0.128,0.12,0.16,0.036,0.016,0.044


In [3]:
zips.head()

Unnamed: 0,Zip,Latitude,Longitude
10001,10001,40.750742,-73.99653
10002,10002,40.71704,-73.987
10003,10003,40.732509,-73.98935
10004,10004,40.699226,-74.04118
10005,10005,40.706019,-74.00858


## Set up color array 
Set up color array for visualization of clusters

In [4]:
#40.750742;-73.99653
latitude = 40.750742
longitude = -73.79653
map_nyc = folium.Map(location=[latitude, longitude], zoom_start=11)
colors = ['green','red','blue','yellow','orange','purple', 'pink', 'brown', 'white', 'black']

targets = [10002, 10035, 10463, 10003, 10027, 11206, 10021, 10036, 10029, 10024, 10454, 10025, 11372, 10009, 10470, 10026, 10471, 10040, 10010, 10474, 10128, 10019, 11233, 10011, 10473, 11415, 10016, 11238, 11374, 11213, 11224, 10001, 11230, 10023, 10301, 11104, 11225, 10461, 11210, 11207]
def map_it(targets=targets, color='red'):
    # create map of New York using latitude and longitude values
    #map_nyc = folium.Map(location=[latitude, longitude], zoom_start=11)

    for index, row in zips.iterrows():
        zipc, lat, lng = row 
        if (zipc in targets):
            label = '{}'.format(int(zipc))
            label = folium.Popup(label, parse_html=True)
            folium.CircleMarker(
                [lat, lng],
                radius=5,
                popup=label,
                color=color,
                fill=True,
                fill_color='#3186cc',
                fill_opacity=0.7,
                parse_html=False).add_to(map_nyc)
    
    
map_it()
map_nyc

In [5]:
df_ten.head()

Unnamed: 0,Zip,Blocked Driveway,HEAT/HOT WATER,Illegal Parking,Noise - Residential,Noise - Street/Sidewalk,Request Large Bulky Item Collection,Street Condition,Street Light Condition,UNSANITARY CONDITION,Water System
0,10001,0.0,0.111332,0.203446,0.228628,0.104042,0.088138,0.153082,0.003313,0.017893,0.060305
1,10002,0.0139318885448916,0.271156,0.186533,0.241744,0.067595,0.081269,0.063467,0.025026,0.025542,0.023736
2,10003,0.0081444759206798,0.211402,0.172096,0.188031,0.088173,0.157932,0.098442,0.002833,0.03187,0.041076
3,10004,0.0170212765957446,0.029787,0.331915,0.055319,0.059574,0.038298,0.33617,0.076596,0.008511,0.046809
4,10005,0.016,0.068,0.32,0.092,0.128,0.12,0.16,0.036,0.016,0.044


In [6]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
colors = ['green','red','blue','yellow','orange','purple']
map_nyc = folium.Map(location=[latitude, longitude], zoom_start=11)

#print(df_ten.head())

X = df_ten.drop(columns=['Zip']).to_numpy()
nbrs = NearestNeighbors(n_neighbors=40, radius=0.1, algorithm='ball_tree').fit(X)
distances, indices = nbrs.kneighbors(X)

for i in range(0,6):
    target = []
    for index, rows in pd.DataFrame(df_ten.iloc[indices[i*33]]['Zip']).iterrows():
        target.append(rows.Zip)
    map_it(target,colors[i])
    
map_nyc

In [7]:
from sklearn.cluster import KMeans
import numpy as np
X = df_ten.drop(columns=['Zip']).to_numpy()
kmeans = KMeans(n_clusters=6, random_state=0).fit(X)

df_tenx = df_ten
df_tenx['kmeans_label'] = kmeans.labels_
df_tenx['kmeans_color'] = df_tenx['kmeans_label'].apply(lambda x: colors[x%10])

colors = ['green','red','blue','yellow','orange','purple', 'brown', 'pink', 'white', 'black']
df_joined = df_tenx.join(zips, on="Zip", how="inner", rsuffix="R")

df_tenx.drop(columns=['kmeans_label','kmeans_color'], inplace=True)

# create map of New York using latitude and longitude values
map_nyc = folium.Map(location=[latitude, longitude], zoom_start=11)

for index, row in df_joined.iterrows():
    zipc, a,b,c, d,e,f, g,h,i,j, kmeans_label, c1, z2, lat, lng, = row 
    label = '{} {} {} {}'.format(c1, int(zipc), kmeans_label, c1)
    label = folium.Popup(label, parse_html=True)
#    print(zipc, kmeans_label, c1, '"{}" "{}" "{}"'.format(int(zipc), kmeans_label, c1))
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=c1,
#        fill=True,
#        fill_color='#3186cc',
#        fill_opacity=0.7,
        parse_html=False).add_to(map_nyc)

    
#map_it()
map_nyc



In [8]:
map_nyc.save('maps/map.html')
#df_joined[df_joined.kmeans_label.isin([0,2])].to_csv("csvs/labeled_zips.csv")
df_joined.to_csv("csvs/labeled_zips.csv")


