# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [180]:
import pandas as pd
import numpy as np
import folium


In [181]:
#check if notebook runs in colab
# Colab is detected by looking for the 'google.colab' module among the
# modules that are already loaded in this kernel.
import sys
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
# `path` is the directory prefix later used to locate the data files;
# default is the parent directory (presumably a local checkout of the repo -- confirm).
path='..'
if IN_COLAB:
  #in colab, we need to clone the data from the repo
  # NOTE(review): the shell escape below is IPython syntax, so this cell only
  # runs inside a notebook kernel, not as a plain Python script.
  !git clone https://github.com/keuperj/DataScienceSS20.git
  path='DataScienceSS20'

running in Colab: False


In [182]:
# Read the cleaned training set that was saved after wrangling and
# pre-processing in block I; `path` was chosen by the Colab check above.
train_csv = path + '/DATA/train_cleaned.csv'
train = pd.read_csv(train_csv)

In [183]:
# Select only the columns with the ride coordinates.
# Column order matters downstream: (pickup lat, pickup lon, dropoff lat, dropoff lon).
coordinate_columns = [
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
]
coordinates = train[coordinate_columns]

## Clustering approach from the lecture
We will use simple K-Means clustering:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [184]:
from sklearn.cluster import KMeans

In [185]:
#define number of clusters and create instance
clusters = 100
# `n_jobs` is no longer passed: it was deprecated in scikit-learn 0.23 and
# removed in 1.0, where it raises a TypeError. Parallelism is handled
# internally by scikit-learn now.
# `random_state` makes the cluster numbering reproducible across runs, and
# `n_init=10` pins the historical default so results do not depend on the
# installed scikit-learn version.
myKMeans = KMeans(n_clusters=clusters, n_init=10, random_state=42)

In [186]:
#train model
# Use only the first 100,000 rides to keep the fit fast.
# np.asarray accepts both the original DataFrame and the ndarray this cell
# leaves behind, so the cell can be re-run without failing on `.to_numpy()`.
coordinates = np.asarray(coordinates)[:100_000, :]
# Fit on exactly the array that later cells index with `myKMeans.labels_`,
# so labels and rows are guaranteed to line up.
myKMeans.fit(coordinates)

In [187]:
#get cluster centers
# One row per cluster; the columns follow the order of the fitted data
# (pickup lat, pickup lon, dropoff lat, dropoff lon), which is how the
# map-drawing cell indexes them.
centers=myKMeans.cluster_centers_
    

In [188]:
#draw map: green: start, red: end
# Each cluster center is drawn as one "typical ride": a green pickup marker,
# a red dropoff marker and a black line connecting the two.
cluster_map = folium.Map(location=[40.730610, -73.935242], zoom_start=12)
# Iterate over the rows of `centers` directly so the loop can never get out of
# sync with the number of centers the fitted model actually has.
for start_lat, start_lon, end_lat, end_lon in centers:
    # fill=True is required: folium does not fill circle markers by default,
    # so fill_opacity alone would have no visible effect.
    folium.CircleMarker([start_lat, start_lon], radius=3,
                        color="green",
                        fill=True,
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([end_lat, end_lon], radius=3,
                        color="red",
                        fill=True,
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([[start_lat, start_lon], [end_lat, end_lon]],
                    color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [189]:
# Bare last expression: renders the folium map as the cell output.
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [190]:
def show_cluster(cluster_number, target_map=None, model=None, data=None):
    """Draw one K-Means cluster (its center and all member rides) on a folium map.

    Parameters
    ----------
    cluster_number : int
        Index of the cluster to draw, in ``range(model.n_clusters)``.
    target_map : folium.Map, optional
        Map the markers are added to. Defaults to the notebook-level ``mymap``.
    model : fitted KMeans, optional
        Model used to assign rides to clusters. Defaults to the notebook-level
        ``myKMeans``.
    data : array-like of shape (n_rides, 4), optional
        Ride coordinates in the order (pickup lat, pickup lon, dropoff lat,
        dropoff lon). Defaults to the notebook-level ``coordinates``.

    Returns
    -------
    folium.Map
        The map that was drawn on (same object as ``target_map``).

    Raises
    ------
    ValueError
        If ``cluster_number`` is not a valid cluster index of ``model``.
    """
    # Fall back to the notebook globals so that `show_cluster(57)` keeps working.
    if target_map is None:
        target_map = mymap
    if model is None:
        model = myKMeans
    if data is None:
        data = coordinates
    data = np.asarray(data)

    if not 0 <= cluster_number < model.n_clusters:
        raise ValueError(
            "cluster_number must be in [0, %d), got %r"
            % (model.n_clusters, cluster_number)
        )

    # predict() maps every ride to its nearest cluster center, so this also
    # works for rides that were not part of the training subset.
    labels = model.predict(data)
    members = data[labels == cluster_number]

    # Members: small green (pickup) and red (dropoff) markers.
    # fill=True is needed because folium does not fill circle markers by
    # default, which would make fill_opacity a no-op.
    for start_lat, start_lon, end_lat, end_lon in members:
        folium.CircleMarker([start_lat, start_lon], radius=2, color="green",
                            fill=True, fill_opacity=0.9).add_to(target_map)
        folium.CircleMarker([end_lat, end_lon], radius=2, color="red",
                            fill=True, fill_opacity=0.9).add_to(target_map)

    # Cluster center: larger, darker markers joined by a line, drawn last so
    # they stay visible on top of the member markers.
    center_start_lat, center_start_lon, center_end_lat, center_end_lon = (
        model.cluster_centers_[cluster_number]
    )
    folium.CircleMarker([center_start_lat, center_start_lon], radius=6,
                        color="darkgreen", fill=True,
                        fill_opacity=0.9).add_to(target_map)
    folium.CircleMarker([center_end_lat, center_end_lon], radius=6,
                        color="darkred", fill=True,
                        fill_opacity=0.9).add_to(target_map)
    folium.PolyLine([[center_start_lat, center_start_lon],
                     [center_end_lat, center_end_lon]],
                    color="black", weight=2.5, opacity=1).add_to(target_map)

    return target_map

In [191]:
# Fresh, empty map for the single-cluster view; show_cluster() adds its
# markers to this `mymap` object.
map_center = [40.730610, -73.935242]
mymap = folium.Map(location=map_center, zoom_start=12)

In [192]:
# Add the rides of cluster 57 (presumably an arbitrary example cluster) to
# `mymap`, then display the map as the cell output.
show_cluster(57)
mymap

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [194]:
def cluster_var(cluster_number, data=None, labels=None):
    """Compute and print the intra- and extra-cluster variance of one cluster.

    The intra-cluster variance is the per-column variance of all rides
    assigned to ``cluster_number``; the extra-cluster variance is the
    per-column variance of all remaining rides.

    Parameters
    ----------
    cluster_number : int
        Label of the cluster to evaluate.
    data : array-like of shape (n_rides, n_columns), optional
        Ride coordinates. Defaults to the notebook-level ``coordinates``.
    labels : array-like of shape (n_rides,), optional
        Cluster label of every row in ``data``. Defaults to
        ``myKMeans.labels_``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(intra_var, extra_var)``, one value per column of ``data``.
        A cluster without members yields NaN for its intra-cluster variance.
    """
    # Fall back to the notebook globals so that `cluster_var(i)` keeps working.
    if data is None:
        data = coordinates
    if labels is None:
        labels = myKMeans.labels_
    data = np.asarray(data)
    labels = np.asarray(labels)

    # One boolean mask, reused (negated) for the complement.
    in_cluster = labels == cluster_number
    intra_var = data[in_cluster].var(axis=0)
    extra_var = data[~in_cluster].var(axis=0)

    print("intra varianz of cluster",cluster_number, ':',intra_var)
    print("extra varianz of cluster",cluster_number, ':',extra_var)
    # Return the values as well, so callers can compare clusters numerically
    # instead of reading them off the printed output.
    return intra_var, extra_var
    
# Report the variances of every cluster of the currently fitted model.
# The cluster count is read from the model instead of being hard-coded, so
# the loop stays correct when the number of clusters is changed.
for i in range(myKMeans.n_clusters):
    cluster_var(i)

intra varianz of cluster 0 : [2.73993567e-05 2.63520284e-05 2.91517415e-05 3.86691922e-05]
extra varianz of cluster 0 : [0.00091607 0.00154464 0.00111608 0.00146521]
intra varianz of cluster 1 : [4.59026694e-05 3.02281424e-05 3.90400417e-05 2.64173214e-05]
extra varianz of cluster 1 : [0.00090277 0.00154014 0.00109979 0.00146374]
intra varianz of cluster 2 : [0.00034131 0.00237302 0.00146614 0.00513892]
extra varianz of cluster 2 : [0.00089766 0.00150465 0.00108669 0.00140676]
intra varianz of cluster 3 : [4.04529449e-05 4.40527727e-05 4.50652520e-05 3.10579382e-05]
extra varianz of cluster 3 : [0.00091323 0.00153537 0.00110045 0.00145515]
intra varianz of cluster 4 : [4.61573282e-05 4.33919270e-05 2.52071245e-04 1.80735021e-04]
extra varianz of cluster 4 : [0.00088018 0.00144677 0.00108584 0.00143193]
intra varianz of cluster 5 : [2.01948392e-28 2.01948392e-28 2.01948392e-28 2.01948392e-28]
extra varianz of cluster 5 : [0.00081075 0.00134482 0.00100015 0.00126941]
intra varianz of clu

intra varianz of cluster 52 : [2.72134548e-05 3.47180274e-05 3.89887273e-05 3.58421222e-05]
extra varianz of cluster 52 : [0.00090214 0.00153587 0.0010961  0.0014591 ]
intra varianz of cluster 53 : [0.00018847 0.00012457 0.00017878 0.00011249]
extra varianz of cluster 53 : [0.00088102 0.00151208 0.00109219 0.00143745]
intra varianz of cluster 54 : [1.22428850e-04 6.51276455e-05 7.09424337e-05 6.34767530e-05]
extra varianz of cluster 54 : [0.0008983  0.00151341 0.00108483 0.00143687]
intra varianz of cluster 55 : [0.00011331 0.000106   0.00060036 0.00057413]
extra varianz of cluster 55 : [0.00089844 0.00148473 0.00107741 0.00143186]
intra varianz of cluster 56 : [6.26088104e-05 4.39542872e-05 2.93030102e-05 2.23454118e-05]
extra varianz of cluster 56 : [0.0009079  0.00153255 0.00108263 0.00145653]
intra varianz of cluster 57 : [0.00028612 0.00011742 0.00015312 0.0001685 ]
extra varianz of cluster 57 : [0.0008978  0.0015082  0.00106313 0.00134976]
intra varianz of cluster 58 : [0.0002804

In [None]:
# `dif` was never defined anywhere in the notebook, so it is built here.
# Score per cluster: extra-cluster minus intra-cluster variance, summed over
# the four coordinate columns to obtain a single number per cluster.
# A large score means the cluster is tight compared to the rest of the data.
ride_coords = np.asarray(coordinates)
dif = np.array([
    (ride_coords[myKMeans.labels_ != k].var(axis=0)
     - ride_coords[myKMeans.labels_ == k].var(axis=0)).sum()
    for k in range(myKMeans.n_clusters)
])
# Cluster indices ordered from smallest to largest score.
dif.argsort()

In [198]:
#define number of clusters and create instance
clusters = 10
# `n_jobs` is no longer passed: it was deprecated in scikit-learn 0.23 and
# removed in 1.0, where it raises a TypeError.
# `random_state` makes the run reproducible; `n_init=10` pins the historical
# default so results do not depend on the installed scikit-learn version.
myKMeans = KMeans(n_clusters=clusters, n_init=10, random_state=42)

# Use only the first 100,000 rides to keep the fit fast. np.asarray makes the
# cell work whether `coordinates` is still a DataFrame or already an ndarray.
coordinates = np.asarray(coordinates)[:100_000, :]
myKMeans.fit(coordinates)

centers = myKMeans.cluster_centers_

#draw map: green: start, red: end
cluster_map = folium.Map(location=[40.730610, -73.935242], zoom_start=12)
# Iterate over the rows of `centers` directly so the loop always matches the
# number of centers of the fitted model.
for start_lat, start_lon, end_lat, end_lon in centers:
    # fill=True is required: folium does not fill circle markers by default,
    # so fill_opacity alone would have no visible effect.
    folium.CircleMarker([start_lat, start_lon], radius=3,
                        color="green",
                        fill=True,
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([end_lat, end_lon], radius=3,
                        color="red",
                        fill=True,
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([[start_lat, start_lon], [end_lat, end_lon]],
                    color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [199]:
# Bare last expression: renders the k=10 cluster-center map as the cell output.
cluster_map

In [200]:
# Report the variances of every cluster of the re-fitted (k=10) model.
# The cluster count is read from the model instead of being hard-coded.
for i in range(myKMeans.n_clusters):
    cluster_var(i)

intra varianz of cluster 0 : [0.00026608 0.000339   0.00119282 0.00144457]
extra varianz of cluster 0 : [0.00090562 0.00127656 0.00108403 0.00142448]
intra varianz of cluster 1 : [0.00039261 0.00020733 0.00045052 0.00018785]
extra varianz of cluster 1 : [0.00084878 0.00183095 0.00089985 0.00169586]
intra varianz of cluster 2 : [0.00037159 0.00022421 0.00046638 0.00057676]
extra varianz of cluster 2 : [0.00089493 0.00173643 0.00084476 0.00150865]
intra varianz of cluster 3 : [0.00054689 0.00109889 0.00084692 0.00066203]
extra varianz of cluster 3 : [0.00090052 0.00150937 0.0010139  0.00114474]
intra varianz of cluster 4 : [0.01521072 0.01583351 0.08670942 0.14607481]
extra varianz of cluster 4 : [0.00074876 0.00123192 0.00099066 0.00125321]
intra varianz of cluster 5 : [0.00047746 0.00058817 0.00119409 0.00072436]
extra varianz of cluster 5 : [0.00091002 0.00153314 0.00107367 0.00119508]
intra varianz of cluster 6 : [0.00303323 0.00339667 0.00644365 0.00767287]
extra varianz of cluster 

Weniger Cluster führen zu größeren Clustern, die dann auch eine höhere interne Varianz haben.
Bei der externen Varianz zeigen sich dagegen keine merklichen Unterschiede.