# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [38]:
import pandas as pd
import numpy as np
import folium


In [39]:
#check if notebook runs in colab
import sys
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
path='..'
if IN_COLAB:
  #in colab, we need to clone the data from the repo
  !git clone https://github.com/keuperj/DataScienceSS20.git
  path='DataScienceSS20'

running in Colab: False


In [40]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv(path+'/DATA/train_cleaned.csv')

In [41]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [42]:
from sklearn.cluster import KMeans

In [43]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [44]:
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster
coordinates = coordinates.to_numpy()[:100000,:]

In [45]:
#get cluster centers
centers=myKMeans.cluster_centers_
    

In [46]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [47]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [121]:
def show_cluster(cluster_number):
        #print("ClusterSize: ", np.count_nonzero(myKMeans.labels_ == cluster_number))
        clusterCoords = coordinates[myKMeans.labels_ == cluster_number]
        for i in range(np.count_nonzero(myKMeans.labels_ == cluster_number)):
            folium.CircleMarker([clusterCoords[i,0],clusterCoords[i,1]],radius = 2, color="green",fill_opacity=0.9).add_to(mymap)
            folium.CircleMarker([clusterCoords[i,2],clusterCoords[i,3]],radius = 2, color="red",fill_opacity=0.9).add_to(mymap)   

In [171]:
mymap = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
#a = myKMeans.labels_ == 3
#b = coordinates[a]
#print("length: ", np.count_nonzero(myKMeans.labels_ == 3))
#print(myKMeans.labels_.size)
#coordinates[0:10]

In [175]:
#show_cluster()
show_cluster(41)
mymap

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [161]:
dif = np.zeros(100)
def cluster_var(cluster_number):
    clusterCoords = coordinates[myKMeans.labels_ == cluster_number]
    otherCoords = coordinates[myKMeans.labels_ != cluster_number]
    intra_var = clusterCoords.var()
    extra_var = otherCoords.var()
    print("intra varianz of cluster",cluster_number, ':',intra_var)
    print("extra varianz of cluster",cluster_number, ':',extra_var)
    dif[i] = extra_var - intra_var
    
for i in range(100):
    cluster_var(i)
#clusterCoords[0:10]

intra varianz of cluster 0 : 3291.890889083145
extra varianz of cluster 0 : 3290.477374986586
intra varianz of cluster 1 : 3291.060903718253
extra varianz of cluster 1 : 3290.4955702402144
intra varianz of cluster 2 : 3288.333265600144
extra varianz of cluster 2 : 3290.5198452652326
intra varianz of cluster 3 : 3277.76465286699
extra varianz of cluster 3 : 3290.5104354838213
intra varianz of cluster 4 : 3282.7264239562323
extra varianz of cluster 4 : 3290.521029802886
intra varianz of cluster 5 : 3290.687777469302
extra varianz of cluster 5 : 3290.504461661902
intra varianz of cluster 6 : 3282.4659601268604
extra varianz of cluster 6 : 3290.5427566370126
intra varianz of cluster 7 : 3289.8772120253398
extra varianz of cluster 7 : 3290.524855302893
intra varianz of cluster 8 : 3290.8263834971813
extra varianz of cluster 8 : 3290.497560243966
intra varianz of cluster 9 : 3283.809191339881
extra varianz of cluster 9 : 3290.5085443101034
intra varianz of cluster 10 : 3287.7774430091413
ext

intra varianz of cluster 95 : 3292.4889942556547
extra varianz of cluster 95 : 3290.4998638884745
intra varianz of cluster 96 : 3299.75501900125
extra varianz of cluster 96 : 3290.5059571248175
intra varianz of cluster 97 : 3288.6085482806616
extra varianz of cluster 97 : 3290.513205380609
intra varianz of cluster 98 : 3290.6182844897744
extra varianz of cluster 98 : 3290.5041541561905
intra varianz of cluster 99 : 3283.315832733176
extra varianz of cluster 99 : 3290.50920636813


In [165]:
dif.argsort()

array([43, 86, 59, 41, 96, 15, 42, 28, 49, 23, 24, 89, 95, 60, 70, 48, 13,
        0, 85, 14, 54, 65, 74, 50, 21, 53, 55, 68, 63, 46, 93, 67,  1, 91,
       36, 29, 82,  8, 57, 31, 18, 56, 27,  5, 64, 98, 44, 52, 87, 35, 79,
       34, 94, 83,  7, 58, 72, 69, 40, 88, 19, 47, 73, 97, 90, 20, 26,  2,
       38, 11, 10, 17, 77, 62, 39, 30, 61, 75, 66, 33,  9, 99, 16, 76,  4,
        6, 51, 84, 71, 22, 32, 92, 45, 37,  3, 81, 78, 12, 25, 80],
      dtype=int64)