# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [2]:
import pandas as pd
import numpy as np
import folium


In [3]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [4]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [5]:
from sklearn.cluster import KMeans

In [6]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [7]:
%%time
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster

Wall time: 2min 1s


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=100, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [8]:
#get cluster centers
centers=myKMeans.cluster_centers_
print(centers)

[[ 40.74457858 -73.98349498  40.75547183 -73.9740478 ]
 [ 40.77580882 -73.95701903  40.78426995 -73.9757661 ]
 [ 40.64521873 -73.78248183  40.64258662 -73.95216067]
 [ 41.366138   -73.137393    41.366138   -73.137393  ]
 [ 40.72759042 -73.9967709   40.68401658 -73.98380773]
 [ 40.74947141 -73.97989386  40.71334778 -73.83845845]
 [ 40.77986899 -73.95090316  40.79883587 -73.94448086]
 [ 40.75681951 -73.98002072  40.77130577 -73.86849662]
 [ 40.80955887 -73.86537031  40.86408847 -73.86789117]
 [ 41.366138   -73.137393    40.7498045  -73.9601465 ]
 [ 40.75414649 -73.97698166  40.72701841 -73.98875908]
 [ 40.64668951 -73.78690061  40.65332601 -73.78840602]
 [ 40.77134696 -73.8705723   40.78887469 -73.95661537]
 [ 40.77105935 -73.86918738  40.75302814 -73.98222003]
 [ 40.64677103 -73.78548561  40.75429441 -73.9830786 ]
 [ 40.74697617 -73.98815017  40.77446329 -73.95539671]
 [ 40.78721777 -73.9633468   40.84171397 -73.93421782]
 [ 40.75111521 -73.98687285  40.68938503 -74.17812481]
 [ 40.7281

In [9]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [10]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [36]:
#Train Data
coordinates_ex1 = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

In [73]:
K_Mean_2=KMeans(n_clusters=800, n_jobs=-1)

In [None]:
#Modell trainieren
K_Mean_2.fit(coordinates_ex1.to_numpy()[:100000,:])

In [None]:
#mapping
Ex1_pred = K_Mean_2.predict(coordinates_ex1.to_numpy())
print(Ex1_pred)

In [None]:
def show_cluster(clustNumb, pred, centers):
    #print(pred)
    #print(clustNumb)
    #print(pred==clustNumb)    
    clust = coordinates_ex1.to_numpy()[pred==clustNumb]
    #print(clust)
    mapClust = folium.Map(location = [40.730610,-73.935242],zoom_start = 12)
    einträge = np.shape(clust)[0]
    print(einträge)
    for i in range(einträge):
        folium.CircleMarker([centers[i,0], centers[i,1]], radius=4,                
                            color="purple", 
                            fill_opacity=0.9
                           ).add_to(mapClust)
        folium.CircleMarker([centers[i,2], centers[i,3]], radius=4,                
                            color="blue", 
                            fill_opacity=0.9
                           ).add_to(mapClust)
        folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(mapClust)
    print(type(einträge))
    return mapClust

In [None]:
show_cluster(60,Ex1_pred,centers)

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [17]:
from sklearn import cluster

In [18]:
print(clusters)
print(centers.size)

100
400


In [19]:
#https://stackoverflow.com/questions/58369811/is-there-a-way-to-get-intracluster-distances-for-k-means-in-python

In [20]:
import scipy.spatial
def euclidean_distances(kmean):
        kmean_euc=scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(kmean.T, metric='euclidean'))
        return kmean_euc

In [21]:
dists = euclidean_distances(myKMeans.cluster_centers_)
tri_dists = dists[np.triu_indices(4, 1)]
max_dist, avg_dist, min_dist = tri_dists.max(), tri_dists.mean(), tri_dists.min()

print(max_dist, avg_dist, min_dist)

1146.981538747813 765.0694139716857 1.326717478248975


In [22]:
Allcor = coordinates.to_numpy()[:100000,:]
Ex2_pred = K_Mean_2.predict(coordinates_ex1.to_numpy()[:100000,:])

In [23]:
def cluster_var(clustNumb, pred, center):   
    minNP = np.min(clustNumb, axis=0)
    maxNP = np.max(clustNumb, axis=0)
    dist = maxNP - minNP
    for i in range(center):
        clustNP = clustNumb[pred==i]    
        varianNP = np.var(clustNP, axis=0)
        print(i, varianNP)

In [24]:
cluster_var(Allcor, Ex2_pred, 100)

0 [7.09602302e-05 3.57988160e-05 5.49155646e-05 3.71068940e-05]
1 [2.99950791e-05 3.72970743e-05 4.45282063e-05 4.96155205e-05]
2 [6.80781700e-06 4.53357834e-05 9.24771345e-04 1.36247065e-03]
3 [7.75345855e-05 6.22794192e-05 4.69049618e-05 6.35232550e-05]
4 [2.01948392e-28 2.01948392e-28 2.01948392e-28 2.01948392e-28]
5 [4.12142171e-05 4.20204786e-05 3.79983615e-05 3.96065532e-05]
6 [2.64596142e-05 2.54090296e-05 2.83537493e-05 3.24890700e-05]
7 [1.14184879e-04 8.97516433e-05 6.01033052e-04 5.77857645e-04]
8 [0.00028013 0.00012183 0.00013492 0.00025163]
9 [5.04870979e-29 2.01948392e-28 5.43847910e-04 2.74371804e-03]
10 [6.71195653e-05 6.47969003e-05 2.95591693e-04 3.55801381e-04]
11 [0.00031202 0.00035554 0.0002643  0.0003664 ]
12 [1.14029702e-04 1.06424786e-04 7.50337069e-05 8.49513656e-05]
13 [3.09086803e-05 4.57033277e-05 3.50414994e-05 2.47014222e-05]
14 [3.15797013e-05 4.23743406e-05 5.92550407e-05 5.85583266e-05]
15 [2.19304772e-04 9.99362078e-05 1.79045310e-04 1.35085785e-04]
16

In [25]:
cluster_var(Allcor, Ex2_pred, 10)

0 [7.09602302e-05 3.57988160e-05 5.49155646e-05 3.71068940e-05]
1 [2.99950791e-05 3.72970743e-05 4.45282063e-05 4.96155205e-05]
2 [6.80781700e-06 4.53357834e-05 9.24771345e-04 1.36247065e-03]
3 [7.75345855e-05 6.22794192e-05 4.69049618e-05 6.35232550e-05]
4 [2.01948392e-28 2.01948392e-28 2.01948392e-28 2.01948392e-28]
5 [4.12142171e-05 4.20204786e-05 3.79983615e-05 3.96065532e-05]
6 [2.64596142e-05 2.54090296e-05 2.83537493e-05 3.24890700e-05]
7 [1.14184879e-04 8.97516433e-05 6.01033052e-04 5.77857645e-04]
8 [0.00028013 0.00012183 0.00013492 0.00025163]
9 [5.04870979e-29 2.01948392e-28 5.43847910e-04 2.74371804e-03]
