# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [1]:
import pandas as pd
import numpy as np
import folium


In [2]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [3]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [4]:
from sklearn.cluster import KMeans

In [5]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [6]:
%%time
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster

Wall time: 1min 58s


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=100, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [7]:
#get cluster centers
centers=myKMeans.cluster_centers_
print(centers)

[[ 40.76020503 -73.98023098  40.77729031 -73.95473511]
 [ 40.64683298 -73.78497428  40.75012687 -73.9860982 ]
 [ 40.74310979 -73.98586217  40.72572851 -73.98971301]
 [ 40.75062501 -73.97994544  40.71627936 -73.85027159]
 [ 41.366138   -73.137393    41.366138   -73.137393  ]
 [ 40.77081879 -73.86955081  40.75502209 -73.98096271]
 [ 40.74001214 -73.99340282  40.78493045 -73.97676202]
 [ 40.7928477  -73.97084947  40.79590694 -73.96876957]
 [ 40.64665218 -73.78684729  40.65428428 -73.78820135]
 [ 41.366138   -73.137393    40.7498045  -73.9601465 ]
 [ 40.73189804 -73.99586028  40.71382009 -74.00887761]
 [ 40.77546219 -73.95686793  40.78425829 -73.97579544]
 [ 40.76521855 -73.97355692  40.77023251 -73.86944254]
 [ 40.71793667 -73.83710399  40.71819034 -73.8288431 ]
 [ 41.02262825 -73.53695725  41.0471595  -73.47550775]
 [ 40.73406152 -73.99225447  40.65309475 -73.95063107]
 [ 40.76570849 -73.97332744  40.64654139 -73.78526375]
 [ 40.72947314 -73.99770023  40.76002823 -73.97616325]
 [ 40.7608

In [8]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [9]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [10]:
#Train Data
coordinates_ex1 = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

In [11]:
K_Mean_2=KMeans(n_clusters=clusters, n_jobs=-1)

In [12]:
#Modell trainieren
K_Mean_2.fit(coordinates_ex1.to_numpy()[:100000,:])

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=100, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [13]:
#mapping
Ex1_pred = K_Mean_2.predict(coordinates_ex1.to_numpy())
print(Ex1_pred)

[58 24 48 ... 38 52 86]


In [14]:
def show_cluster(clustNumb, pred, centers):
    #print(pred)
    #print(clustNumb)
    #print(pred==clustNumb)    
    clust = coordinates_ex1.to_numpy()[pred==clustNumb]
    print(clust)
    mapClust = folium.Map(location = [40.730610,-73.935242],zoom_start = 12)
    einträge = np.shape(clust)[0]
    for i in range(clusters):
        folium.CircleMarker([centers[i,0], centers[i,1]], radius=4,                
                            color="purple", 
                            fill_opacity=0.9
                           ).add_to(mapClust)
        folium.CircleMarker([centers[i,2], centers[i,3]], radius=4,                
                            color="blue", 
                            fill_opacity=0.9
                           ).add_to(mapClust)
    print(einträge)
    return mapClust

In [15]:
show_cluster(70,Ex1_pred,centers)

[[ 40.751662 -73.980002  40.764842 -73.973802]
 [ 40.751902 -73.979815  40.755481 -73.979446]
 [ 40.747858 -73.985382  40.76207  -73.978377]
 ...
 [ 40.738927 -73.987005  40.757217 -73.965767]
 [ 40.7435   -73.972607  40.755248 -73.973562]
 [ 40.744716 -73.985297  40.759496 -73.974518]]
10840


## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [16]:
from sklearn import cluster

In [17]:
print(clusters)
print(centers.size)

100
400


In [18]:
#https://stackoverflow.com/questions/58369811/is-there-a-way-to-get-intracluster-distances-for-k-means-in-python

In [19]:
import scipy.spatial
def euclidean_distances(kmean):
        kmean_euc=scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(kmean.T, metric='euclidean'))
        return kmean_euc

In [20]:
dists = euclidean_distances(myKMeans.cluster_centers_)
tri_dists = dists[np.triu_indices(4, 1)]
max_dist, avg_dist, min_dist = tri_dists.max(), tri_dists.mean(), tri_dists.min()

print(max_dist, avg_dist, min_dist)

1147.0401409583349 765.1266850859502 1.3486592262094925


In [26]:
Allcor = coordinates.to_numpy()[:100000,:]
Ex2_pred = K_Mean_2.predict(coordinates_ex1.to_numpy()[:100000,:])

In [27]:
def cluster_var(clustNumb, pred, center):   
    minNP = np.min(clustNumb, axis=0)
    maxNP = np.max(clustNumb, axis=0)
    dist = maxNP - minNP
    for i in range(center):
        clustNP = clustNumb[pred==i]    
        varianNP = np.var(clustNP, axis=0)
        print(i, varianNP)

In [28]:
cluster_var(Allcor, Ex2_pred, 100)

0 [5.06022673e-05 3.05959524e-05 5.91998128e-05 3.70552980e-05]
1 [3.87527235e-05 3.27789624e-05 2.94756506e-05 3.30299744e-05]
2 [0.00024166 0.0001493  0.00024032 0.00030044]
3 [2.01948392e-28 2.01948392e-28 2.01948392e-28 2.01948392e-28]
4 [2.89926924e-05 4.67638230e-05 3.63279566e-05 2.38846037e-05]
5 [6.21397325e-05 1.11697986e-04 1.51424422e-04 1.11211574e-04]
6 [2.36211930e-04 1.73555687e-04 3.62496126e-05 8.37559311e-05]
7 [0.00082689 0.00026562 0.00076233 0.0010178 ]
8 [2.20966273e-05 2.82308900e-05 2.94530961e-05 3.15821180e-05]
9 [5.04870979e-29 2.01948392e-28 5.43847910e-04 2.74371804e-03]
10 [0.00010755 0.00076368 0.00118879 0.00060891]
11 [4.30373207e-05 2.82779924e-05 3.90000446e-05 2.58983762e-05]
12 [0.00026053 0.000136   0.00010425 0.00011458]
13 [0.0068917  0.00292105 0.00753065 0.00463313]
14 [1.27484444e-04 1.13951234e-04 9.82900074e-05 1.02225940e-04]
15 [0.00030084 0.0001988  0.00026113 0.00014932]
16 [2.37033607e-04 1.56403785e-04 6.66074121e-05 6.97239278e-05]
1

In [30]:
cluster_var(Allcor, Ex2_pred, 10)

0 [5.06022673e-05 3.05959524e-05 5.91998128e-05 3.70552980e-05]
1 [3.87527235e-05 3.27789624e-05 2.94756506e-05 3.30299744e-05]
2 [0.00024166 0.0001493  0.00024032 0.00030044]
3 [2.01948392e-28 2.01948392e-28 2.01948392e-28 2.01948392e-28]
4 [2.89926924e-05 4.67638230e-05 3.63279566e-05 2.38846037e-05]
5 [6.21397325e-05 1.11697986e-04 1.51424422e-04 1.11211574e-04]
6 [2.36211930e-04 1.73555687e-04 3.62496126e-05 8.37559311e-05]
7 [0.00082689 0.00026562 0.00076233 0.0010178 ]
8 [2.20966273e-05 2.82308900e-05 2.94530961e-05 3.15821180e-05]
9 [5.04870979e-29 2.01948392e-28 5.43847910e-04 2.74371804e-03]
