# Exercise: Geographical Cluster Analysis of Taxi Rides 
# By: Christian Wegert
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [1]:
import pandas as pd
import numpy as np
import folium


In [2]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [3]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [4]:
from sklearn.cluster import KMeans

In [18]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [19]:
%%time
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster
#myKMeans.fit(coordinates.to_numpy())



Wall time: 36.9 s


KMeans(n_clusters=100, n_jobs=-1)

In [20]:
#get cluster centers
centers=myKMeans.cluster_centers_
labels = myKMeans.labels_  

In [21]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [22]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [23]:
y_kmeans = myKMeans.predict(coordinates)

In [24]:
coordinates_array = coordinates.to_numpy()


In [25]:
def show_cluster(cluster_number):
    if cluster_number > clusters:
        print("Cluster Number not available")
    elif cluster_number < 0:
        print("Cluster Number not available")
    else:
        print("Cluster Number: %d" %cluster_number)
        cluster_center_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
        #draw all start- and endpoints
        for i in range(coordinates_array[y_kmeans==cluster_number].shape[0]):
            folium.CircleMarker([coordinates_array[y_kmeans==cluster_number][i,0], coordinates_array[y_kmeans==cluster_number][i,1]], radius=3, color="green",fill_opacity=0.2).add_to(cluster_center_map)
            folium.CircleMarker([coordinates_array[y_kmeans==cluster_number][i,2], coordinates_array[y_kmeans==cluster_number][i,3]], radius=3,color="red",fill_opacity=0.2).add_to(cluster_center_map)
        #draw center start- and endpoint
        folium.CircleMarker([centers[cluster_number,0], centers[cluster_number,1]], radius=150, color="green", fill_opacity=0.9).add_to(cluster_center_map)        
        folium.CircleMarker([centers[cluster_number,2], centers[cluster_number,3]], radius=150, color="red", fill_opacity=0.9).add_to(cluster_center_map)
        #draw middle between start- and endpoint (real center?)
        MP_X = (centers[cluster_number,0]+centers[cluster_number,2])/2 #Middlepoint X-coordinates
        MP_Y = (centers[cluster_number,1]+centers[cluster_number,3])/2 #Middlepoint Y-coordinates
        folium.CircleMarker((MP_X, MP_Y), radius=300, color="black", fill_opacity=0.9).add_to(cluster_center_map)
        display(cluster_center_map)

In [28]:
#use function show_cluster(cluster_number)
show_cluster(1)

Cluster Number: 1


# Notes
Red Circle = Dropoff Points

Green Circle = Pickup Point

Black Circle = Middlepoint between Green and Red Circles (basically includes all points!)

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [14]:
def cluster_var_extra(cluster_number):
    
    #compute extra variance
    MP_X_1 = (centers[cluster_number,0]+centers[cluster_number,2])/2 #Middlepoint X-coordinates
    MP_Y_1 = (centers[cluster_number,1]+centers[cluster_number,3])/2 #Middlepoint Y-coordinates
    MP1 = np.array([MP_X_1,MP_Y_1])
    MP_X_2 = 0 #Middlepoint X-coordinates
    MP_Y_2 = 0 #Middlepoint Y-coordinates
    Distances_Middle_point = np.zeros(clusters)
    Distances_Pickup_point = np.zeros(clusters)
    Distances_Dropoff_point = np.zeros(clusters)
    
    for i in range(clusters):
        if i != cluster_number:
            MP_X_2 = (centers[i,0]+centers[i,2])/2 
            MP_Y_2 = (centers[i,1]+centers[i,3])/2  
            MP2 = np.array([MP_X_2, MP_Y_2])
            Distances_Middle_point[i]=np.linalg.norm(MP2-MP1)
    Distances__Middle_point_without_cluster_number = np.delete(Distances_Middle_point,cluster_number)
    print("Extra Variance computed with Middlepoint of routes:")
    print(Distances__Middle_point_without_cluster_number.var())
    
    for i in range(clusters):
        if i != cluster_number:  
             Distances_Pickup_point[i]=np.linalg.norm(centers[cluster_number][:2] - centers[i][:2])
    Distances_Pickup_point_without_cluster_number = np.delete(Distances_Pickup_point,cluster_number)
    print("Extra Variance computed with Pickup points:")
    print(Distances_Pickup_point_without_cluster_number.var())
    
    
    for i in range(clusters):
        if i != cluster_number:  
             Distances_Dropoff_point[i]=np.linalg.norm(centers[cluster_number][2:] - centers[i][2:])
    Distances_Dropoff_point_without_cluster_number = np.delete(Distances_Dropoff_point,cluster_number)
    print("Extra Variance computed with Dropoff points:")
    print(Distances_Dropoff_point_without_cluster_number.var())

In [15]:
cluster_var_extra(2)

Extra Variance computed with Middlepoint of routes:
0.0477875993269144
Extra Variance computed with Pickup points:
0.055760755223731154
Extra Variance computed with Dropoff points:
0.03731388566455334


In [16]:
def cluster_var_intra(cluster_number):
    
    #compute intra variance
    MP_X_1 = (centers[cluster_number,0]+centers[cluster_number,2])/2 #Middlepoint 1 X-coordinates (Cluster Middlepoint)
    MP_Y_1 = (centers[cluster_number,1]+centers[cluster_number,3])/2 #Middlepoint 1 Y-coordinates (Cluster Middlepoint)
    MP1 = np.array([MP_X_1,MP_Y_1])
    Distances_Middle_point = np.zeros(coordinates_array[y_kmeans==cluster_number].shape[0])
    Distances_Pickup_point = np.zeros(coordinates_array[y_kmeans==cluster_number].shape[0])
    Distances_Dropoff_point = np.zeros(coordinates_array[y_kmeans==cluster_number].shape[0])
    MP_X_2 = 0 #Middlepoint 2 X-coordinates (Coordinates Middlepoint)
    MP_Y_2 = 0 #Middlepoint 2 Y-coordinates (Coordinates Middlepoint)
    
    for i in range(coordinates_array[y_kmeans==cluster_number].shape[0]):
                MP_X_2 = (coordinates_array[y_kmeans==cluster_number][i,0]+coordinates_array[y_kmeans==cluster_number][i,2])/2 
                MP_Y_2 = (coordinates_array[y_kmeans==cluster_number][i,1]+coordinates_array[y_kmeans==cluster_number][i,3])/2  
                MP2 = np.array([MP_X_2, MP_Y_2])
                Distances_Middle_point[i]=np.linalg.norm(MP2-MP1)
    Distances__Middle_point_without_cluster_number = np.delete(Distances_Middle_point,cluster_number)
    print("Intra Variance computed with Middlepoint of routes:")
    print(Distances__Middle_point_without_cluster_number.var())
    
    for i in range(coordinates_array[y_kmeans==cluster_number].shape[0]):  
             Distances_Pickup_point[i]=np.linalg.norm(centers[cluster_number][:2] - coordinates_array[y_kmeans==cluster_number][i][:2])
    Distances_Pickup_point_without_cluster_number = np.delete(Distances_Pickup_point,cluster_number)
    print("Intra Variance computed with Pickup points:")
    print(Distances_Pickup_point_without_cluster_number.var())
    
    for i in range(coordinates_array[y_kmeans==cluster_number].shape[0]):  
             Distances_Pickup_point[i]=np.linalg.norm(centers[cluster_number][2:] - coordinates_array[y_kmeans==cluster_number][i][2:])
    Distances_Dropoff_point_without_cluster_number = np.delete(Distances_Pickup_point,cluster_number)
    print("Intra Variance computed with Dropoff points:")
    print(Distances_Dropoff_point_without_cluster_number.var())


In [17]:
cluster_var_intra(2)

Intra Variance computed with Middlepoint of routes:
0.0003469084121368196
Intra Variance computed with Pickup points:
0.00016874327722649556
Intra Variance computed with Dropoff points:
0.0012682585991394683


# Summary

## k = 10 (Compute for cluster 2)

## Extra:

Extra Variance computed with Middlepoint of routes:
0.0477875993269144

Extra Variance computed with Pickup points:
0.055760755223731154

Extra Variance computed with Dropoff points:
0.03731388566455334

## Intra
Intra Variance computed with Middlepoint of routes:
0.0003469084121368196

Intra Variance computed with Pickup points:
0.00016874327722649556

Intra Variance computed with Dropoff points:
0.0012682585991394683

## k = 100 (Compute for cluster 2)

## Extra:
Extra Variance computed with Middlepoint of routes:
0.02190840052709791

Extra Variance computed with Pickup points:
0.03323136224119104

Extra Variance computed with Dropoff points:
0.026169866932606797

## Intra
Intra Variance computed with Middlepoint of routes:
2.1522962874796834e-05

Intra Variance computed with Pickup points:
6.25531947910461e-05

Intra Variance computed with Dropoff points:
3.864982894438022e-05



# Result:
The results with k = 10 are worse (higher varaince), as expected!

With k = 100, we have More Cluster -> Less Variance -> Better Results!