# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [1]:
import pandas as pd
import numpy as np
import folium


In [2]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [3]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [4]:
from sklearn.cluster import KMeans

In [5]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [6]:
%%time
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster



Wall time: 2min 20s


KMeans(n_clusters=100, n_jobs=-1)

In [7]:
#get cluster centers
centers=myKMeans.cluster_centers_

In [8]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [9]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [10]:
def show_cluster(cluster_number, train_data, coordinates, centers):
    # Get classification of samples
    train_predict = train_data.predict(coordinates.to_numpy()[:100000,:])
    
    # Filter array to only have the index equal to cluster number
    cluster_index = np.where(train_predict == cluster_number)
    
    # Show circles
    cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
    for i in np.nditer(cluster_index):
        folium.CircleMarker([coordinates.loc[i].pickup_latitude, coordinates.loc[i].pickup_longitude], radius=3,                
                            color="green", 
                            fill_opacity=0.9
                            ).add_to(cluster_map)
        folium.CircleMarker([coordinates.loc[i].dropoff_latitude, coordinates.loc[i].dropoff_longitude], radius=3,                
                            color="red", 
                            fill_opacity=0.9
                            ).add_to(cluster_map)
    
    # Show centers
    folium.CircleMarker([centers[cluster_number,0], centers[cluster_number,1]], radius=5,                
                        color="black", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[cluster_number,2], centers[cluster_number,3]], radius=5,                
                        color="black", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    return cluster_map

In [11]:
show_cluster(0, myKMeans, coordinates, centers)

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [12]:
from sklearn import metrics

In [13]:
def cluster_var(cluster_number, train_data, coordinates, centers):
        
    # Get classification of samples
    train_predict = train_data.predict(coordinates.to_numpy()[:100000,:])

    # Filter array to only have the index equal to cluster number
    cluster_index = np.where(train_predict == cluster_number)
        
    pickup_sum = 0
    dropoff_sum = 0
        
    # Iterate over the array to have the sum of squared distance between the points and the centers
    for i in np.nditer(cluster_index):
        pickup_sum += np.linalg.norm(np.array([coordinates.loc[i].pickup_latitude, coordinates.loc[i].pickup_longitude]) - np.array([centers[cluster_number, 0],centers[cluster_number, 1]])) ** 2
        dropoff_sum += np.linalg.norm(np.array([coordinates.loc[i].dropoff_latitude, coordinates.loc[i].dropoff_longitude]) - np.array([centers[cluster_number, 2],centers[cluster_number, 3]])) ** 2
    
    # Divide by the number of elements
    intraVariance_pickup = pickup_sum/cluster_index[0].shape
    intraVariance_dropoff = dropoff_sum/cluster_index[0].shape
    intraVariance_pickup = intraVariance_pickup[0]
    intraVariance_dropoff = intraVariance_dropoff[0]
    
    # Get the Mean value between both intraVariance of the Cluster
    intraVariance = (intraVariance_pickup+intraVariance_dropoff)/2
    
    # Compute the distance between both centers (pickup center and dropoff center)
    centers_distance = np.linalg.norm(np.array([centers[cluster_number,0],centers[cluster_number,1]]-np.array([centers[cluster_number,2],centers[cluster_number,3]])))
    
    return intraVariance, centers_distance

In [14]:
intraVariance_Tot = 0
extraVariance_Tot = np.array([])

# Compute the cluster variances for every clusters
for i in range(clusters):
    variance = cluster_var(i, myKMeans, coordinates,centers)
    intraVariance_Tot += variance[0]
    extraVariance_Tot = np.append(extraVariance_Tot, variance[1])

In [15]:
print("Mean Value of intra Variance for 100 Clusters : ", intraVariance_Tot/clusters)
print("Extra Cluster Variance for 100 Clusters : ", np.var(extraVariance_Tot))

Mean Value of intra Variance for 100 Clusters :  0.0012060831789741586
Extra Cluster Variance for 100 Clusters :  0.02888373234527943


## Test for 10 Clusters

In [16]:
# Set clusters numbers on 10 and recreate KMeans
clusters=10
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [17]:
%%time
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster



Wall time: 9.16 s


KMeans(n_clusters=10, n_jobs=-1)

In [18]:
#get cluster centers
centers=myKMeans.cluster_centers_

In [19]:
# Show map of cluster
show_cluster(0, myKMeans, coordinates, centers)

In [20]:
intraVariance_Tot = 0
extraVariance_Tot = np.array([])

# Compute the cluster variances for every clusters
for i in range(clusters):
    variance = cluster_var(i, myKMeans, coordinates,centers)
    intraVariance_Tot += variance[0]
    extraVariance_Tot = np.append(extraVariance_Tot, variance[1])

In [21]:
print("Mean Value of intra Variance for 10 Clusters : ", intraVariance_Tot/clusters)
print("Extra Cluster Variance for 10 Clusters : ", np.var(extraVariance_Tot))

Mean Value of intra Variance for 10 Clusters :  0.015076266553934842
Extra Cluster Variance for 10 Clusters :  0.011351544202196544


The results shows that the Intra Variance is higher for 10 Clusters than 100 clusters.
This means that when we only have 10 clusters, these are larger.
This behaviour is normal because more data are aggregated with only 10 clusters.

Contrary, the Extra Cluster Variance is lower with 10 Clusters than 100 clusters.
This means that when we only have 10 clusters, these are closer in between than 100 clusters.
This behaviour is normal because there are less chances to have cluster with high distance between the centers.