# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [1]:
import pandas as pd
import numpy as np
import folium


In [2]:
#check if notebook runs in colab
import sys
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
path='..'
if IN_COLAB:
  #in colab, we need to clone the data from the repo
  !git clone https://github.com/keuperj/DataScienceSS20.git
  path='DataScienceSS20'

running in Colab: False


In [3]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv(path+'/DATA/train_cleaned.csv')

In [4]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [5]:
from sklearn.cluster import KMeans

In [6]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [7]:
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=100, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [8]:
#get cluster centers
centers=myKMeans.cluster_centers_
    

In [9]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [10]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [11]:
train_data = coordinates.to_numpy()[:100000,:]
cluster_predict = myKMeans.predict(train_data)
centers=myKMeans.cluster_centers_

In [12]:
print((cluster_predict.dtype))
print(train_data.shape)
print(centers.shape)

int32
(100000, 4)
(100, 4)


In [13]:
def show_cluster(cluster_number):
    mask = (cluster_predict==cluster_number)
    cluster_data = train_data[mask]
    
    cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
    
    folium.CircleMarker([centers[cluster_number,0], centers[cluster_number,1]], radius=5,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[cluster_number,2], centers[cluster_number,3]], radius=5,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
        
    for i in range(len(cluster_data)):
        folium.CircleMarker([cluster_data[i,0], cluster_data[i,1]], radius=1,                
                        color="yellow", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
        folium.CircleMarker([cluster_data[i,2], cluster_data[i,3]], radius=1,                
                        color="blue", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
                   
    return cluster_map

In [221]:
show_cluster(63)

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [274]:
clusters=10
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores
train_data = coordinates.to_numpy()[:100000,:]
myKMeans.fit(train_data)#use only subset of the data to make it faster
cluster_predict = myKMeans.predict(train_data)

In [275]:
import scipy
import matplotlib.pyplot as plt

In [276]:
def cluster_var(cluster_number):
    #Low intra cluster variance
    
    mask = (cluster_predict==cluster_number)
    cluster_data = train_data[mask]
    
    D_intra =  scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(cluster_data, metric='euclidean'))
    
    if len(cluster_data) > 1:
        intraClusterVar = 1/(len(cluster_data)-1) * (D_intra.sum()/2)
    else:
        intraClusterVar = 0
   
    D_extra =  scipy.spatial.distance.cdist(cluster_data, train_data, metric='euclidean')
    extraClusterVar = []
    for c in np.unique(cluster_predict):
        if c == cluster_number:
            continue
            
        maskJ = (cluster_predict==c)
        cluster_distance = D_extra[:, maskJ]
        sum_cluster = cluster_distance.sum()
        extraClusterVar = np.append(extraClusterVar, 1/(len(cluster_distance)) * sum_cluster )

    d = dict();  
    d['intraClusterVar'] = intraClusterVar
    d['extraClusterVar'] = extraClusterVar.min()
    d['silhouette_score'] = (extraClusterVar.min() - intraClusterVar)/ max(extraClusterVar.min(),intraClusterVar) 
    
   
    
    return d 

In [277]:
cluster_var(0)

{'intraClusterVar': 86.9643552219888,
 'extraClusterVar': 50.109175407545905,
 'silhouette_score': -0.42379639014588033}

In [88]:
import sklearn
sklearn.metrics.silhouette_score(train_data,cluster_predict, 'euclidean')

0.2152610769319668