# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [2]:

!pip install folium

    

Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 4.9 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [91]:
import folium
import pandas as pd
import numpy as np
from itertools import combinations

In [4]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [5]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [6]:
from sklearn.cluster import KMeans

In [7]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [8]:
%%time
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster

CPU times: user 48.2 ms, sys: 43.2 ms, total: 91.4 ms
Wall time: 44.3 s


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=100, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [9]:
#get cluster centers
centers=myKMeans.cluster_centers_

In [10]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [11]:
#cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [13]:
def show_cluster(cluster_number,train_data):

    cluster_affiliation = myKMeans.predict(train_data)
    coords_of_cluster = coordinates[cluster_affiliation==cluster_number].to_numpy()

    cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
    for i in range(len(coords_of_cluster)):
        folium.CircleMarker([coords_of_cluster[i,0], coords_of_cluster[i,1]], radius=3,                
                            color="green", 
                            fill_opacity=0.9
                           ).add_to(cluster_map)
        folium.CircleMarker([coords_of_cluster[i,2], coords_of_cluster[i,3]], radius=3,                
                            color="red", 
                            fill_opacity=0.9
                           ).add_to(cluster_map)
    #folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)
    folium.CircleMarker([centers[cluster_number,0], centers[cluster_number,1]], radius=10,                
                        color="black", 
                        fill_opacity=0.9,
                        tooltip='pick up'
                        ).add_to(cluster_map)
    folium.CircleMarker([centers[cluster_number,2], centers[cluster_number,3]], radius=10,                
                        color="black", 
                        fill_opacity=0.9,
                        tooltip='drop off'
                        ).add_to(cluster_map)
    folium.PolyLine([ [centers[cluster_number,0],centers[cluster_number,1]] , [centers[cluster_number,2],centers[cluster_number,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)
    return cluster_map

In [15]:
map = show_cluster(1,coordinates)
#map


## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [186]:
def UseKMeans(k,train_data):
    
    myKMeans=KMeans(n_clusters=k, n_jobs=-1)
    myKMeans.fit(train_data.to_numpy()[:100000,:])
    centers=myKMeans.cluster_centers_
    cluster_affiliation = myKMeans.predict(train_data)
    return centers,cluster_affiliation

In [224]:
def IntraVar(center,cluster_points):
    dist = np.linalg.norm(cluster_points-center,axis=1)
    sum_dist = np.sum(dist)
    number = len(cluster_points)-1 if len(cluster_points)-1 > 0 else 1
    IVar = sum_dist/(number)
    return IVar

In [188]:
# berechnet die Extra Varainz durch Vraianzberechnung zwischen den Clustercenter
def ExtraVar(centers):
    comb = combinations(range(len(centers)),2)
    dist = [np.linalg.norm(centers[pair[0]]-centers[pair[1]]) for pair in comb]
    sum_dist = np.sum(dist)
    EVar = sum_dist/(len(centers)-1)
    #print("die externe Varianz beträgt",EVar)
    return EVar

In [189]:
def cluster_var(cluster_number, centers, cluster_affiliation, coordinates):
    IVar = IntraVar(centers[cluster_number],coordinates[cluster_affiliation == cluster_number])
    EVar = ExtraVar(centers)
    return IVar,EVar

In [199]:
IVar,EVar = cluster_var(2, centers, cluster_affiliation, coordinates)
print("Intra Varianz = ",IVar, "\nExtra Varianz = ", EVar)

Intra Varianz =  0.09957918139816364 
Extra Varianz =  1.8247962514721128


In [205]:
#to compare all intra variances of all clusters, the mean of all is taken.
def analyze_k(k,coordinates):
    centers,cluster_affiliation = UseKMeans(k,coordinates)
    IVars=[IntraVar(centers[cluster_number],coordinates[cluster_affiliation == cluster_number]) for cluster_number in range(k)]
    print("mean of all Intravariances of all Clusters:",np.mean(IVars),"\n Extravariance: ", ExtraVar(centers))


In [206]:
analyze_k(10,coordinates)

mean of all Intravariances of all Clusters: 0.08864740421422548 
 Extravariance:  1.825684001952676


In [208]:
analyze_k(20,coordinates)

mean of all Intravariances of all Clusters: 0.056051711455497054 
 Extravariance:  4.0956105899856885


In [209]:
analyze_k(50,coordinates)

mean of all Intravariances of all Clusters: 0.05379471017748805 
 Extravariance:  9.335844368300728


In [226]:
analyze_k(100,coordinates)

mean of all Intravariances of all Clusters: 0.03876792303643225 
 Extravariance:  16.134077158214325


***Intra cluster variance is growing and extra cluster variance is decreasing with higher k. But a higher k is also less generalizing the results.***