# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [1]:
import pandas as pd
import numpy as np
import folium
from typing import Tuple


In [2]:
#check if notebook runs in colab
import sys
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
path='..'
if IN_COLAB:
  #in colab, we need to clone the data from the repo
  !git clone https://github.com/keuperj/DataScienceSS20.git
  path='DataScienceSS20'

running in Colab: False


In [3]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv(path+'/DATA/train_cleaned.csv')

In [4]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]
coordinates

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,40.721319,-73.844311,40.712278,-73.841610
1,40.711303,-74.016048,40.782004,-73.979268
2,40.761270,-73.982738,40.750562,-73.991242
3,40.733143,-73.987130,40.758092,-73.991567
4,40.768008,-73.968095,40.783762,-73.956655
...,...,...,...,...
399995,40.746032,-73.986585,40.724077,-73.990865
399996,40.742359,-73.992882,40.762318,-73.972649
399997,40.731558,-73.985598,40.728738,-73.987657
399998,40.740735,-74.007692,40.722847,-73.988455


## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [5]:
from sklearn.cluster import KMeans

In [6]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [7]:
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=100, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [8]:
#get cluster centers
centers=myKMeans.cluster_centers_
    

In [9]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [13]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [13]:
test =  pd.read_csv('../DATA/NY_taxi_test.csv')
test = test[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]


In [28]:
#show_cluster

def get_clusters(cluster_map: pd.DataFrame, X_test: pd.DataFrame, n_clusters: int) -> Tuple[pd.DataFrame, pd.DataFrame]:

    clustering = KMeans(n_clusters=n_clusters, random_state=8675309,n_jobs=-1)
    clustering.fit(cluster_map)
    # apply the labels
    train_labels = clustering.labels_
    X_train_clstrs = cluster_map.copy()
    X_train_clstrs['clusters'] = train_labels
    
    centers=clustering.cluster_centers_
    
#     # predict labels on the test set
#     test_labels = clustering.predict(X_test)
#     X_test_clstrs = X_test.copy()
#     X_test_clstrs['clusters'] = test_labels
    
    cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
    
    for i in range(100):
        folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

        folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                            color="green", 
                            fill_opacity=0.9
                           ).add_to(cluster_map)
        folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                            color="red", 
                            fill_opacity=0.9
                           ).add_to(cluster_map)
    
    return X_train_clstrs, X_test_clstrs

In [29]:
cluster_map_clstrs, X_test_clstrs = get_clusters(coordinates, test, 53)

IndexError: index 53 is out of bounds for axis 0 with size 53

In [28]:
from sklearn.preprocessing import StandardScaler
def scale_features(cluster_map: pd.DataFrame, X_test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:

    scaler = StandardScaler()
    to_scale = [col for col in cluster_map.columns.values]
    scaler.fit(cluster_map[to_scale])
    cluster_map[to_scale] = scaler.transform(cluster_map[to_scale])
    
    # predict z-scores on the test set
    X_test[to_scale] = scaler.transform(X_test[to_scale])
    
    return cluster_map, X_test

In [29]:
cluster_map_scaled, X_test_scaled = scale_features(cluster_map_clstrs, X_test_clstrs)

In [30]:
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([X_test_clstrs[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([X_test_clstrs[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [X_test_clstrs[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

KeyError: (0, 0)

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.