# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [0]:
import pandas as pd
import numpy as np
import folium


In [0]:
#check if notebook runs in colab
import sys
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
path='..'
if IN_COLAB:
  #in colab, we need to clone the data from the repo
  !git clone https://github.com/keuperj/DataScienceSS20.git
  path='DataScienceSS20'

running in Colab: True
Cloning into 'DataScienceSS20'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 450 (delta 1), reused 2 (delta 0), pack-reused 444[K
Receiving objects: 100% (450/450), 99.60 MiB | 23.24 MiB/s, done.
Resolving deltas: 100% (173/173), done.
Checking out files: 100% (186/186), done.


In [0]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv(path+'/DATA/train_cleaned.csv')

In [0]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [0]:
from sklearn.cluster import KMeans

In [0]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [0]:
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=100, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [0]:
#get cluster centers
centers=myKMeans.cluster_centers_
    

In [0]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [0]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [0]:
from copy import deepcopy

df = deepcopy(pd.DataFrame(coordinates)) 
a = myKMeans.predict(coordinates)  
df['center'] = a
df

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,center
0,40.721319,-73.844311,40.712278,-73.841610,96
1,40.711303,-74.016048,40.782004,-73.979268,49
2,40.761270,-73.982738,40.750562,-73.991242,83
3,40.733143,-73.987130,40.758092,-73.991567,65
4,40.768008,-73.968095,40.783762,-73.956655,41
...,...,...,...,...,...
399995,40.746032,-73.986585,40.724077,-73.990865,52
399996,40.742359,-73.992882,40.762318,-73.972649,11
399997,40.731558,-73.985598,40.728738,-73.987657,55
399998,40.740735,-74.007692,40.722847,-73.988455,75


In [0]:
def show_cluster(cluster_number):
  cluster_map2 = folium.Map(location = [40.730610,-73.935242],zoom_start = 12)

  filterd = df.loc[df['center'] == cluster_number]
  for i in range(len(filterd)):
    point = filterd.iloc[i]
    folium.CircleMarker([point[0], point[1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                      ).add_to(cluster_map2)
    folium.CircleMarker([point[2], point[3]], radius=3,                
                      color="red", 
                      fill_opacity=0.9
                      ).add_to(cluster_map2)
  return cluster_map2

In [0]:
clu=show_cluster(99)
clu

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [0]:
def cluster_var(cluster_number):
  points = df.loc[df['center'] == cluster_number]
  between_min=1000
  start_mean=0
  end_mean=0
  start_dist=0
  end_dist=0
  between_dist=0
  for i in range(len(points)):
    p0 = points.iloc[i]
    
    between_dist = between_dist + between_distance(p0,p0)
   
    j=i+1
  
    for j in range(len(points)):
      p1= points.iloc[j]
      start_dist = start_dist + start_distance(p0,p1)
      end_dist = end_dist + end_distance(p0,p1)
      between_dist = between_dist + between_distance(p0,p1)

    between_temp_min=1/(len(points))*between_dist

    if(between_min>between_temp_min):
      between_min=between_temp_min

    start_mean=start_mean + start_dist*(1/(len(points)-1))
    end_mean=end_mean + end_dist*(1/(len(points)-1))

  start_intra = start_mean/len(points)
  end_intra = end_mean/len(points)


  print('Start Intra: ', start_intra)
  print('End Intra: ', end_intra)
  print('Extra Cluster: ', between_min)

def start_distance(p0,p1):
  return (p0[0] - p1[0])**2 + (p0[1] - p1[1])**2
def end_distance(p0,p1):
  return (p0[2] - p1[2])**2 + (p0[3] - p1[3])**2
def between_distance(p0,p1):
  return (p0[0] - p1[2])**2 + (p0[1] - p1[3])**2


cluster_var(99)

Start Intra:  0.39891791837580126
End Intra:  0.2511423225264771
Extra Cluster:  0.005495381580463769


In [0]:
#Solution 

def show_var(predictions, k):
    #get coordinates of the target cluster
    #coordinates were all coordinate collumns from the train data
    np_coord = coordinates.to_numpy()
    
    #compute min and max coordiantes for normalization
    min_cor = np.min(np_coord, axis=0)
    max_cor = np.max(np_coord, axis=0)
    dist = max_cor - min_cor
    print ("max distances:", dist)
    for i in range(k):
        cluster_coord = np_coord[predictions==i]
    
        #compute variances
        var_all = np.var(cluster_coord, axis=0)
        print(i, var_all)

In [0]:
show_var(predictions,100)