### THE BOSTON MARATHON

### Clustering Challenge

In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import MiniBatchKMeans
%matplotlib inline

In [2]:
data = pd.read_csv('./results.csv')

In [3]:
pd.set_option('display.max_columns', 16164)
data.head()

Unnamed: 0,25k,age,name,division,10k,gender,half,official,bib,ctz,country,overall,pace,state,30k,5k,genderdiv,20k,35k,city,40k
0,49.87,28,"Cassidy, Josh R.",9,18.18,M,40.93,90.9,W1,,CAN,9,3.47,ON,62.07,8.9,9,38.8,74.73,Toronto,85.55
1,77.27,30,"Korir, Wesley",5,30.9,M,64.9,132.5,1,,KEN,5,5.07,,92.97,15.9,5,61.52,108.78,Kenya,124.77
2,77.23,23,"Desisa, Lelisa",1,30.9,M,64.92,130.37,2,,ETH,1,4.98,,92.72,15.93,1,61.53,108.68,Ambo,123.78
3,50.5,32,"Fearnley, Kurt H.",5,18.73,M,42.0,88.43,W2,,AUS,5,3.38,,61.35,8.98,5,39.88,73.0,Hamilton,83.43
4,48.75,39,"Hokinoue, Kota",3,18.18,M,40.57,87.22,W3,,JPN,3,3.33,,59.92,8.92,3,38.55,71.68,Iizuka,81.88


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16164 entries, 0 to 16163
Data columns (total 21 columns):
25k          16164 non-null object
age          16164 non-null int64
name         16164 non-null object
division     16164 non-null int64
10k          16164 non-null object
gender       16164 non-null object
half         16164 non-null object
official     16164 non-null float64
bib          16164 non-null object
ctz          757 non-null object
country      16164 non-null object
overall      16164 non-null int64
pace         16164 non-null float64
state        14701 non-null object
30k          16164 non-null object
5k           16164 non-null object
genderdiv    16164 non-null int64
20k          16164 non-null object
35k          16164 non-null object
city         16163 non-null object
40k          16164 non-null object
dtypes: float64(2), int64(4), object(15)
memory usage: 2.6+ MB


In [25]:
pd.set_option('display.max_rows', 16164)
data[['ctz', 'country', 'state']].head()

Unnamed: 0,ctz,country,state
0,,CAN,ON
1,,KEN,Not USA or canada
2,,ETH,Not USA or canada
3,,AUS,Not USA or canada
4,,JPN,Not USA or canada


There is no clear pattern of missiness between 'ctz' and the other logically related features such as 'country' and state.But considering the large number of missing value in 'ctz',it is better to remove that feature.Looking in to the 'state' and 'country' features we can understand that the 'state' has values when country is either USA or Canada but it is 'NaN' when it is otherwise.therefore i decided to fill new values to the 'NaN's as 'Not USA or Canada'.Lets Fill NaNs on the states attribute with some use full information than droping it.The data points in our data set consists of over 90% of the runners are from the USA and Canada.Only around 9% are from outside USA and Canada.

In [6]:
data['state'].fillna('Not USA or canada', inplace = True)

Since the 'ctz' attribute has large number of NaN values lets drop it.

In [7]:
new_data = data.drop('ctz', axis = 1)

In [8]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16164 entries, 0 to 16163
Data columns (total 20 columns):
25k          16164 non-null object
age          16164 non-null int64
name         16164 non-null object
division     16164 non-null int64
10k          16164 non-null object
gender       16164 non-null object
half         16164 non-null object
official     16164 non-null float64
bib          16164 non-null object
country      16164 non-null object
overall      16164 non-null int64
pace         16164 non-null float64
state        16164 non-null object
30k          16164 non-null object
5k           16164 non-null object
genderdiv    16164 non-null int64
20k          16164 non-null object
35k          16164 non-null object
city         16163 non-null object
40k          16164 non-null object
dtypes: float64(2), int64(4), object(14)
memory usage: 2.5+ MB


The 'city' variable has only one missing value and removing that row will not have a significant information loss.

In [9]:
new_data[new_data['city'].isnull()].index

Int64Index([10138], dtype='int64')

In [10]:
new_data = new_data.drop(10138, axis = 0)

In [11]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16163 entries, 0 to 16163
Data columns (total 20 columns):
25k          16163 non-null object
age          16163 non-null int64
name         16163 non-null object
division     16163 non-null int64
10k          16163 non-null object
gender       16163 non-null object
half         16163 non-null object
official     16163 non-null float64
bib          16163 non-null object
country      16163 non-null object
overall      16163 non-null int64
pace         16163 non-null float64
state        16163 non-null object
30k          16163 non-null object
5k           16163 non-null object
genderdiv    16163 non-null int64
20k          16163 non-null object
35k          16163 non-null object
city         16163 non-null object
40k          16163 non-null object
dtypes: float64(2), int64(4), object(14)
memory usage: 2.6+ MB


In [12]:
new_data.head(40)

Unnamed: 0,25k,age,name,division,10k,gender,half,official,bib,country,overall,pace,state,30k,5k,genderdiv,20k,35k,city,40k
0,49.87,28,"Cassidy, Josh R.",9,18.18,M,40.93,90.9,W1,CAN,9,3.47,ON,62.07,8.9,9,38.8,74.73,Toronto,85.55
1,77.27,30,"Korir, Wesley",5,30.9,M,64.9,132.5,1,KEN,5,5.07,Not USA or canada,92.97,15.9,5,61.52,108.78,Kenya,124.77
2,77.23,23,"Desisa, Lelisa",1,30.9,M,64.92,130.37,2,ETH,1,4.98,Not USA or canada,92.72,15.93,1,61.53,108.68,Ambo,123.78
3,50.5,32,"Fearnley, Kurt H.",5,18.73,M,42.0,88.43,W2,AUS,5,3.38,Not USA or canada,61.35,8.98,5,39.88,73.0,Hamilton,83.43
4,48.75,39,"Hokinoue, Kota",3,18.18,M,40.57,87.22,W3,JPN,3,3.33,Not USA or canada,59.92,8.92,3,38.55,71.68,Iizuka,81.88
5,77.25,28,"Gebremariam, Gebregziabher",3,30.88,M,64.92,130.47,3,ETH,3,4.98,Not USA or canada,92.72,15.93,3,61.53,108.7,Tigray,123.78
6,50.5,42,"Soejima, Masazumi",6,18.73,M,41.98,90.02,W4,JPN,6,3.43,Not USA or canada,61.63,8.98,6,39.87,74.0,Fukuoka,84.67
7,77.23,28,"Geneti, Markos",6,30.88,M,64.9,132.73,4,ETH,6,5.07,Not USA or canada,92.93,15.9,6,61.52,109.07,Addis Ababa,125.2
8,52.62,49,"Schabort, Krige",11,19.77,M,43.97,91.78,W5,USA,11,3.5,GA,64.0,9.38,11,41.77,75.95,Cedartown,86.48
9,77.25,32,"Merga, Deriba",25,30.9,M,64.92,141.67,5,ETH,27,5.42,Not USA or canada,92.77,15.93,27,61.53,111.13,Addis Ababa,131.98


In [13]:
new_data.nunique()

25k           3245
age             61
name         16136
division      3698
10k           1459
gender           2
half          2761
official      5370
bib          16156
country         68
overall      16106
pace           362
state           66
30k           3862
5k             808
genderdiv     9811
20k           2644
35k           4539
city          4416
40k           5136
dtype: int64

from the data info,we can see that majority of the attributes are categorical(Objects).In real terms however that is not the case.Therefore we need to change the type to numerical in order to perform proper analysis.But lets remove first the '-' character from the data.

In [14]:
Columns = ['25k', '10k', 'half', '30k', '5k', '20k', '35k', '40k', 'age']
for col in Columns:
    numeric_data = new_data.drop(new_data[new_data[col] == '-'].index)
numeric_data = numeric_data[Columns].apply(pd.to_numeric, errors = 'coerce') 
numeric_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16163 entries, 0 to 16163
Data columns (total 9 columns):
25k     16140 non-null float64
10k     16137 non-null float64
half    16147 non-null float64
30k     16146 non-null float64
5k      16097 non-null float64
20k     16146 non-null float64
35k     16142 non-null float64
40k     16146 non-null float64
age     16163 non-null int64
dtypes: float64(8), int64(1)
memory usage: 1.2 MB


  result = method(y)


Now lets fill NaN values

In [16]:
for feature in numeric_data.columns:
    numeric_data[feature] = numeric_data.groupby(['age'], sort=False)[feature].apply(lambda x: x.fillna(x.mean()))

In [17]:
dummied_cols = ['country','state','gender']
for column in dummied_cols:
    numeric_data = pd.concat([numeric_data, (pd.get_dummies(new_data[column]))], axis = 1)

Lets concat the other numerical coulumns;

In [18]:
other_cols = new_data[['division', 'official','pace', 'overall', 'genderdiv', 'age']]
numeric_data = pd.concat([numeric_data,other_cols], axis = 1)
numeric_data.head()

Unnamed: 0,25k,10k,half,30k,5k,20k,35k,40k,age,ARG,ARU,AUS,AUT,BAH,BEL,BER,BOL,BRA,CAN,CAY,CHI,CHN,COL,CRC,CZE,DEN,DOM,ECU,ESA,ESP,EST,ETH,FIN,FRA,GBR,GER,GRE,GUA,HKG,HUN,IND,IRL,ISL,ISR,ITA,JPN,KEN,KOR,LTU,MAS,MEX,NED,NOR,NZL,PAN,PER,PHI,POL,POR,QAT,RSA,RUS,SIN,SLO,SUI,SVK,SWE,TCA,TRI,TUR,TWN,UAE,UKR,URU,USA,VEN,VGB,AB,AE,AK,AL,AP,AR,AZ,BC,CA,CO,CT,DC,DE,FL,GA,HI,IA,ID,IL,IN,KS,KY,LA,MA,MB,MD,ME,MI,MN,MO,MS,MT,NB,NC,ND,NE,NH,NJ,NL,NM,NS,NV,NY,Not USA or canada,OH,OK,ON,OR,PA,PE,PR,QC,RI,SC,SD,SK,TN,TX,UT,VA,VT,WA,WI,WV,WY,YT,F,M,division,official,pace,overall,genderdiv,age.1
0,49.87,18.18,40.93,62.07,8.9,38.8,74.73,85.55,28,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,9,90.9,3.47,9,9,28
1,77.27,30.9,64.9,92.97,15.9,61.52,108.78,124.77,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,5,132.5,5.07,5,5,30
2,77.23,30.9,64.92,92.72,15.93,61.53,108.68,123.78,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,130.37,4.98,1,1,23
3,50.5,18.73,42.0,61.35,8.98,39.88,73.0,83.43,32,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,5,88.43,3.38,5,5,32
4,48.75,18.18,40.57,59.92,8.92,38.55,71.68,81.88,39,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,87.22,3.33,3,3,39


In [19]:
len(data.columns)

21

In [20]:
len(numeric_data.columns)

151

Our new data format has a dimension of 149.But originally it had only 21.

Our final data has high dimension and it is quite meaningless to apply dimension reduction techniques such as PCA because the numerical features are few.Therefore to use the data as is, it is better to use 'minibatchkmeans'.First thing first lets normalize our data to avoid scalling bias.

In [21]:
from sklearn.preprocessing import normalize
X_norm = normalize(numeric_data)

In [29]:
# Each batch will be made up of 200 data points.
minibatchkmeans = MiniBatchKMeans(
    init='random',
    n_clusters=3,
    batch_size=200)
minibatchkmeans.fit(numeric_data)

# Predict new MiniBatch cluster memberships.
predict_mini = minibatchkmeans.predict(numeric_data)

When it comes to Evaluating our model,unfortunately we dont have ground truth or an outcome real data to compare our model to.Therefore the best way to evaluate our model is using Similarity silhouette coefficient.Lets apply that metric to our model and interpret its performance.But first lets divide our data into 4 samples and then compare the consistency of our models performance over those 4 sample.

In [30]:
#To help us in the split process
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_norm)

from sklearn.model_selection import train_test_split
X_half1, X_half2, X_pcahalf1, X_pcahalf2 = train_test_split(
    X_norm,
    X_pca,
    test_size=0.5,
    random_state=42)

# Then we halve the halves.
X1, X2, X_pca1, X_pca2 = train_test_split(
    X_half1,
    X_pcahalf1,
    test_size=0.5,
    random_state=42)
X3, X4, X_pca3, X_pca4 = train_test_split(
    X_half2,
    X_pcahalf2,
    test_size=0.5,
    random_state=42)


In [31]:
from sklearn import metrics
from sklearn.metrics import pairwise_distances

for sample in [X1, X2, X3, X4]:
    model = minibatchkmeans.fit(sample)
    labels = model.labels_
    print(metrics.silhouette_score(sample, labels, metric='euclidean'))

0.5948965726028217
0.6000312063511382
0.5985404370619456
0.5943894311406339


From the Similiarity metrics in the above,We can see that the clustering is consistent across the four samples.This shows that our model's performance is really good.But when the number of clusters is greater than 4 the performance consistency starts to cramble.

In [33]:
# Each batch will be made up of 200 data points.
minibatchkmeans = MiniBatchKMeans(
    init='random',
    n_clusters=4,
    batch_size=200)
minibatchkmeans.fit(numeric_data)

# Predict new MiniBatch cluster memberships.
predict_mini = minibatchkmeans.predict(numeric_data)

In [34]:
#To help us in the split process
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_norm)

from sklearn.model_selection import train_test_split
X_half1, X_half2, X_pcahalf1, X_pcahalf2 = train_test_split(
    X_norm,
    X_pca,
    test_size=0.5,
    random_state=42)

# Then we halve the halves.
X1, X2, X_pca1, X_pca2 = train_test_split(
    X_half1,
    X_pcahalf1,
    test_size=0.5,
    random_state=42)
X3, X4, X_pca3, X_pca4 = train_test_split(
    X_half2,
    X_pcahalf2,
    test_size=0.5,
    random_state=42)

In [35]:
from sklearn import metrics
from sklearn.metrics import pairwise_distances

for sample in [X1, X2, X3, X4]:
    model = minibatchkmeans.fit(sample)
    labels = model.labels_
    print(metrics.silhouette_score(sample, labels, metric='euclidean'))

0.6313919990960939
0.6326635182941471
0.4585951357770301
0.6261376726213478
