In [16]:
#Importing Libraries
import pandas as pd

# For Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# To Scale our data
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler

# To perform KMeans clustering 
from sklearn.cluster import KMeans

# To perform Hierarchical clustering
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

In [17]:
# Load the player statistics from Cricket.csv; the first row of the file
# supplies the column names.
# NOTE: the frame is called `retail` even though it is read from
# Cricket.csv; the name is kept because later cells reference it.
retail = pd.read_csv(
    "Cricket.csv",
    sep=",",
    encoding="ISO-8859-1",
    header=0,
)


In [18]:
# Preview the first five rows of the raw data.
retail.head()

Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0
0,SR Tendulkar (INDIA),1989-2012,463,452,41,18426,200*,44.83,21367,86.23,49,96,20
1,KC Sangakkara (Asia/ICC/SL),2000-2015,404,380,41,14234,169,41.98,18048,78.86,25,93,15
2,RT Ponting (AUS/ICC),1995-2012,375,365,39,13704,164,42.03,17046,80.39,30,82,20
3,ST Jayasuriya (Asia/SL),1989-2011,445,433,18,13430,189,32.36,14725,91.2,28,68,34
4,DPMD Jayawardene (Asia/SL),1998-2015,448,418,39,12650,144,33.37,16020,78.96,19,77,28


In [19]:
# Summarise the columns: dtypes and non-null counts.
retail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 13 columns):
Player    79 non-null object
Span      79 non-null object
Mat       79 non-null int64
Inns      79 non-null int64
NO        79 non-null int64
Runs      79 non-null int64
HS        79 non-null object
Ave       79 non-null float64
BF        79 non-null int64
SR        79 non-null float64
100       79 non-null int64
50        79 non-null int64
0         79 non-null int64
dtypes: float64(2), int64(8), object(3)
memory usage: 8.1+ KB


In [20]:
# Keep the player name plus the two columns (SR, Ave) that the
# following cells scale and cluster on.
selected_columns = ["Player", "SR", "Ave"]
cric = retail.loc[:, selected_columns]
cric.head()

Unnamed: 0,Player,SR,Ave
0,SR Tendulkar (INDIA),86.23,44.83
1,KC Sangakkara (Asia/ICC/SL),78.86,41.98
2,RT Ponting (AUS/ICC),80.39,42.03
3,ST Jayasuriya (Asia/SL),91.2,32.36
4,DPMD Jayawardene (Asia/SL),78.96,33.37


In [21]:
# Standardise the numeric features (every column of `cric` except Player)
# with StandardScaler so SR and Ave are on a comparable scale before
# clustering. StandardScaler is imported with the other libraries in the
# first cell rather than mid-notebook.
#
# Result: `RFM_norm1` is the scaled NumPy array returned by fit_transform;
# `standard_scaler` keeps the fitted scaler for later reuse.
standard_scaler = StandardScaler()
RFM_norm1 = standard_scaler.fit_transform(cric.drop(columns="Player"))

In [22]:
# Wrap the scaled values in a DataFrame and label the two feature
# columns in the same step, then preview the first rows.
RFM_norm1 = pd.DataFrame(RFM_norm1, columns=["SR", "Ave"])
RFM_norm1.head()

Unnamed: 0,SR,Ave
0,0.703152,1.072294
1,-0.044139,0.587725
2,0.110997,0.596226
3,1.207091,-1.047909
4,-0.034,-0.876185


In [23]:
# KMeans with K=4 clusters on the standardised SR/Ave features.
# (The variable keeps its `model_clus5` name because later cells
# reference it, even though four clusters are fitted.)
#
# n_init is pinned to 10, the value used by the recorded run, because
# newer scikit-learn releases default to n_init='auto', which can change
# the resulting labels. random_state fixes the centroid initialisation
# so the cluster IDs are reproducible.
model_clus5 = KMeans(n_clusters=4, n_init=10, random_state=100)
model_clus5.fit(RFM_norm1)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=100, tol=0.0001, verbose=0)

In [24]:
# Analysis of the clusters formed: attach each player's KMeans label to
# the original (unscaled) Player / SR / Ave values.
#
# `reset_index(drop=True)` returns a new frame with a 0..n-1 index, so
# `cric` itself is no longer mutated by this cell, and `assign` adds the
# labels positionally as the ClusterID column (labels_ has one entry per
# row that the model was fitted on).
RFM_km = cric.reset_index(drop=True).assign(ClusterID=model_clus5.labels_)

# Show a sample rather than dumping the whole frame.
RFM_km.head(10)

Unnamed: 0,Player,SR,Ave,ClusterID
0,SR Tendulkar (INDIA),86.23,44.83,3
1,KC Sangakkara (Asia/ICC/SL),78.86,41.98,1
2,RT Ponting (AUS/ICC),80.39,42.03,1
3,ST Jayasuriya (Asia/SL),91.20,32.36,0
4,DPMD Jayawardene (Asia/SL),78.96,33.37,2
5,Inzamam-ul-Haq (Asia/PAK),74.24,39.52,1
6,JH Kallis (Afr/ICC/SA),72.89,44.36,1
7,SC Ganguly (Asia/INDIA),73.70,41.02,1
8,R Dravid (Asia/ICC/INDIA),71.24,39.16,1
9,BC Lara (ICC/WI),79.51,40.48,1


In [25]:
# Per-cluster mean of SR and Ave, computed in a single groupby.
cluster_means = RFM_km.groupby("ClusterID")[["SR", "Ave"]].mean()

# Single-column frames kept under their original names for any later use.
km_clusters_SR = cluster_means[["SR"]]
km_clusters_Ave = cluster_means[["Ave"]]

# Summary table: one row per cluster. The ClusterID column comes from the
# groupby keys instead of a hard-coded [0, 1, 2, 3], so it stays correct
# if the number of clusters changes.
df = cluster_means.rename(
    columns={"SR": "SR_mean", "Ave": "Ave.mean"}
).reset_index()
df.head()


Unnamed: 0,ClusterID,SR_mean,Ave.mean
0,0,94.175833,35.409167
1,1,75.751852,41.484444
2,2,74.014839,33.949355
3,3,88.273333,49.546667


In [26]:
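# Cluster profiles, read off the per-cluster means above, and the ClusterID each one maps to:
# (High SR, High Ave) - A, (Low SR, Low Ave) - B, (High SR, Low Ave) - C, (Low SR, High Ave) - D
#
# A - 3
# B - 2
# C - 0
# D - 1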


SyntaxError: invalid syntax (<ipython-input-26-57211c7add9a>, line 1)

In [30]:
# Split the labelled players into one frame per cluster (ID0 .. ID3),
# then display cluster 3.
ID0, ID1, ID2, ID3 = (
    RFM_km.loc[RFM_km["ClusterID"] == cluster_id] for cluster_id in range(4)
)
ID3

Unnamed: 0,Player,SR,Ave,ClusterID
0,SR Tendulkar (INDIA),86.23,44.83,3
13,MS Dhoni (Asia/INDIA),88.69,51.32,3
15,AB de Villiers (Afr/SA),100.25,53.55,3
25,V Kohli (INDIA),90.99,53.94,3
34,HM Amla (SA),89.05,50.25,3
38,MG Bevan (AUS),74.16,53.58,3
42,IVA Richards (WI),90.2,47.0,3
63,MJ Guptill (NZ),87.73,43.3,3
64,MEK Hussey (AUS),87.16,48.15,3


In [31]:
# Players assigned to cluster 2 (profile B in the interpretation cell: low SR, low Ave).
ID2

Unnamed: 0,Player,SR,Ave,ClusterID
4,DPMD Jayawardene (Asia/SL),78.96,33.37,2
14,M Azharuddin (INDIA),74.02,36.92,2
16,PA de Silva (SL),81.13,34.9,2
22,MS Atapattu (SL),67.72,37.57,2
26,HH Gibbs (SA),83.26,36.13,2
28,SP Fleming (ICC/NZ),71.49,32.4,2
30,SR Waugh (AUS),75.91,32.9,2
31,A Ranatunga (SL),77.9,35.84,2
33,Younis Khan (PAK),75.29,31.24,2
35,Saleem Malik (PAK),76.41,32.88,2


In [32]:
# Players assigned to cluster 1 (profile D in the interpretation cell: low SR, high Ave).
ID1

Unnamed: 0,Player,SR,Ave,ClusterID
1,KC Sangakkara (Asia/ICC/SL),78.86,41.98,1
2,RT Ponting (AUS/ICC),80.39,42.03,1
5,Inzamam-ul-Haq (Asia/PAK),74.24,39.52,1
6,JH Kallis (Afr/ICC/SA),72.89,44.36,1
7,SC Ganguly (Asia/INDIA),73.7,41.02,1
8,R Dravid (Asia/ICC/INDIA),71.24,39.16,1
9,BC Lara (ICC/WI),79.51,40.48,1
11,Mohammad Yousuf (Asia/PAK),75.1,41.71,1
18,Saeed Anwar (PAK),80.67,39.21,1
19,S Chanderpaul (WI),70.74,41.6,1


In [33]:
# Players assigned to cluster 0 (profile C in the interpretation cell: high SR, low Ave).
ID0

Unnamed: 0,Player,SR,Ave,ClusterID
3,ST Jayasuriya (Asia/SL),91.2,32.36,0
10,TM Dilshan (SL),86.23,39.27,0
12,AC Gilchrist (AUS/ICC),96.94,35.89,0
17,CH Gayle (ICC/WI),85.11,37.33,0
20,Yuvraj Singh (Asia/INDIA),87.67,36.55,0
24,V Sehwag (Asia/ICC/INDIA),104.33,35.05,0
27,Shahid Afridi (Asia/ICC/PAK),117.0,23.57,0
50,BB McCullum (NZ),96.37,30.41,0
57,EJG Morgan (ENG/IRE),88.62,38.73,0
59,SR Watson (AUS),90.44,40.54,0
