In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the player table. Judging by the file name the rows with missing
# values were presumably removed upstream -- not verified here.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a saved
# index; reading with index_col=0 would avoid carrying it around -- confirm
# before changing, since it alters the column count seen by later cells.
players_csv_path = './Data/players_no_nan.csv'
data = pd.read_csv(players_csv_path)
# Preview the first ten rows to check that the columns parsed as expected.
data.head(n=10)

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,skin_colour,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards
0,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,0.125,654,247,179,228,9,19,0,0
1,aaron-hunt,Aaron Hunt,Werder Bremen,Germany,04.09.1986,183.0,73.0,Attacking Midfielder,0.125,336,141,73,122,62,42,0,1
2,aaron-lennon,Aaron Lennon,Tottenham Hotspur,England,16.04.1987,165.0,63.0,Right Midfielder,0.25,412,200,97,115,31,11,0,0
3,aaron-ramsey,Aaron Ramsey,Arsenal FC,England,26.12.1990,178.0,76.0,Center Midfielder,0.0,260,150,42,68,39,31,0,1
4,abdelhamid-el-kaoutari,Abdelhamid El-Kaoutari,Montpellier HSC,France,17.03.1990,180.0,73.0,Center Back,0.25,124,41,40,43,1,8,4,2
5,abdou-traore_2,Abdou Traoré,Girondins Bordeaux,France,17.01.1988,180.0,74.0,Right Midfielder,0.75,97,41,23,33,3,11,1,0
6,abdoulaye-diallo_2,Abdoulaye Diallo,Stade Rennes,France,30.03.1992,189.0,80.0,Goalkeeper,0.875,24,8,8,8,0,0,0,0
7,abdoulaye-keita_2,Abdoulaye Keita,Girondins Bordeaux,France,19.08.1990,188.0,83.0,Goalkeeper,0.875,3,0,1,2,0,0,0,0
8,abdoulwhaid-sissoko,Abdoulwhaid Sissoko,Stade Brest,France,20.03.1990,180.0,68.0,Defensive Midfielder,1.0,121,34,25,62,3,21,0,2
9,abdul-rahman-baba,Abdul Rahman Baba,SpVgg Greuther Fürth,Germany,02.07.1994,179.0,70.0,Left Fullback,0.875,50,17,8,25,0,3,0,1


In [20]:
# List the distinct league countries that occur in the table.
league_countries = data['leagueCountry'].unique()
print(league_countries)

['England' 'Germany' 'France' 'Spain']


In [32]:
# One-hot encode the league country into four 0/1 indicator columns.
# A vectorised comparison replaces four copy-pasted per-row list
# comprehensions; column names, insertion order and 0/1 integer values are
# the same as before.
# NOTE(review): the saved output of this cell also lists a 'frensh' column
# that no visible cell creates -- presumably left over from an earlier run
# with a typo; restart the kernel and re-run to get a clean frame.
country_flag_columns = (
    ('english', 'England'),
    ('german', 'Germany'),
    ('french', 'France'),
    ('spanish', 'Spain'),
)
for flag_column, country in country_flag_columns:
    data[flag_column] = (data['leagueCountry'] == country).astype(int)
data.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,skin_colour,games,...,goals,yellowCards,yellowReds,redCards,german,frensh,spanish,english,birthYear,french
0,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,0.125,654,...,9,19,0,0,0,0,0,1,1979,0
1,aaron-hunt,Aaron Hunt,Werder Bremen,Germany,04.09.1986,183.0,73.0,Attacking Midfielder,0.125,336,...,62,42,0,1,1,0,0,0,1986,0
2,aaron-lennon,Aaron Lennon,Tottenham Hotspur,England,16.04.1987,165.0,63.0,Right Midfielder,0.25,412,...,31,11,0,0,0,0,0,1,1987,0
3,aaron-ramsey,Aaron Ramsey,Arsenal FC,England,26.12.1990,178.0,76.0,Center Midfielder,0.0,260,...,39,31,0,1,0,0,0,1,1990,0
4,abdelhamid-el-kaoutari,Abdelhamid El-Kaoutari,Montpellier HSC,France,17.03.1990,180.0,73.0,Center Back,0.25,124,...,1,8,4,2,0,1,0,0,1990,1


In [26]:
# Derive the birth year from the 'DD.MM.YYYY' birthday strings: the year is
# the last four characters. The .str accessor does the slicing column-wise
# instead of looping over rows in Python.
data['birthYear'] = data['birthday'].str[-4:].astype(int)
# Sanity check that the new column is numeric: adding two entries must add
# the numbers rather than concatenate strings.
# (A bare `data.head()` used to sit before this line; it was not the last
# expression of the cell, so its result was discarded and it is removed.)
print(data['birthYear'][0] + data['birthYear'][1])

3965


In [27]:
# Show every distinct playing position that occurs in the table.
positions = data['position'].unique()
positions

array(['Center Back', 'Attacking Midfielder', 'Right Midfielder',
       'Center Midfielder', 'Goalkeeper', 'Defensive Midfielder',
       'Left Fullback', 'Left Midfielder', 'Right Fullback',
       'Center Forward', 'Left Winger', 'Right Winger'], dtype=object)

In [59]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Candidate features, ordered by decreasing relevance.
featuresColumns = [
    'weight', 'height',
    'french', 'english', 'german', 'spanish',
    'redCards', 'yellowCards', 'birthYear',
    'games', 'victories', 'defeats', 'ties', 'goals',
]
X = data[featuresColumns]

# NOTE(review): the features are fed to KMeans unscaled, so the Euclidean
# distance is presumably dominated by the large-magnitude count columns
# (e.g. 'games') rather than the 0/1 country flags -- consider standardising
# the features and confirm whether that is intended.
#
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4, so relying on the default gives different clusterings
# (and a FutureWarning on 1.2/1.3) depending on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
print(kmeans.labels_[0:20])

# Keep the cluster assignment next to each player for later inspection.
data['prediction'] = kmeans.labels_

# Silhouette score of the two-cluster partition on the same feature matrix.
print(silhouette_score(X, kmeans.labels_))

[1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
0.538702020458


In [67]:
print(data.shape)

# Mean skin-colour rating of the players in each of the two clusters.
cluster_ids = kmeans.labels_
clust0 = data.loc[cluster_ids == 0]
print(clust0['skin_colour'].mean())

clust1 = data.loc[cluster_ids == 1]
print(clust1['skin_colour'].mean())

# Agreement between cluster label and skin-colour rating: a player counts as
# "correct" (1) when the label is strictly less than 0.5 away from the
# rating, otherwise 0. A mean close to 1 means cluster 1 ~ skin_colour 1 and
# cluster 0 ~ skin_colour 0; a mean close to 0 means the opposite mapping.
# Because the comparison is strict, a rating of exactly 0.5 scores 0 for
# either label.
label_gap = (cluster_ids - data['skin_colour']).abs()
correctness = (label_gap < 0.5).astype(int).tolist()
print(np.mean(correctness))

(1419, 24)
0.283038501561
0.282205240175
0.565186751233


In [68]:
# Re-run the two-cluster KMeans on growing prefixes of featuresColumns
# (first 1 feature, then the first 2, ...) and report, for each prefix, the
# silhouette score, the mean skin-colour rating per cluster and the
# label/rating agreement.
# Note: X, kmeans, clust0, clust1 and correctness are overwritten on every
# pass, so after the loop they refer to the last prefix (all features).
for nbFeatures in range(len(featuresColumns)):
    X = data[featuresColumns[:nbFeatures + 1]]
    # n_init is pinned explicitly: its default changed from 10 to 'auto' in
    # scikit-learn 1.4, which would make these numbers version-dependent.
    kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
    print("silhouette score = {s}".format(s=silhouette_score(X, kmeans.labels_)))

    clust0 = data[kmeans.labels_ == 0]
    print(clust0['skin_colour'].mean())

    clust1 = data[kmeans.labels_ == 1]
    print(clust1['skin_colour'].mean())

    # 1 when the cluster label is strictly less than 0.5 away from the
    # skin-colour rating, else 0; computed column-wise instead of per row.
    label_gap = (kmeans.labels_ - data['skin_colour']).abs()
    correctness = (label_gap < 0.5).astype(int).tolist()
    print(np.mean(correctness))

silhouette score = 0.5658200753175654
0.296837708831
0.26247848537
0.499647639183
silhouette score = 0.47947744173758666
0.299406424581
0.265825035562
0.451726568006
silhouette score = 0.4780667925692971
0.299406424581
0.265825035562
0.451726568006
silhouette score = 0.4763391557290047
0.299406424581
0.265825035562
0.451726568006
silhouette score = 0.4743261908992165
0.299406424581
0.265825035562
0.451726568006
silhouette score = 0.47240542366591154
0.299406424581
0.265825035562
0.451726568006
silhouette score = 0.4597309974748744
0.299406424581
0.265825035562
0.451726568006
silhouette score = 0.5145841653023195
0.284297323136
0.278485254692
0.594080338266
silhouette score = 0.5016458462400325
0.2840583174
0.279155495979
0.594080338266
silhouette score = 0.5749137417787247
0.282648283039
0.283024017467
0.567300916138
silhouette score = 0.5721897844869872
0.282820512821
0.282657657658
0.570119802678
silhouette score = 0.5603742788536803
0.283678756477
0.280837004405
0.565186751233
silho