In [15]:
import numpy as np
import pandas as pd

In [16]:
# Read the player table from disk and preview its first ten rows.
# NOTE(review): the file name suggests rows with missing values were already
# dropped upstream — confirm before relying on it.
data = pd.read_csv(filepath_or_buffer='./Data/players_no_nan.csv')
data.head(n=10)

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,skin_colour,games,...,defeats,goals,yellowCards,yellowReds,redCards,english,german,french,spanish,birthYear
0,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,0.125,654,...,228,9,19,0,0,1,0,0,0,1979
1,aaron-hunt,Aaron Hunt,Werder Bremen,Germany,04.09.1986,183.0,73.0,Attacking Midfielder,0.125,336,...,122,62,42,0,1,0,1,0,0,1986
2,aaron-lennon,Aaron Lennon,Tottenham Hotspur,England,16.04.1987,165.0,63.0,Right Midfielder,0.25,412,...,115,31,11,0,0,1,0,0,0,1987
3,aaron-ramsey,Aaron Ramsey,Arsenal FC,England,26.12.1990,178.0,76.0,Center Midfielder,0.0,260,...,68,39,31,0,1,1,0,0,0,1990
4,abdelhamid-el-kaoutari,Abdelhamid El-Kaoutari,Montpellier HSC,France,17.03.1990,180.0,73.0,Center Back,0.25,124,...,43,1,8,4,2,0,0,1,0,1990
5,abdou-traore_2,Abdou Traoré,Girondins Bordeaux,France,17.01.1988,180.0,74.0,Right Midfielder,0.75,97,...,33,3,11,1,0,0,0,1,0,1988
6,abdoulaye-diallo_2,Abdoulaye Diallo,Stade Rennes,France,30.03.1992,189.0,80.0,Goalkeeper,0.875,24,...,8,0,0,0,0,0,0,1,0,1992
7,abdoulaye-keita_2,Abdoulaye Keita,Girondins Bordeaux,France,19.08.1990,188.0,83.0,Goalkeeper,0.875,3,...,2,0,0,0,0,0,0,1,0,1990
8,abdoulwhaid-sissoko,Abdoulwhaid Sissoko,Stade Brest,France,20.03.1990,180.0,68.0,Defensive Midfielder,1.0,121,...,62,3,21,0,2,0,0,1,0,1990
9,abdul-rahman-baba,Abdul Rahman Baba,SpVgg Greuther Fürth,Germany,02.07.1994,179.0,70.0,Left Fullback,0.875,50,...,25,0,3,0,1,0,1,0,0,1994


In [17]:
# Show which league countries occur in the table.
print(pd.unique(data['leagueCountry']))

['England' 'Germany' 'France' 'Spain']


In [18]:
# One-hot encode the league country into four 0/1 indicator columns.
# Each comparison runs on the whole column at once instead of one row at a
# time, and the (column, country) pairs live in a single table so the four
# assignments cannot drift apart.
league_flags = (
    ('english', 'England'),
    ('german', 'Germany'),
    ('french', 'France'),
    ('spanish', 'Spain'),
)
for flag_column, league_country in league_flags:
    # Boolean mask -> int64, so the columns hold integer 0/1 values.
    data[flag_column] = (data['leagueCountry'] == league_country).astype('int64')
data.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,skin_colour,games,...,defeats,goals,yellowCards,yellowReds,redCards,english,german,french,spanish,birthYear
0,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,0.125,654,...,228,9,19,0,0,1,0,0,0,1979
1,aaron-hunt,Aaron Hunt,Werder Bremen,Germany,04.09.1986,183.0,73.0,Attacking Midfielder,0.125,336,...,122,62,42,0,1,0,1,0,0,1986
2,aaron-lennon,Aaron Lennon,Tottenham Hotspur,England,16.04.1987,165.0,63.0,Right Midfielder,0.25,412,...,115,31,11,0,0,1,0,0,0,1987
3,aaron-ramsey,Aaron Ramsey,Arsenal FC,England,26.12.1990,178.0,76.0,Center Midfielder,0.0,260,...,68,39,31,0,1,1,0,0,0,1990
4,abdelhamid-el-kaoutari,Abdelhamid El-Kaoutari,Montpellier HSC,France,17.03.1990,180.0,73.0,Center Back,0.25,124,...,43,1,8,4,2,0,0,1,0,1990


In [19]:
# Derive the birth year from the last four characters of the `birthday`
# string (the preview shows values such as 08.11.1979) and store it as an
# integer column.
data['birthYear'] = data['birthday'].str[-4:].astype('int64')

# Sanity check: adding the first two years must yield a number (1979 + 1986),
# which confirms the column holds integers rather than strings.
print(data['birthYear'].iloc[0] + data['birthYear'].iloc[1])

3965


In [20]:
# Enumerate the distinct playing positions found in the table.
data.loc[:, 'position'].unique()

array(['Center Back', 'Attacking Midfielder', 'Right Midfielder',
       'Center Midfielder', 'Goalkeeper', 'Defensive Midfielder',
       'Left Fullback', 'Left Midfielder', 'Right Fullback',
       'Center Forward', 'Left Winger', 'Right Winger'], dtype=object)

In [36]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Candidate features, listed by the author in decreasing order of relevance.
# The order matters: a later cell clusters on growing prefixes of this list.
featuresColumns = [
    'weight', 'height',
    'french', 'english', 'german', 'spanish',
    'redCards', 'yellowCards',
    'birthYear', 'games', 'victories', 'defeats', 'ties', 'goals',
]
X = data[featuresColumns]

# Split the players into two clusters. random_state fixes the centroid
# seeding; n_init is given explicitly because the library default moved from
# 10 to 'auto' in newer scikit-learn releases, which would otherwise make the
# result depend on the installed version.
# NOTE(review): the features are clustered on their raw scales, so columns
# with a wide numeric range (e.g. games) weigh far more than the 0/1 country
# flags — consider standardising if that is not intended.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
print(kmeans.labels_[0:20])

# Keep each player's cluster id next to the rest of their data.
data['prediction'] = kmeans.labels_

# Silhouette coefficient of the partition (between -1 and 1, higher means
# better separated clusters).
print(silhouette_score(X, kmeans.labels_))

[1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
0.538702020458


In [22]:
print(data.shape)

# Mean skin_colour value of the players assigned to each cluster.
clust0 = data[kmeans.labels_ == 0]
print(clust0['skin_colour'].mean())

clust1 = data[kmeans.labels_ == 1]
print(clust1['skin_colour'].mean())

# Agreement between cluster id and skin colour: a player counts as a match
# when the cluster label (0 or 1) is within 0.5 of the skin_colour value.
# We want the correctness to be either close to 1 (cluster 1 : skin_colour 1,
# cluster 0 : skin_colour 0) or close to 0 (the opposite assignment).
# The comparison is strict, so a skin_colour of exactly 0.5 matches neither
# label. Computed on whole arrays rather than one row at a time.
correctness = (np.abs(kmeans.labels_ - data['skin_colour'].values) < 0.5).astype(int)
print(np.mean(correctness))

(1419, 23)
0.28303850156087407
0.28220524017467247
0.565186751233


In [9]:
# Repeat the two-cluster fit on growing prefixes of featuresColumns (first
# feature only, first two, ...). For every prefix this prints, in order: the
# silhouette score, the mean skin_colour of cluster 0, the mean skin_colour
# of cluster 1, and the label/skin_colour agreement.
for nbFeatures in range(len(featuresColumns)):
    # nbFeatures is a 0-based index, so the prefix holds nbFeatures + 1 columns.
    X = data[featuresColumns[:nbFeatures + 1]]
    # n_init is given explicitly because the library default moved from 10 to
    # 'auto' in newer scikit-learn releases; pinning it keeps the sweep
    # reproducible across versions.
    kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
    print("silhouette score = {s}".format(s=silhouette_score(X, kmeans.labels_)))

    clust0 = data[kmeans.labels_ == 0]
    print(clust0['skin_colour'].mean())

    clust1 = data[kmeans.labels_ == 1]
    print(clust1['skin_colour'].mean())

    # A player matches when the cluster label (0 or 1) is strictly within 0.5
    # of the skin_colour value; evaluated on whole arrays, not row by row.
    correctness = (np.abs(kmeans.labels_ - data['skin_colour'].values) < 0.5).astype(int)
    print(np.mean(correctness))

silhouette score = 0.5658200753175654
0.29683770883054894
0.26247848537005164
0.499647639183
silhouette score = 0.47901646817507826
0.30126498002663116
0.2619760479041916
0.4637068358
silhouette score = 0.47806679256929685
0.2994064245810056
0.26582503556187764
0.451726568006
silhouette score = 0.47633915572900454
0.2994064245810056
0.26582503556187764
0.451726568006
silhouette score = 0.47432619089921635
0.2994064245810056
0.26582503556187764
0.451726568006
silhouette score = 0.47240542366591143
0.2994064245810056
0.26582503556187764
0.451726568006
silhouette score = 0.45973099747487417
0.2994064245810056
0.26582503556187764
0.451726568006
silhouette score = 0.5169449952091852
0.28474857685009486
0.27705479452054793
0.596194503171
silhouette score = 0.5016458462400324
0.28405831739961757
0.27915549597855227
0.594080338266
silhouette score = 0.5749137417787247
0.28264828303850154
0.2830240174672489
0.567300916138
silhouette score = 0.5721897844869872
0.28282051282051285
0.2826576576576