# Homework 04 - Applied ML

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns  
import matplotlib.pyplot as plt  
from scipy.stats import skew, skewtest, describe
from sklearn import preprocessing
%matplotlib inline

## 1. Predict the skin color of a soccer player

In this first task we train a *Random forest* classifier to predict the skin color of a soccer player. To do so, we first pre-process the data and then move on to the choice of the model (i.e. the choice of the parameters that control possible issues such as *overfitting*). As required, we then inspect the `feature_importances_` attribute and discuss the results obtained.

### 1.1 Exploratory Data Analysis, Feature Selection and Feature engineering

In [2]:
# Load the dyad table from the CSV file; ',' is the field delimiter.
data = pd.read_csv(
    'CrowdstormingDataJuly1st.csv',
    sep=',',
)

In [3]:
data.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,...,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,...,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181.0,79.0,,1,0,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191.0,87.0,Center Back,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172.0,70.0,Right Midfielder,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002


In [4]:
data.columns

Index(['playerShort', 'player', 'club', 'leagueCountry', 'birthday', 'height',
       'weight', 'position', 'games', 'victories', 'ties', 'defeats', 'goals',
       'yellowCards', 'yellowReds', 'redCards', 'photoID', 'rater1', 'rater2',
       'refNum', 'refCountry', 'Alpha_3', 'meanIAT', 'nIAT', 'seIAT',
       'meanExp', 'nExp', 'seExp'],
      dtype='object')

##### First clean of data
According to the information given in the [data description](https://github.com/ADAEPFL/Homework/blob/master/04%20-%20Applied%20ML/DATA.md), we get rid of all the dyads that correspond to players whose picture is not available.

In [5]:
# Keep only the rows whose `photoID` is present.
data_clean = data.loc[data['photoID'].notnull()]

##### Have a glance at the labels

We then check whether either of the two raters failed to assign a label. We see that both of them labelled every remaining dyad.

In [6]:
# Count the rows of the cleaned data for which each rater's label is missing.
miss_rater_1 = data_clean['rater1'].isnull().sum()
miss_rater_2 = data_clean['rater2'].isnull().sum()

print ('Rater 1 does not label', miss_rater_1, 'players')
print ('Rater 2 does not label', miss_rater_2, 'players')

Rater 1 does not label 0 players
Rater 2 does not label 0 players


We study the distribution of the labels, also in order to check for disagreements between the two raters. The procedure consists of:
- Grouping by the `playerShort`
- Get the given labels
- Plot their distribution using a *simple* barplot

In [7]:
# Drop the dyads for which the height or the weight is missing.
data_clean = data_clean.dropna(subset=['height', 'weight'], axis='index')

In [8]:
player_data = data_clean.groupby('playerShort')

Verify that all the players only belong to one club

In [9]:
# For every player, count the distinct `club` values among their rows, then
# list the distinct counts: a lone 1 means each player has exactly one club.
player_data.agg({'club' : lambda x: len(set(x))})['club'].unique()

array([1])

In [10]:
# Same check for `position`: count the distinct values per player and list
# the distinct counts (a lone 1 means one position per player).
player_data.agg({'position' : lambda x: len(set(x))})['position'].unique()

array([1])

In [11]:
# Collapse the dyads into one row per player. The referee-level columns
# (refNum, refCountry, meanIAT, meanExp) are not aggregated.
players = player_data.agg({
    # Player attributes: take the first value found in each group.
    'club': 'first', 'leagueCountry': 'first', 'birthday': 'first',
    'height': 'first', 'weight': 'first', 'position': 'first',
    # Match statistics: total over all the rows of the player.
    'games': 'sum', 'victories': 'sum', 'ties': 'sum', 'defeats': 'sum',
    'goals': 'sum', 'yellowCards': 'sum', 'yellowReds': 'sum', 'redCards': 'sum',
    # Rater labels: average over all the rows of the player.
    'rater1': 'mean', 'rater2': 'mean',
})

In [12]:
label_1 = players['rater1']

In [13]:
label_2 = players['rater2']

In [14]:
def binary_labels(x):
    """Binarise a rating: 0 when ``x <= 0.5``, 1 otherwise.

    Any value for which the comparison is false (NaN included) maps to 1.
    """
    return 0 if x <= 0.5 else 1

In [None]:
def preprocess_labels(label):
    """Encode the values of ``label`` as integer class codes.

    A fresh ``LabelEncoder`` is fitted on ``label`` and the transformed
    values are returned.
    """
    encoder = preprocessing.LabelEncoder()
    return encoder.fit(label).transform(label)

In [15]:
# Turn the averaged ratings of both raters into binary classes.
label_1 = label_1.map(binary_labels)
label_2 = label_2.map(binary_labels)

In [None]:
# Re-encode the labels as integer class codes. The index and name of each
# Series are passed on explicitly: building a Series from the bare encoded
# array would replace the player index by 0..n-1, and the labels could no
# longer be matched to the rows of `players` by index.
label_1 = pd.Series(preprocess_labels(label_1), index=label_1.index, name=label_1.name)
label_2 = pd.Series(preprocess_labels(label_2), index=label_2.index, name=label_2.name)

In [16]:
# The rater-1 label is a target, not a feature: remove it from the feature frame.
players = players.drop('rater1', axis=1)

In [17]:
# The rater-2 label is a target, not a feature: remove it from the feature frame.
players = players.drop('rater2', axis=1)

### Baseline model

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
from sklearn.utils import shuffle

In [20]:
from sklearn.multioutput import MultiOutputClassifier

#### Preprocess the variables to be used as input for the classifier

In [21]:
# Keep only the part of the date string that follows the last '.', as a float.
players['birthday'] = players['birthday'].map(lambda date: float(date.rsplit('.', 1)[-1]))

In [22]:
def encode_string_variable(df, attribute):
    """Replace column ``attribute`` of ``df`` by integer codes, in place.

    Missing values are first replaced by the string 'Unknown'; a
    ``LabelEncoder`` fitted on the filled column then supplies the codes.
    The function modifies ``df`` and returns nothing.
    """
    filled = df[attribute].fillna('Unknown')
    df[attribute] = filled

    encoder = preprocessing.LabelEncoder()
    df[attribute] = encoder.fit(filled).transform(filled)

In [23]:
# Get the string variables
object_features = [i for i in players.columns if players[i].dtypes == 'object']
numerical_features = [i for i in players.columns if (players[i].dtypes == 'int64' or players[i].dtypes == 'float64') and len(players[i].unique()) > 12]

In [24]:
# Label-encode every string column of `players` in place.
for feature in object_features:
    encode_string_variable(players, feature)

##### Categorise features

In [25]:
numerical_features

['yellowCards',
 'height',
 'birthday',
 'games',
 'weight',
 'ties',
 'goals',
 'victories',
 'defeats']

In [26]:
from functools import partial

In [27]:
def create_bins(df, attribute):
    """Compute histogram bin intervals for one numerical column.

    The bin-width estimator is chosen from a skewness test run on the values
    lying at or above the lower box-plot whisker: 'doane' when the p-value is
    below 0.05, 'auto' otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to bin.
    attribute : str
        Name of the column.

    Returns
    -------
    list of tuple
        Consecutive ``(left_edge, right_edge)`` pairs, one per bin.
    """
    values = df[attribute]

    # matplotlib's box plot is only used to get the end of the lower whisker;
    # the figure itself is not needed, so it is closed straight away.
    boxplot = plt.boxplot(values)
    plt.close()
    lower_whisker = boxplot['whiskers'][0].get_ydata()[1]

    skew_pvalue = skewtest(values[values >= lower_whisker])[1]
    estimator = 'doane' if skew_pvalue < 0.05 else 'auto'

    # Always bin the column of the frame that was passed in, never a
    # module-level frame, so the function works for any DataFrame.
    edges = np.histogram(values, bins=estimator)[1]
    return list(zip(edges[:-1], edges[1:]))

In [28]:
def categorisation(bins_intervals, x):
    """Return the index of the interval of ``bins_intervals`` containing ``x``.

    Intervals are half-open (``low <= x < high``) and are tried in order.
    A value that matches none of them gets the index of the last interval.
    """
    for index, interval in enumerate(bins_intervals):
        if interval[0] <= x < interval[1]:
            return index

    # Fallback: last class (IndexError when there are no intervals at all).
    return range(len(bins_intervals))[-1]

In [29]:
# Replace each numerical feature by the index of the histogram bin that
# contains its value; the bins are computed before the column is overwritten.
for feature in numerical_features:
    players[feature] = players[feature].apply(
        partial(categorisation, create_bins(players, feature))
    )

In [30]:
players

Unnamed: 0_level_0,position,yellowCards,height,birthday,leagueCountry,games,weight,yellowReds,ties,goals,victories,defeats,redCards,club
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
aaron-hughes,1,1,14,4,0,11,5,0,14,0,7,14,0,33
aaron-hunt,0,3,14,8,2,5,5,0,5,2,4,7,1,90
aaron-lennon,10,0,2,9,0,7,2,0,7,1,6,7,0,82
aaron-ramsey,3,2,11,11,0,4,6,0,3,1,4,4,1,6
abdelhamid-el-kaoutari,1,0,12,11,1,2,5,4,3,0,1,2,2,50
abdou-traore_2,10,0,12,9,1,1,6,1,1,0,1,2,0,35
abdoulaye-diallo_2,5,0,18,12,1,0,7,0,0,0,0,0,0,79
abdoulaye-keita_2,5,0,18,11,1,0,8,0,0,0,0,0,0,35
abdoulwhaid-sissoko,4,1,12,11,1,2,4,0,2,0,1,3,2,77
abdul-rahman-baba,6,0,12,13,2,0,4,0,0,0,0,1,1,76


In [None]:
#describe(players[numerical_features[1]])

In [None]:
#np.histogram(players[numerical_features[1]], bins = 'doane')

In [None]:
#describe(players[numerical_features[2]])

In [None]:
#np.histogram(players[numerical_features[2]], bins = 'doane')

In [None]:
#describe(players[numerical_features[3]])

In [None]:
#np.histogram(players[numerical_features[3]], bins = 'doane')

In [None]:
#describe(players[numerical_features[4]])

In [None]:
#np.histogram(players[numerical_features[4]], bins = 'doane')

In [None]:
#describe(players[numerical_features[5]])

In [None]:
#np.histogram(players[numerical_features[5]], bins = 'doane')

In [None]:
#describe(players[numerical_features[6]])

In [None]:
#np.histogram(players[numerical_features[6]], bins = 'doane')

In [None]:
#describe(players[numerical_features[7]])

In [None]:
#np.histogram(players[numerical_features[7]], bins = 'auto')

In [None]:
#describe(players[numerical_features[8]])

In [None]:
#np.histogram(players[numerical_features[8]], bins = 'doane')

In [None]:
#describe(players[numerical_features[9]])

In [None]:
#np.histogram(players[numerical_features[9]], bins = 'doane')

In [None]:
#describe(players[numerical_features[10]])

In [None]:
#np.histogram(players[numerical_features[10]], bins = 'doane')

In [None]:
#describe(players[numerical_features[11]])

In [None]:
#np.histogram(players[numerical_features[11]], bins = 'doane')

#### Split train and test

In [111]:
from sklearn.model_selection import train_test_split

In [112]:
# Hold out 33% of the players for testing, with the rater-1 labels as target;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    players,
    label_1,
    test_size=0.33,
    random_state=42,
)

In [113]:
# Relative frequency of each class among the training labels.
weight_class = y_train.value_counts()/len(y_train)

In [114]:
weight_class

0    0.850048
1    0.149952
Name: rater1, dtype: float64

In [115]:
# One weight per training sample: the relative frequency of the sample's class.
# NOTE(review): these weights grow with the class frequency, while the forest
# is created with class_weight='balanced'; scikit-learn multiplies the class
# weights by sample_weight, so the two presumably cancel out - confirm that
# this is intended.
sample_weights = [weight_class[label] for label in y_train]

In [116]:
# Rater-2 labels for the same rows as the rater-1 train and test sets.
y_train_2 = label_2[y_train.index]
y_test_2 = label_2[y_test.index]

In [117]:
# Random forest of 100 trees with a fixed seed; class_weight='balanced' asks
# scikit-learn to weight the classes inversely to their frequency.
forest = RandomForestClassifier(n_estimators=100, random_state=1, class_weight='balanced')

In [118]:
# Fit the forest on the rater-1 training labels, passing one weight per sample.
# fit() returns the estimator itself, so `train_forest` and `forest` are the same object.
train_forest = forest.fit(X_train, y_train, sample_weight= sample_weights)

In [119]:
# Classes predicted by the fitted forest for the test set.
a = train_forest.predict(X_test)

In [120]:
train_forest.score(X_test, y_test)

0.84139264990328821

In [121]:
# Wrap the forest to predict several targets at once (one classifier per
# target column); n_jobs=-1 lets scikit-learn use all available cores.
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)

In [122]:
# Two-column target arrays: column 0 holds the rater-1 labels, column 1 the rater-2 labels.
multi_label = np.column_stack([y_train, y_train_2])
multi_label_test = np.column_stack([y_test, y_test_2])

In [123]:
# Fit the two-target model with the per-sample weights and score it on the test set.
multi_target_forest.fit(X_train, multi_label, sample_weight= sample_weights).score(X_test, multi_label_test)

0.82011605415860733

In [None]:
# Test-set predictions of the rater-1 forest, as a NumPy array.
# `classifier_1` is built from the fitted model here so that the cell runs on
# a fresh kernel instead of reading a name that no earlier cell defines.
# NOTE(review): presumably these are the predictions this variable was meant
# to hold, since it is compared with `y_test` further down - confirm.
classifier_1 = np.asarray(train_forest.predict(X_test))

In [None]:
# For each class id, print how many test labels and how many predictions have it.
# NOTE(review): the labels were binarised to 0/1 earlier, so class ids 2-4
# look like leftovers - confirm.
for class_id in range(5):
    test_count = len(y_test[y_test == class_id])
    print ('TEST', test_count, 'class', class_id)
    predicted_count = len(classifier_1[classifier_1 == class_id])
    print ('PREDICTOR', predicted_count, 'class', class_id)
    print ('*' * 20)

In [None]:
len(classifier_1)

In [None]:
len(y_test[y_test == 0])

In [None]:
len(classifier_1[classifier_1 == 0])

In [None]:
# Same two-target fit and test score, this time without per-sample weights.
multi_target_forest.fit(X_train, multi_label).score(X_test, multi_label_test)

In [None]:
# Fraction of test samples whose prediction equals the rater-1 label.
(y_test == classifier_1).mean()

In [None]:
classifier_1

In [None]:
y_test