# Dataset importation
At first, we import the dataset in csv format.

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
import time

In [3]:
data = pd.read_csv('CrowdstormingDataJuly1st.csv')
data.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,...,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,...,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181.0,79.0,,1,0,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191.0,87.0,Center Back,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172.0,70.0,Right Midfielder,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002


## Eliminating missing data

Since some rating labels are missing we consider for the training dataset the players with both ratings.
Note that there are no players with a single rating missing.

In [4]:
missing_rater_data=data[data.rater2.isnull() & data.rater1.isnull()]
rater_data=data[data.rater2.notnull() & data.rater1.notnull()]

In [5]:
rater_data.shape

(124621, 28)

In [6]:
missing_rater_data.shape

(21407, 28)

In [7]:
data.shape

(146028, 28)

# Data analysis

### Preprocessing
 - Delete the column photoID
 - Delete all the rows corresponding to a referee with less than 22 entry (impossible - problem in the dataset)
 - Remove less significant referee entry (with nIAT and nExp too low) [preprocess_2]

In [8]:
def preprocess_1(database,remove_bad_referee=True,nIAT_threshold=50,nExp_threshold=50):
    train=database.copy()
    del train['photoID']
    del train['victories']
    del train['ties']
    del train['defeats']
    train['meanYellow']=train.yellowCards/train.games
    train['meanReds']=train.redCards/train.games
    train['meanYellowReds']=train.yellowReds/train.games

    train_group=train.groupby(train.refNum)
    
    if remove_bad_referee:
        for i,group in train_group:
            if group.shape[0]<22:
                train=train.drop(train_group.get_group(i).index)
            if i%500==0:
                print(i)
    
   
    return train

In [9]:
train_after_prep_1=preprocess_1(rater_data,remove_bad_referee=True)

500
1000
1500
2000
2500
3000


In [10]:
def preprocess_2(database,nIAT_threshold=50,nExp_threshold=50):
    train=database.copy()
    train_group=database.groupby(train.refNum)

    for i,group in train_group:
        if group.nIAT.iloc[0]<nIAT_threshold or group.nExp.iloc[0]<nExp_threshold:
            train=train.drop(train_group.get_group(i).index)
        if i%200==0:
            print(i)
    return train

In [11]:
train_after_prep_2=preprocess_2(train_after_prep_1)

1600
2400
3000


In [12]:
# train_after_prep_2['IAT_yellow']=train_after_prep_2.meanYellow*train_after_prep_2.meanIAT
# train_after_prep_2['IAT_yellowred']=train_after_prep_2.meanYellowReds*train_after_prep_2.meanIAT
# train_after_prep_2['IAT_red']=train_after_prep_2.meanReds*train_after_prep_2.meanIAT
# train_after_prep_2['Exp_yellow']=train_after_prep_2.meanYellow*train_after_prep_2.meanExp
# train_after_prep_2['Exp_yellowred']=train_after_prep_2.meanYellowReds*train_after_prep_2.meanExp
# train_after_prep_2['Exp_red']=train_after_prep_2.meanReds*train_after_prep_2.meanExp

In [13]:
train_after_prep_2.ix[:,:20].head(3)

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,goals,yellowCards,yellowReds,redCards,rater1,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT
5,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,0,0,0,0,0.25,0.0,4,4,LUX,0.325185,127.0
6,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,0,0,0,0,0.0,0.25,4,4,LUX,0.325185,127.0
7,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,0,0,0,0,1.0,1.0,4,4,LUX,0.325185,127.0


In [14]:
train_after_prep_2.ix[:,20:].head(3)

Unnamed: 0,seIAT,meanExp,nExp,seExp,meanYellow,meanReds,meanYellowReds
5,0.003297,0.538462,130.0,0.013752,0.0,0.0,0.0
6,0.003297,0.538462,130.0,0.013752,0.0,0.0,0.0
7,0.003297,0.538462,130.0,0.013752,0.0,0.0,0.0


In [15]:
(train_after_prep_2.nExp[train_after_prep_2.nExp<100]).value_counts()

95.0    77
69.0    37
76.0    28
Name: nExp, dtype: int64

In [16]:
min(train_after_prep_2.nExp)

69.0

# Random forest with referee information

Operations done:
 - Keep only the wanted features: ['playerShort','leagueCountry','birthday','height','weight','position','games','club','yellowCards','redCards','yellowReds','meanIAT','meanExp','rater1','rater2']
 - Fill NaN with median of the columns
 - Keep only the year of birth (not the date)
 - Add dummy variables for the categorical variables
 - Keep one entry for player
 - Extract y as mean of the rater1 and rater2

### Splitting - adding features

In [18]:
def add_mean_player(x):
    x_games=sum(x.games)
    x['meanYellowPlayer']=sum(x.yellowCards)/x_games
    x['meanRedPlayer']=sum(x.redCards)/x_games
    x['meanYellowRedPlayer']=sum(x.yellowReds)/x_games
    
    x['meanYellowPlayerIAT']=sum(x.yellowCards*x.meanIAT)/x_games
    x['meanRedPlayerIAT']=sum(x.redCards*x.meanIAT)/x_games
    x['meanYellowRedPlayerIAT']=sum(x.yellowReds*x.meanIAT)/x_games
    
    x['meanYellowPlayerExp']=sum(x.yellowCards*x.meanExp)/x_games
    x['meanRedPlayerExp']=sum(x.redCards*x.meanExp)/x_games
    x['meanYellowRedPlayerExp']=sum(x.yellowReds*x.meanExp)/x_games
    
    return x

In [19]:
def add_mean_role(x,train_database_grouped=False,is_train=True):
    if is_train:
        
        x['meanYellowRole']=sum(x.yellowCards)/sum(x.games)
        x['meanRedRole']=sum(x.redCards)/sum(x.games)
        x['meanYellowRedRole']=sum(x.yellowReds)/sum(x.games)

        x['meanYellowRoleIAT']=sum(x.yellowCards*x.meanIAT)/sum(x.games)
        x['meanRedRoleIAT']=sum(x.redCards*x.meanIAT)/sum(x.games)
        x['meanYellowRedRoleIAT']=sum(x.yellowReds*x.meanIAT)/sum(x.games)

        x['meanYellowRoleExp']=sum(x.yellowCards*x.meanExp)/sum(x.games)
        x['meanRedRoleExp']=sum(x.redCards*x.meanExp)/sum(x.games)
        x['meanYellowRedRoleExp']=sum(x.yellowReds*x.meanExp)/sum(x.games)
    else:
        position=x.position.iloc[0]
        leagueCountry=x.leagueCountry.iloc[0]
        group=train_database_grouped.loc[position]
        x['meanYellowRole']=group.loc['meanYellowRole']
        x['meanRedRole']=group.loc['meanRedRole']
        x['meanYellowRedRole']=group.loc['meanYellowRedRole']

        x['meanYellowRoleIAT']=group.loc['meanYellowRoleIAT']
        x['meanRedRoleIAT']=group.loc['meanRedRoleIAT']
        x['meanYellowRedRoleIAT']=group.loc['meanYellowRedRoleIAT']

        x['meanYellowRoleExp']=group.loc['meanYellowRoleExp']
        x['meanRedRoleExp']=group.loc['meanRedRoleExp']
        x['meanYellowRedRoleExp']=group.loc['meanYellowRedRoleExp']
    return x

In [27]:
def process_database(starting_database,train_indexes=True,test=False,binary_y=False):
    ''' Function that takes the preprocessed database and outputs train, test and y
        @ params:
            starting_database - the preprocessed database
            train_indexes - the indexes of the player to be inserted in the training 
                            (the PLAYER INDEXES, not the indexes of the full database)
            test - if considering also a test matrix
            binary_y - if considering a binary (0-1) y
                            
        @ returns:
            train
            test
            y
    '''
    
    
    number_of_players=len(starting_database.playerShort.value_counts())
    if test==False:
        train_indexes=np.range(starting_database.shape[0])
        test_indexes=[]
    else:
        small_database=starting_database.groupby('playerShort').mean()
        small_database=small_database.iloc[train_indexes]
        train_indexes=np.array(starting_database[starting_database.playerShort.isin(small_database.index)].index)
        test_indexes=np.array(starting_database.index)
        test_indexes=np.setdiff1d(test_indexes,train_indexes)
    t1=time.time()
    
    # Keep only wanted features - remove nan
    database=starting_database[['playerShort','leagueCountry','birthday','height','weight','position','games','club','yellowCards','redCards','yellowReds','meanIAT','meanExp','rater1','rater2']]
    database.is_copy=False
    database.position.fillna('nan',inplace=True)
    database.leagueCountry.fillna('nan',inplace=True)
    
    database=database.fillna(database.loc[train_indexes].median())

    # Process birthday to keep only the year
    database.birthday=database.birthday.apply(lambda x: int(x.split('.')[2]))
    
    # Add features - mean yellow cards per match of each player and the same quantity weighted with IAT and Exp
    database=database.groupby('playerShort').apply(add_mean_player)

    # Add features - mean yellow cards per role (also the weighted quantity)
    train_grouped=database.loc[train_indexes].groupby(['position']).apply(add_mean_role)
    train_grouped=train_grouped.groupby(['position']).first()
    database=database.groupby(['position']).apply(lambda x: add_mean_role(x,train_grouped,False))

    # Difference of added features 
    database['yellowDifference']=database.meanYellowPlayer-database.meanYellowRole
    database['redDifference']=database.meanRedPlayer-database.meanRedRole
    database['yellowRedDifference']=database.meanYellowRedPlayer-database.meanYellowRedRole
    
    database['yellowDifferenceIAT']=database.meanYellowPlayerIAT-database.meanYellowRoleIAT
    database['redDifferenceIAT']=database.meanRedPlayerIAT-database.meanRedRoleIAT
    database['yellowRedDifferenceIAT']=database.meanYellowRedPlayerIAT-database.meanYellowRedRoleIAT
    
    database['yellowDifferenceExp']=database.meanYellowPlayerExp-database.meanYellowRoleExp
    database['redDifferenceExp']=database.meanRedPlayerExp-database.meanRedRoleExp
    database['yellowRedDifferenceExp']=database.meanYellowRedPlayerExp-database.meanYellowRedRoleExp
    
    # Dummy variables
    dummy_variables=pd.get_dummies(database[['position','leagueCountry']])
    database=pd.concat([database,dummy_variables],axis=1)
    del database['leagueCountry']
    del database['position']
    
    # Split train e test
    train=database.loc[train_indexes]
    test=database.loc[test_indexes]
    
    # Extract one row per player
    train=train.groupby('playerShort').mean()
    test=test.groupby('playerShort').mean()
    
    # Get y and delete the columns in the train
    y_train=(train['rater1']+train['rater2'])/2
    y_test=(test['rater1']+test['rater2'])/2
    del train['rater1']
    del train['rater2']
    del test['rater1']
    del test['rater2']

    
    # y assumes only 0-1 values
    if binary_y:
        y_train=1*(y_train>0.3)
        y_test=1*(y_test>0.3)
    return train,test,y_train,y_test

### Splitting in training and test database

In [31]:
number_of_players=len(train_after_prep_2.playerShort.value_counts())

In [None]:
X_train, X_test, y_train, y_test = process_database(train_after_prep_2,train_indexes=np.random.randint(number_of_players,size=1000),test=True,binary_y=True)

In [None]:
features_to_keep=['birthday', 'height', 'weight', 
                  'meanYellowPlayer', 'meanRedPlayer',
       'meanYellowRedPlayer', 'meanYellowPlayerIAT', 'meanRedPlayerIAT',
       'meanYellowRedPlayerIAT', 'meanYellowPlayerExp', 'meanRedPlayerExp',
       'meanYellowRedPlayerExp', 'yellowDifference', 'redDifference',
       'yellowRedDifference', 'yellowDifferenceIAT', 'redDifferenceIAT',
       'yellowRedDifferenceIAT', 'yellowDifferenceExp', 'redDifferenceExp',
       'yellowRedDifferenceExp',]

In [None]:
X_train=X_train[features_to_keep]
X_test=X_test[features_to_keep]

### Classification

In [None]:
def evaluate_random_forest_model(X_train,X_test,y_train,y_test,n_estimators=10,criterion='gini',
                                max_depth=None,min_samples_split=2, min_samples_leaf=1, 
              min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, 
              bootstrap=True, oob_score=False, n_jobs=1, random_state=None, 
              verbose=0, warm_start=False, class_weight=None):
    
    rfc = RandomForestClassifier(n_estimators=n_estimators,criterion=criterion,
                                max_depth=max_depth,min_samples_split=min_samples_split, 
                                                       min_samples_leaf=min_samples_leaf, 
                                                       min_weight_fraction_leaf=min_weight_fraction_leaf, 
                                                       max_features=max_features, max_leaf_nodes=max_leaf_nodes, 
                                                       bootstrap=bootstrap, 
                                                       oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, 
                                                       verbose=verbose, warm_start=warm_start, 
                                                       class_weight=class_weight)
    sample_weight=np.array([2.1 if i == 0 else 1 for i in y_train])
    rfc.fit(X_train,np.asarray(y_train, dtype="|S6"),sample_weight=sample_weight)
    y_out=rfc.predict(X_test)
    y_out=np.asarray(y_out,float)
    y_out=np.asarray(8*y_out,int)
    y_test=np.asarray(8*y_test,int)
    y_out_binary=1*(y_out>0.3)
    y_test_binary=1*(y_test>0.3)
#     print((y_out-y_test)[:100])
#     print(rfc.feature_importances_)
    print(metrics.confusion_matrix(y_out_binary,y_test_binary))
    return metrics.zero_one_loss(y_out_binary,y_test_binary)
#     return metrics.mean_squared_error(y_out,y_test.values)

### Overfitting
The first is the error with train and test split

The second is the error evaluated in the train database

In [None]:
X_train.head()

In [None]:
sum(y_test>0.3)/y_test.shape[0]

In [None]:
evaluate_random_forest_model(X_train,X_test,y_train,y_test)
# print(a,np.sum(a))
# (np.sum(np.diagonal(a))+np.sum(np.diagonal(a,offset=-1))+np.sum(np.diagonal(a,offset=-2))+np.sum(np.diagonal(a,offset=2))+np.sum(np.diagonal(a,offset=1)))/np.sum(a)
# (np.sum(np.diagonal(a))+np.sum(np.diagonal(a,offset=-1))+np.sum(np.diagonal(a,offset=1)))/np.sum(a)

In [None]:
evaluate_random_forest_model(X_train,X_train,y_train,y_train)

### Parameter variation to estimate overfitting

In [None]:
plt.rcParams['figure.figsize'] = (10.0, 8.0)


In [None]:
error_train=[]
error_test=[]
for i in range(1,20):
    error_test.append(evaluate_random_forest_model(X_train,X_test,y_train,y_test,max_depth=i))
    error_train.append(evaluate_random_forest_model(X_train,X_train,y_train,y_train,max_depth=i))


max_depth

In [None]:
plt.plot(range(1,20),error_test)
plt.plot(range(1,20),error_train)
plt.legend(['test_error','train_error'])
plt.title('Error with different max_depth')
plt.xlabel('max_depth')
plt.ylabel('mean square error')

n_estimators

In [None]:
error_train=[]
error_test=[]
for i in range(1,40):
    error_test.append(evaluate_random_forest_model(X_train,X_test,y_train,y_test,n_estimators=i))
    error_train.append(evaluate_random_forest_model(X_train,X_train,y_train,y_train,n_estimators=i))


In [None]:
plt.plot(range(1,40),error_test)
plt.plot(range(1,40),error_train)
plt.legend(['test_error','train_error'])
plt.title('Error with different n_estimators')
plt.xlabel('n_estimators')
plt.ylabel('mean square error')

criterion

In [None]:
error_train=[]
error_test=[]

error_test.append(evaluate_random_forest_model(X_train,X_test,y_train,y_test,criterion='gini'))
error_train.append(evaluate_random_forest_model(X_train,X_train,y_train,y_train,criterion='gini'))

error_test.append(evaluate_random_forest_model(X_train,X_test,y_train,y_test,criterion='entropy'))
error_train.append(evaluate_random_forest_model(X_train,X_train,y_train,y_train,criterion='entropy'))

In [None]:
error_train

In [None]:
error_test

### Cross validation

With cross validation we expect more precise results.
We chosed n_folds=30.

In [None]:
kf = KFold(train.shape[0], n_folds=10, shuffle=True)

In [None]:
def cross_val(database,n_estimators=10,criterion='gini',
                                max_depth=None,min_samples_split=2, min_samples_leaf=1, 
              min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, 
              bootstrap=True, oob_score=False, n_jobs=1, random_state=None, 
              verbose=0, warm_start=False, class_weight=None):
    
    error_train=[]
    error_test=[]
    for iteration, data in enumerate(kf, start=1):
        print('Iteration:',iteration)
#         X_train=database.iloc[data[0]]
#         X_test=database.iloc[data[1]]
#         y_train=output.iloc[data[0]]
#         y_test=output.iloc[data[1]]

        X_train,X_test,y_train,y_test=process_database(database,train_indexes=data[0],test=True,binary_y=True)
        X_train=X_train[features_to_keep]
        X_test=X_test[features_to_keep]
        error_test.append(evaluate_random_forest_model(X_train,X_test,y_train,y_test,
                                                       n_estimators=n_estimators,criterion=criterion,
                                                       max_depth=max_depth,min_samples_split=min_samples_split, 
                                                       min_samples_leaf=min_samples_leaf, 
                                                       min_weight_fraction_leaf=min_weight_fraction_leaf, 
                                                       max_features=max_features, max_leaf_nodes=max_leaf_nodes, 
                                                       bootstrap=bootstrap, 
                                                       oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, 
                                                       verbose=verbose, warm_start=warm_start, 
                                                       class_weight=class_weight))
        error_train.append(evaluate_random_forest_model(X_train,X_train,y_train,y_train,
                                                        n_estimators=n_estimators,criterion=criterion,
                                                       max_depth=max_depth,min_samples_split=min_samples_split, 
                                                       min_samples_leaf=min_samples_leaf, 
                                                       min_weight_fraction_leaf=min_weight_fraction_leaf, 
                                                       max_features=max_features, max_leaf_nodes=max_leaf_nodes, 
                                                       bootstrap=bootstrap, 
                                                       oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, 
                                                       verbose=verbose, warm_start=warm_start, 
                                                       class_weight=class_weight))
    return np.mean(error_train),np.mean(error_test)


### Best parameters so far

In [None]:
sum(y_test==1)/len(y_test)

In [None]:
cross_val(train_after_prep_2,min_samples_leaf=2)

### Training parameters

max_leaf_nodes

In [None]:
error_train=[]
error_test=[]
# for i in np.linspace(0,0.5,40):
for i in range(2,41):
    error_train_,error_test_=cross_val(train,y_ref,max_depth=i,min_samples_leaf=20)
    error_train.append(error_train_)
    error_test.append(error_test_)
    if i%10==0:
        print(i)


In [None]:
plt.plot(range(1,40),error_test)
plt.plot(range(1,40),error_train)
plt.legend(['test_error','train_error'])
plt.title('Error with different max_depth')
plt.xlabel('max_depth')
plt.ylabel('mean square error')

n_estimators

In [None]:
error_train=[]
error_test=[]
for i in range(1,40):
    error_train_,error_test_=cross_val(train,y_ref,n_estimators=i)
    error_train.append(error_train_)
    error_test.append(error_test_)
    if i%10==0:
        print(i)


In [None]:
plt.plot(range(1,40),error_test)
plt.plot(range(1,40),error_train)
plt.legend(['test_error','train_error'])
plt.title('Error with different n_estimators')
plt.xlabel('n_estimators')
plt.ylabel('mean square error')

criterion

In [None]:
error_train=[]
error_test=[]

error_train_,error_test_=cross_val(train_trivial,y_trivial,criterion='gini')

error_train.append(error_train_)
error_test.append(error_test_)

error_train_,error_test_=cross_val(train_trivial,y_trivial,criterion='entropy')

error_train.append(error_train_)
error_test.append(error_test_)

In [None]:
error_train

In [None]:
error_test