In [12]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
import os
import scipy
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet,Lars
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import matplotlib.pyplot as plt
%matplotlib inline 


<div class="alert alert-block alert-info">
<b>Loading the data:</b> We load the data from the mentioned path. We will try to estimate the potential based on a select number of features from the data 
</div>

In [19]:
path_of_input_file = r'D:\kaggle_trials\fifa19\data.csv'
df                 = pd.read_csv(path_of_input_file,)
df                 = df[['Preferred Foot','Age','International Reputation','Weak Foot','Skill Moves',
                         'Work Rate','Body Type','Position','Height','Crossing','Finishing','HeadingAccuracy',
                         'ShortPassing','Volleys','Dribbling','Curve','FKAccuracy','LongPassing','BallControl',
                         'Acceleration','SprintSpeed','Agility','Reactions','Balance','ShotPower','Jumping','Potential'
                        ]]
df.head(5)

Unnamed: 0,Preferred Foot,Age,International Reputation,Weak Foot,Skill Moves,Work Rate,Body Type,Position,Height,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Potential
0,Left,31,5.0,4.0,4.0,Medium/ Medium,Messi,RF,5'7,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,94
1,Right,33,5.0,4.0,5.0,High/ Low,C. Ronaldo,ST,6'2,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,94
2,Right,26,5.0,5.0,5.0,High/ Medium,Neymar,LW,5'9,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,93
3,Right,27,4.0,3.0,1.0,Medium/ Medium,Lean,GK,6'4,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,93
4,Right,27,4.0,5.0,4.0,High/ High,Normal,RCM,5'11,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,92


<div class="alert alert-block alert-info">
<b>Preprocessing data :</b> We separate out the numerical and categorical columns from the data to be used for scaling and encoding respectively 
</div>

In [21]:
cols_needed           = list(df.columns)
cols_needed           = cols_needed[:len(cols_needed)-1]

possible_numeric_cols = list(df._get_numeric_data().columns)
possible_numeric_cols.remove('Potential')

categorical_columns   = list(set(cols_needed)- set(possible_numeric_cols))

numerical_columns     = []
for i in range(len(possible_numeric_cols)):
    col_name  = possible_numeric_cols[i]
    if len(df[col_name].unique())<10:
        categorical_columns.append(col_name)
    else:
        numerical_columns.append(col_name)

<div class="alert alert-block alert-info">
<b>Missing value Treatment:</b> We impute the numerical missing values with their respective means and the categorical values with their modes.
</div>

In [22]:
for i in range(len(categorical_columns)):
    df[categorical_columns[i]] = df[categorical_columns[i]].fillna(df[categorical_columns[i]].mode()[0])
mean_impute_dict    ={}
for i in range(len(numerical_columns)):
    mean_impute_dict[numerical_columns[i]] = np.nanmean(np.float_(df[numerical_columns[i]].values))
for i in range(len(numerical_columns)):
    df[numerical_columns[i]]   = df[numerical_columns[i]].fillna(mean_impute_dict[numerical_columns[i]])

<div class="alert alert-block alert-info">
<b>Scaling and Encoding:</b> We scale and one hot encode the data to get the matrix we need for calculations
</div>

In [23]:
ohe                  = OneHotEncoder()
scalar               = MinMaxScaler()
encoded_matrix       = ohe.fit_transform(df[categorical_columns])
scaled_matrix        = scalar.fit_transform(df[numerical_columns])
X_complete_matrix    = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A
Y                    = scalar.fit_transform(df[['Potential']])


<div class="alert alert-block alert-info">
<b>Train Test Split :</b> We split the data to train and test set 
</div>

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.2, random_state=42)

<div class="alert alert-block alert-info">
<b>Parameter Tuning and setting Grid for parameters:</b> We set up the grid for parameter tuning and then tune the parameters to get the optimal list of parameters to use
</div>

In [25]:
lars_grid  = {
                     'precompute':hp.choice('precompute',[True,False])
                     }

In [26]:
def hyperopt_train_test(params):
    reg = ElasticNet(**params,random_state=19)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, lars_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(lars_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|███████████████████████████████████████████████| 30/30 [00:02<00:00, 14.91it/s, best loss: 0.00037244777701158266]
The best parameter tuned on training set is given by :-  {'precompute': False}


<div class="alert alert-block alert-info">
<b>Implementing the model:</b> We now implement the model with tuned parameters and get the R^2 score
</div>

In [33]:
model = Lars(**best_parameters)
model.fit(X_train, y_train)

Lars(copy_X=True, eps=2.220446049250313e-16, fit_intercept=True, fit_path=True,
     n_nonzero_coefs=500, normalize=True, positive=False, precompute=False,
     verbose=False)

In [34]:
y_pred = model.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  -0.0005494505494541002


<div class="alert alert-block alert-info">
<b>Parameter Grid for Extra Trees:</b> We set up the parameter grid for Extra trees classifier and then tune it to get the results needed
</div>

In [35]:
extra_trees_grid  = {'n_estimators' : hp.choice('n_estimators',range(5,50)),
                      'min_samples_split' : hp.uniform('min_samples_split',0.01,0.95),
                      'min_samples_leaf'  : hp.choice('min_samples_leaf',range(1,10)),
                      'max_features'      : hp.choice('max_features',['auto','sqrt','log2',None])
                     }

In [36]:
def hyperopt_train_test(params):
    reg = ExtraTreesRegressor(**params,random_state=11)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, extra_trees_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(extra_trees_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|██████████████████████████████████████████████████| 30/30 [00:32<00:00,  1.72s/it, best loss: -0.7556171670409636]
The best parameter tuned on training set is given by :-  {'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 0.01046857820960289, 'n_estimators': 13}


In [39]:
et_regressor = ExtraTreesRegressor(**best_parameters,random_state=11)
et_regressor.fit(X_train,y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=0.01046857820960289,
                    min_weight_fraction_leaf=0.0, n_estimators=13, n_jobs=None,
                    oob_score=False, random_state=11, verbose=0,
                    warm_start=False)

In [40]:
y_pred = et_regressor.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  0.5875676443712222


<div class="alert alert-block alert-info">
<b>Conclusion:</b> The Extra trees classifier gave pretty better results as compared to LARS. 
</div>