In [28]:
import random

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import median_absolute_error
from sklearn.model_selection import cross_val_score

In [29]:
# Load the cleaned player data from CSV and display the resulting DataFrame.
data = pd.read_csv("../data/data_cleaned.csv")
data

Unnamed: 0,Age,Nationality,Overall,Potential,Club,Value,Wage,Special,Preferred Foot,International Reputation,...,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause,Loaned Out,Off Work Rate,Def Work Rate
0,31,Argentina,94,94,FC Barcelona,110500000.0,565000.0,2202,Left,5.0,...,26.0,6.0,11.0,15.0,14.0,8.0,226500000.0,False,Medium,Medium
1,33,Portugal,94,94,Juventus,77000000.0,405000.0,2228,Right,5.0,...,23.0,7.0,11.0,15.0,14.0,11.0,127100000.0,False,High,Low
2,26,Brazil,92,93,Paris Saint-Germain,118500000.0,290000.0,2143,Right,5.0,...,33.0,9.0,9.0,15.0,15.0,11.0,228100000.0,False,High,Medium
3,27,Spain,91,93,Manchester United,72000000.0,260000.0,1471,Right,4.0,...,13.0,90.0,85.0,87.0,88.0,94.0,138600000.0,False,Medium,Medium
4,27,Belgium,91,92,Manchester City,102000000.0,355000.0,2281,Right,4.0,...,51.0,15.0,13.0,5.0,10.0,13.0,196400000.0,False,High,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18202,19,England,47,65,Crewe Alexandra,60000.0,1000.0,1307,Right,1.0,...,47.0,10.0,13.0,7.0,8.0,9.0,143000.0,False,Medium,Medium
18203,19,Sweden,47,63,Trelleborgs FF,60000.0,1000.0,1098,Right,1.0,...,19.0,10.0,9.0,9.0,5.0,12.0,113000.0,False,Medium,Medium
18204,16,England,47,67,Cambridge United,60000.0,1000.0,1189,Right,1.0,...,11.0,6.0,5.0,10.0,6.0,13.0,165000.0,False,Medium,Medium
18205,17,England,47,66,Tranmere Rovers,60000.0,1000.0,1228,Right,1.0,...,27.0,14.0,6.0,14.0,8.0,9.0,143000.0,False,Medium,Medium


In [30]:
# Keep only players whose Value is non-zero; a zero Value can't be
# predicted or trained on.
data = data[data['Value'] != 0]

In [31]:
# Preview the first rows of the frame after the zero-Value filter.
data.head()

Unnamed: 0,Age,Nationality,Overall,Potential,Club,Value,Wage,Special,Preferred Foot,International Reputation,...,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause,Loaned Out,Off Work Rate,Def Work Rate
0,31,Argentina,94,94,FC Barcelona,110500000.0,565000.0,2202,Left,5.0,...,26.0,6.0,11.0,15.0,14.0,8.0,226500000.0,False,Medium,Medium
1,33,Portugal,94,94,Juventus,77000000.0,405000.0,2228,Right,5.0,...,23.0,7.0,11.0,15.0,14.0,11.0,127100000.0,False,High,Low
2,26,Brazil,92,93,Paris Saint-Germain,118500000.0,290000.0,2143,Right,5.0,...,33.0,9.0,9.0,15.0,15.0,11.0,228100000.0,False,High,Medium
3,27,Spain,91,93,Manchester United,72000000.0,260000.0,1471,Right,4.0,...,13.0,90.0,85.0,87.0,88.0,94.0,138600000.0,False,Medium,Medium
4,27,Belgium,91,92,Manchester City,102000000.0,355000.0,2281,Right,4.0,...,51.0,15.0,13.0,5.0,10.0,13.0,196400000.0,False,High,High


In [32]:
# Drop rows that have no 'Preferred Foot' value (deleted for now; an
# imputation strategy can replace this later).
# The explicit .copy() makes `data` an independent frame, so the column
# assignments in the following cells write to it directly rather than to a
# slice of the previous frame (avoids chained-assignment warnings).
data = data.loc[data['Preferred Foot'].notna()].copy()

In [33]:
# Columns holding categorical values (nationality, club, preferred foot,
# body type, real face, position, loaned out, offensive and defensive work
# rate). Each one is converted to numeric category codes below.
cate = [
    "Nationality",
    "Club",
    "Preferred Foot",
    "Body Type",
    "Real Face",
    "Position",
    "Loaned Out",
    "Off Work Rate",
    "Def Work Rate",
]
def cate_to_numeric(cate, frame=None):
    '''
    Replace a categorical column with its integer category codes, in place.

    Parameters
    ----------
    cate : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``data`` frame is
        used, which is the original behaviour of this helper.

    Notes
    -----
    Codes follow the order of the column's categories; missing values are
    encoded as -1. The codes are arbitrary integers, not an ordinal scale.
    '''
    if frame is None:
        # Fall back to the notebook's global frame for existing callers.
        frame = data
    frame[cate] = frame[cate].astype('category').cat.codes

# Encode every listed categorical column in turn.
for column_name in cate:
    cate_to_numeric(column_name)

In [34]:
# Players with no release clause: record the fact in an indicator column
# (1 = missing, 0 = present), then fill the missing clause itself with 0.
# The indicator must be built before the fill, while the gaps are still NaN.
data['no_rlse_clause'] = data['Release Clause'].isna().astype(float)
data['Release Clause'] = data['Release Clause'].fillna(0)

In [35]:
# Rows with no value in the position-rating columns (LS through RB) are
# treated as goalkeepers: a missing 'LS' sets is_gk to 1 (otherwise 0), and
# every missing position rating is then filled with 0.
positions = [
    'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM',
    'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM',
    'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB',
]

# Build the flag first, while 'LS' still contains its missing values.
data['is_gk'] = data['LS'].isna().astype(float)
data[positions] = data[positions].fillna(0)

In [36]:
# Separate the target (Value) from the feature columns. `data` itself is
# left untouched, so this cell can be re-run without a KeyError and X is an
# independent frame rather than an alias of `data`.
y = data['Value']
X = data.drop(columns='Value')

In [37]:
# Hold out 33% of the rows as a test set; random_state fixes the shuffle so
# the split is reproducible. Then preview the training features.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=42)
X_train.head()

Unnamed: 0,Age,Nationality,Overall,Potential,Club,Wage,Special,Preferred Foot,International Reputation,Weak Foot,...,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause,Loaned Out,Off Work Rate,Def Work Rate,no_rlse_clause,is_gk
2782,32,138,73,73,366,8000.0,1875,1,1.0,4.0,...,8.0,6.0,11.0,10.0,5000000.0,0,2,2,0.0,0.0
10292,26,29,65,66,600,1000.0,1792,0,1.0,3.0,...,9.0,12.0,7.0,10.0,891000.0,0,0,2,0.0,0.0
6715,22,156,68,77,472,3000.0,1675,1,1.0,4.0,...,14.0,10.0,8.0,13.0,2400000.0,0,0,2,0.0,0.0
5908,22,114,69,76,248,2000.0,1115,1,1.0,3.0,...,64.0,64.0,68.0,69.0,1600000.0,0,2,2,0.0,1.0
14610,22,46,61,69,481,1000.0,1632,1,1.0,4.0,...,6.0,9.0,6.0,12.0,634000.0,0,2,2,0.0,0.0


In [38]:
def stepwise_selection(X, y, end_thres=0.05, add_thres = 0.01):
    '''
    Select regression features by bidirectional stepwise OLS.

    Each pass first tries a forward step: every column not yet selected is
    added in turn to an OLS model (with intercept) and the one with the
    smallest p-value is kept if that p-value is below ``add_thres``. It then
    tries a backward step: the model is refit on the selected columns and the
    column with the largest p-value is dropped if it exceeds ``end_thres``.
    The loop stops when a pass neither adds nor drops a column.

    Adapted from
    https://datascience.stackexchange.com/questions/24405/how-to-do-stepwise-regression-using-sklearn

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    y : array-like
        Regression target aligned with the rows of ``X``.
    end_thres : float
        A selected column is removed when its p-value is above this value.
    add_thres : float
        A candidate column is added when its p-value is below this value.
        Should not exceed ``end_thres``; otherwise a column can qualify for
        both adding and removal (the loop stops if that happens).

    Returns
    -------
    list
        Names of the selected columns, in the order they were added.
    '''
    to_include = []
    while True:
        changed = False

        # Forward step. Candidates are visited in column order (not set
        # order) so that ties between p-values are broken deterministically.
        excluded = [col for col in X.columns if col not in to_include]
        best_p_val = 1
        best_feature = None
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[to_include + [new_column]]))).fit()
            p_val = model.pvalues[new_column]
            if p_val < best_p_val:
                best_p_val = p_val
                best_feature = new_column

        just_added = None
        if best_feature is not None and best_p_val < add_thres:
            to_include.append(best_feature)
            just_added = best_feature
            changed = True

        # Backward step. Skipped while nothing is selected, since there is
        # no feature that could be removed.
        if to_include:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[to_include]))).fit()
            # Pick the features' p-values by label so the intercept is
            # excluded regardless of where (or whether) it was inserted.
            pvalues = model.pvalues.loc[to_include]
            if (pvalues > end_thres).any():
                # idxmax returns the column label; argmax is positional.
                worst_feature = pvalues.idxmax()
                to_include.remove(worst_feature)
                if worst_feature == just_added:
                    # The column just added was dropped straight away, so
                    # the next pass would repeat this one forever.
                    break
                changed = True

        if not changed:
            break
    return to_include

In [39]:
# Run stepwise selection on the training split only, using 0.01 as both the
# p-value threshold for adding a feature and the one for removing it, then
# display the selected column names.
variables_to_include = stepwise_selection(X_train, y_train, end_thres=0.01, add_thres = 0.01)
variables_to_include

  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.


['Wage',
 'Release Clause',
 'Loaned Out',
 'no_rlse_clause',
 'International Reputation',
 'Overall',
 'Age',
 'SlidingTackle',
 'Potential',
 'Real Face',
 'FKAccuracy',
 'Position',
 'Stamina',
 'SprintSpeed']

In [40]:
# Fit a LinearRegression on the selected features of the training split.
fit_model = LinearRegression().fit(X_train[variables_to_include], y_train)

In [41]:
# 5-fold cross-validation on the training split. The scorer is the *negated*
# median absolute error, so the mean below is negative; its magnitude is the
# error in the units of Value.
# NOTE(review): the features were selected using all of X_train before
# cross-validating, so this score may be optimistic — confirm whether the
# selection should be repeated inside each fold.
scores = cross_val_score(fit_model, X_train[variables_to_include], 
                         y_train, cv=5, scoring = "neg_median_absolute_error")
np.mean(scores)

-251935.90704614314