In [1]:
import random

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import median_absolute_error
from sklearn.model_selection import cross_val_score
import warnings
warnings.simplefilter('ignore')

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


## Initial Data

In [2]:
# obtains data, gets the head of the data
data = pd.read_csv("../data/data_cleaned.csv")
data.head()

Unnamed: 0,Age,Nationality,Overall,Potential,Club,Value,Wage,Special,Preferred_Foot,International_Reputation,...,GKReflexes,Release_Clause,League_Country,League_Level_Pyramid,League_Revenue,League_Revenue_Per_Team,Is_Goalkeeper,Loaned_Out,Off_Work_Rate,Def_Work_Rate
0,31,Argentina,94,94,FC Barcelona,110.5,0.565,2202,Left,5.0,...,8.0,226.5,Spain,1.0,4479.0,223.95,0,0,Medium,Medium
1,33,Portugal,94,94,Juventus,77.0,0.405,2228,Right,5.0,...,11.0,127.1,Italy,1.0,2163.0,108.2,0,0,High,Low
2,26,Brazil,92,93,Paris Saint-Germain,118.5,0.29,2143,Right,5.0,...,11.0,228.1,France,1.0,1692.0,84.6,0,0,High,Medium
3,27,Spain,91,93,Manchester United,72.0,0.26,1471,Right,4.0,...,94.0,138.6,England,1.0,6562.0,328.1,1,0,Medium,Medium
4,27,Belgium,91,92,Manchester City,102.0,0.355,2281,Right,4.0,...,13.0,196.4,England,1.0,6562.0,328.1,0,0,High,High


In [3]:
# Converts nationality, club, preferred foot, position, real face, body type, loaded out
# Off_Work_Rate and Def_Work_Rate to numeric categories
cate = ["Nationality", "Club", "Preferred_Foot", "Real_Face", "Position"
        , "Loaned_Out", "Off_Work_Rate", "Def_Work_Rate", "League_Country"]
def cate_to_numeric(cate):
    data[cate] = data[cate].astype('category')
    data[cate] = data[cate].cat.codes

for cat in cate:
    cate_to_numeric(cat)

In [4]:
# Sorts release clause
data['Release_Clause'] = data['Release_Clause'].fillna(0)
data.loc[data.Release_Clause == 0,"has_release_clause"] = 0
data.loc[data.Release_Clause != 0,"has_release_clause"] = 1

In [5]:
# splits into training, testing, and validation data sets
y = data.pop('Value')
X = data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=42)
X_train.head()

Unnamed: 0,Age,Nationality,Overall,Potential,Club,Wage,Special,Preferred_Foot,International_Reputation,Weak_Foot,...,Release_Clause,League_Country,League_Level_Pyramid,League_Revenue,League_Revenue_Per_Team,Is_Goalkeeper,Loaned_Out,Off_Work_Rate,Def_Work_Rate,has_release_clause
2746,32,138,73,73,366,0.008,1875,1,1.0,4.0,...,5.0,19,1.0,509.0,28.0,0,0,2,2,1.0
10127,26,29,65,66,600,0.001,1792,0,1.0,3.0,...,0.891,6,1.0,113.0,6.3,0,0,0,2,1.0
6601,22,156,68,77,472,0.003,1675,1,1.0,4.0,...,2.4,33,1.0,851.0,38.7,0,0,0,2,1.0
5813,22,114,69,76,248,0.002,1115,1,1.0,3.0,...,1.6,21,1.0,152.0,9.5,1,0,2,2,1.0
14349,22,46,61,69,481,0.001,1632,1,1.0,4.0,...,0.634,12,3.0,171.0,7.1,0,0,2,2,1.0


## Stepwise Selection

In [6]:
def stepwise_selection(X, y, end_thres=0.05, add_thres = 0.01):
    '''
    Function for stepwise regression. Code based on 
    https://datascience.stackexchange.com/questions/24405/how-to-do-stepwise-regression-using-sklearn
    while finding sklearn of stepwise regression.
    X - X data
    y - Y data
    end_thres - Threshould for dropping variables from stepwise selection
    add_thres - Threshould for adding variable from stepwise selection
    '''
    to_include = []
    while True:
        changed=False
        excluded = list(set(X.columns)-set(to_include))
        best_p_val = 1
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[to_include+[new_column]]))).fit()
            if model.pvalues[new_column] < best_p_val:
                best_p_val = model.pvalues[new_column]
                best_feature = new_column
        if best_p_val < add_thres:
            to_include.append(best_feature)
            changed=True
            
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[to_include]))).fit()
        pvalues = model.pvalues.iloc[1:]
        if any(pvalues > end_thres):
            changed=True
            worst_feature = pvalues.argmax()
            to_include.remove(worst_feature)
        if not changed:
            break
    return to_include

In [7]:
# Performs stepwise selection, displays variables to include
variables_to_include = stepwise_selection(X_train, y_train, end_thres=0.01, add_thres = 0.01)
variables_to_include

['Release_Clause',
 'Wage',
 'has_release_clause',
 'Overall',
 'International_Reputation',
 'Age',
 'League_Revenue',
 'League_Revenue_Per_Team',
 'SlidingTackle',
 'Potential',
 'Position',
 'Volleys',
 'Loaned_Out',
 'SprintSpeed',
 'Stamina',
 'ShotPower',
 'FKAccuracy',
 'Penalties',
 'Marking']

## Model Fitting and Cross Validation

In [8]:
# Fits model, calculates median absolute error for enture training dataset
fit_model = LinearRegression().fit(X_train[variables_to_include], y_train)
y_pred = fit_model.predict(X_train[variables_to_include])
median_absolute_error(y_train,y_pred)

0.2618244222930952

In [9]:
# Does Cross Validation, uses negative median absolute error
scores = cross_val_score(fit_model, X_train[variables_to_include], 
                         y_train, cv=5, scoring = "neg_median_absolute_error")
print(scores)
print(np.mean(scores))

[-0.27275082 -0.24773677 -0.26500705 -0.26228701 -0.25787282]
-0.2611308931889696


In [10]:
# Calculates median absolute error for test dataset
y_pred = fit_model.predict(X_test[variables_to_include])
median_absolute_error(y_test,y_pred)

0.256069337310371