In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics

%matplotlib inline

In [8]:
df_gkeeper_defender = pd.read_csv('gkeeper_defender.csv')
df_rest_players = pd.read_csv('rest_players.csv')

#### 1. Feature engineering steps:

1. Feature scaling: a method used to standardize the range of independent variables or features of data.
2. Aggregation: a process in data processing where summary statistics are calculated from data. 
3. One hot coding: a process used to convert categorical data variables so they can be provided to machine learning algorithms to improve predictions. 

In [9]:
#### Dataset 1: df_gkeeper_defender_copy modeling #####

In [17]:
df_gkeeper_defender.head()

Unnamed: 0.1,Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,award,current_value
0,0,Manchester United,Goalkeeper,189.0,32.0,104,0.0,0.0,0.009585,0.0,1.217252,0.335463,9390,42,13,15000000
1,1,Manchester United,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,0.0,1.242331,0.207055,1304,510,1,1500000
2,2,Manchester United,Goalkeeper,188.0,37.0,4,0.0,0.0,0.0,0.0,0.616438,0.924658,292,697,4,600000
3,3,Manchester United,Defender Centre-Back,175.0,25.0,82,0.02809,0.05618,0.224719,0.0,0.0,0.0,6408,175,9,50000000
4,4,Manchester United,Defender Centre-Back,191.0,30.0,63,0.017889,0.017889,0.053667,0.0,0.0,0.0,5031,238,21,40000000


In [18]:
df_rest_players.head()

Unnamed: 0.1,Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,red cards,goals conceded,minutes played,days_injured,award,current_value
0,16,Manchester United,midfield-CentralMidfield,182.0,31.0,55,0.067214,0.313667,0.089619,0.0,0.0,4017,0,10,25000000
1,17,Manchester United,midfield-CentralMidfield,177.0,29.0,74,0.144046,0.086428,0.230474,0.0,0.0,3124,280,8,20000000
2,18,Manchester United,midfield-CentralMidfield,169.0,30.0,92,0.184843,0.221811,0.332717,0.0,0.0,4869,45,11,20000000
3,19,Manchester United,midfield-CentralMidfield,184.0,26.0,31,0.155575,0.0,0.155575,0.0,0.0,1157,378,6,17000000
4,20,Manchester United,midfield-CentralMidfield,181.0,20.0,39,0.129125,0.064562,0.225968,0.0,0.0,2788,0,1,800000


In [19]:
df_gkeeper_defender.drop('Unnamed: 0', axis=1, inplace=True)
df_rest_players.drop('Unnamed: 0', axis=1, inplace=True)

In [10]:
# first do one hot coding:

In [20]:
# dataset 1: df_gkeeper_defender
gkeeper_defender_categorical = ['team', 'position']
gkeeper_defender_numerical = ['height', 'age', 'appearance', 'goals', 'assists', 'yellow cards',
                              'red cards','goals conceded', 'clean sheets', 'minutes played', 'days_injured', 'games_injured',
                             'award', 'current_value']


In [21]:
# dataset 2: rest_players
rest_players_categorical = ['team', 'position']
rest_players_numerical = ['height', 'age', 'appearance', 'goals', 'assists', 'yellow cards', 
                              'red cards','goals conceded', 'minutes played', 'days_injured', 'games_injured',
                             'award', 'current_value']


In [22]:
gkeeper_defender_categorical

['team', 'position']

In [15]:
gkeeper_defender_numerical

['height',
 'age',
 'appearance',
 'goals',
 'assists',
 'yellow cards',
 'red cards',
 'goals conceded',
 'clean sheets',
 'minutes played',
 'days_injured',
 'games_injured',
 'award',
 'current_value']

In [23]:
# get_dummies function: a powerful tool for converting categorical variable(s) into dummy/indicator variables, which is essentially implementing one-hot encoding
df_gkeeper_defender = pd.get_dummies(df_gkeeper_defender, columns=gkeeper_defender_categorical)

In [24]:
# it cretated dummy varieables for categorical feature values 
df_gkeeper_defender.shape

(4698, 390)

In [25]:
# now we are going to correlate our data 
correlation = df_gkeeper_defender.corr()
print(correlation['current_value'].sort_values(ascending=False))

current_value                1.000000
appearance                   0.437273
minutes played               0.436572
award                        0.267146
team_Bayern Munich           0.205238
                               ...   
team_Daegu FC               -0.023182
team_Daejeon Hana Citizen   -0.024394
age                         -0.065629
goals conceded              -0.081208
position_Goalkeeper         -0.103531
Name: current_value, Length: 390, dtype: float64


In [27]:
# threeshold: a specific cutoff value used to determine the strength 
# of the relationship between two variables, 
# as measured by a correlation coefficient. 

threshold = 0.2

# We use `abs()` for absolute value to consider both strong positive and negative correlations
selected_features = correlation[abs(correlation['current_value']) > threshold]['current_value'].index
selected_features

Index(['appearance', 'minutes played', 'award', 'current_value',
       'team_Bayern Munich'],
      dtype='object')

In [None]:
selected_features = ['appearance', 'minutes played', 'award', 'current_value',
       'team_Bayern Munich'] 

In [28]:

df_gkeeper_defender = df_gkeeper_defender[selected_features]
df_gkeeper_defender.head()

Unnamed: 0,appearance,minutes played,award,current_value,team_Bayern Munich
0,104,9390,13,15000000,False
1,15,1304,1,1500000,False
2,4,292,4,600000,False
3,82,6408,9,50000000,False
4,63,5031,21,40000000,False


In [29]:
# preparing the data
x = df_gkeeper_defender.drop(['current_value'], axis=1)
y = df_gkeeper_defender['current_value']

In [31]:
# splitting the data into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                   test_size=0.25,
                                                   shuffle=True,
                                                   random_state=42)
# RS : will ensure that the same rows are selected for training and testing each time the code is executed.

# scaling data
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [32]:
x.shape

(4698, 4)

In [33]:
#### Building the model ####

model = LinearRegression()

In [34]:
#### Training the model ####

## fit the model on the training data
model.fit(x_train_scaled, y_train)

In [35]:
y_pred_gk_dd = model.predict(x_train_scaled)

In [36]:
model.coef_

array([1612004.00536328, 1095397.34477944, 1051092.13089036,
        863075.04269504])

In [39]:
coeff_df_gk_dd = pd.DataFrame(model.coef_,x.columns, columns=['Coefficient'])
coeff_df_gk_dd

Unnamed: 0,Coefficient
appearance,1612004.0
minutes played,1095397.0
award,1051092.0
team_Bayern Munich,863075.0
