# Feature Scaling

<hr>

## Goal of Project
- A sports magazine is writing an article on soccer players
- They have a special interest in left-footed players
- Determine whether playing style can predict if a player is left-footed

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance

In [2]:
# Load the soccer table from the local parquet file and preview
# its first five rows.
data = pd.read_parquet(path='./data/soccer.parquet')
data.head(n=5)

Unnamed: 0_level_0,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,44.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [4]:
# Restrict the working set to the first 2000 rows (positional slice).
data = data.iloc[:2000]

In [5]:
# Show the column labels of the loaded frame
data.columns

Index(['player_fifa_api_id', 'player_api_id', 'date', 'overall_rating',
       'potential', 'preferred_foot', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes'],
      dtype='object')

In [6]:
# Number of rows currently held in the frame
data.shape[0]

2000

In [7]:
# Count the missing entries in each column
data.isnull().sum()

player_fifa_api_id      0
player_api_id           0
date                    0
overall_rating          3
potential               3
preferred_foot          3
attacking_work_rate    43
defensive_work_rate     3
crossing                3
finishing               3
heading_accuracy        3
short_passing           3
volleys                28
dribbling               3
curve                  28
free_kick_accuracy      3
long_passing            3
ball_control            3
acceleration            3
sprint_speed            3
agility                28
reactions               3
balance                28
shot_power              3
jumping                28
stamina                 3
strength                3
long_shots              3
aggression              3
interceptions           3
positioning             3
vision                 28
penalties               3
marking                 3
standing_tackle         3
sliding_tackle         28
gk_diving               3
gk_handling             3
gk_kicking  

In [8]:
# Discard every row that has at least one missing value, then report
# how many rows remain.
data = data.dropna(axis=0, how='any')
data.shape[0]

1957

In [9]:
# Inspect the dtype of every column, ordered by dtype
data.dtypes.sort_values()

player_fifa_api_id       int64
player_api_id            int64
shot_power             float64
jumping                float64
stamina                float64
strength               float64
long_shots             float64
aggression             float64
interceptions          float64
positioning            float64
vision                 float64
penalties              float64
marking                float64
standing_tackle        float64
sliding_tackle         float64
gk_diving              float64
gk_handling            float64
gk_kicking             float64
balance                float64
reactions              float64
agility                float64
sprint_speed           float64
overall_rating         float64
potential              float64
crossing               float64
finishing              float64
gk_positioning         float64
short_passing          float64
heading_accuracy       float64
dribbling              float64
curve                  float64
free_kick_accuracy     float64
long_pas

In [None]:
# Features: every numeric column except the two player identifier columns.
X = (
    data
    .select_dtypes(include='number')
    .drop(columns=['player_fifa_api_id', 'player_api_id'])
)
# Target: the preferred-foot label column (encoded to integers further down).
y = data['preferred_foot']

In [None]:
# Preview the first rows of the feature matrix
X.head()

In [None]:
# transform the y

# Encode the target as integers: 1 for 'right', 0 for anything else.
# The labels are read from `data` rather than from `y` itself, so re-running
# this cell yields the same array instead of comparing the already-encoded
# integers to 'right' and silently turning every label into 0.
y = np.where(data['preferred_foot'] == 'right', 1, 0)

In [None]:
# split into train and test set

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Normalize

In [None]:
# Fit the min-max scaler on the training split only, then apply that same
# fitted scaler to both splits so the test set never influences the scaling.
norm = MinMaxScaler()
norm.fit(X_train)

X_train_norm, X_test_norm = (norm.transform(part) for part in (X_train, X_test))

In [None]:
# Summary statistics of the normalized training features. The scaler output
# carries no column labels, so the feature names are restored from X_train
# to keep the table readable.
pd.DataFrame(X_train_norm, columns=X_train.columns).describe()

## Standardize

In [None]:
# Fit the standard scaler on the training split only, then apply that same
# fitted scaler to both splits so the test set never influences the scaling.
stand = StandardScaler()
stand.fit(X_train)

X_train_stand, X_test_stand = (stand.transform(part) for part in (X_train, X_test))

In [None]:
# Summary statistics of the standardized training features, labelled with
# the original feature names (the scaler output carries no column labels).
pd.DataFrame(X_train_stand, columns=X_train.columns).describe()

In [None]:
# Summary statistics of the standardized training features, rounded to two
# decimals and labelled with the original feature names (the scaler output
# carries no column labels).
pd.DataFrame(X_train_stand, columns=X_train.columns).describe().round(2)

## Machine Learning Model

In [None]:
# Train an SVC with default settings on each version of the features
# (original, min-max normalized, standardized) and compare the accuracy
# each one reaches on the held-out test split.
feature_sets = {
    'Original': (X_train, X_test),
    'Normalized': (X_train_norm, X_test_norm),
    'Standardized': (X_train_stand, X_test_stand),
}

score = []
for train, test in feature_sets.values():
    svc = SVC()
    svc.fit(train, y_train)
    y_pred = svc.predict(test)
    score.append(accuracy_score(y_test, y_pred))

# One row per feature representation, in the order they were evaluated.
df_var = pd.DataFrame({'Accuracy score': score}, index=list(feature_sets))
df_var