# Step04b Transformations

## Import packages

In [26]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer

In [27]:
# Load the pickled training features and target from ../data/processed.
# Context managers close each file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
with open('../data/processed/X_train.pickle', 'rb') as infile1:
    X_train = pickle.load(infile1)

with open('../data/processed/y_train.pickle', 'rb') as infile2:
    y_train = pickle.load(infile2)

In [28]:
# These two columns were removed after the first round of classification-model fitting.
dropped_features = ['pitch_season', 'pitch_game']
X_train2 = X_train.drop(columns=dropped_features)

In [30]:
# Show the feature names that remain in X_train2.
X_train2.columns

Index(['pitch_number', 'stand', 'outs_when_up', 'inning', 'opp_score',
       'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'nats_home1_away0', 'balls_strikes', 'all_runners', 'pitch_bat_gm',
       'total_pitches', 'abs', 'whiffs', 'swings', 'takes', 'k', 'walk',
       'single', 'double', 'triple', 'hr', 'line_drive', 'ground_ball',
       'fly_ball', 'popup', 'rbi', 'sac', 'ba', 'slg', 'iso', 'babip'],
      dtype='object')

## Define Categorical Variables

In [31]:
# Columns passed to the one-hot encoder.
cats = [
    'stand',
    'if_fielding_alignment',
    'of_fielding_alignment',
    'balls_strikes',
    'all_runners',
]

## Define Features that are already standardized

In [32]:
# Columns meant to be carried over without scaling.
# NOTE(review): this list is not excluded when `standard` is built from the
# remaining columns, so 'nats_home1_away0' is also collected there - confirm intent.
formatted = ['nats_home1_away0']

## Define MinMax Numeric Variables

In [33]:
# Columns passed to MinMaxScaler.
minmax = [
    'outs_when_up', 'inning',
    'ba', 'slg', 'iso', 'babip',
    'pitch_number', 'pitch_bat_gm',
]

## Define StandardScaler Numeric Variables

In [35]:
# Every column that is neither categorical nor min-max scaled goes to StandardScaler.
# NOTE(review): `formatted` is not part of this exclusion list, so
# 'nats_home1_away0' ends up in `standard` - confirm that is intended.
non_nums = cats + minmax
standard = [col for col in X_train2.columns if col not in non_nums]

## Fit OHE

In [36]:
# Dense one-hot encoder that drops the first level of every feature and raises
# on categories not seen during fit.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so try the current spelling first and fall back for older installs.
try:
    ohe = OneHotEncoder(drop='first', handle_unknown='error', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop='first', handle_unknown='error', sparse=False)

In [37]:
# Fit the encoder on the categorical columns and encode them in the same call.
cat_features = X_train2[cats]
X_ohe_ar = ohe.fit_transform(cat_features)

In [38]:
# Hand-assigned labels for the 22 one-hot output columns.
# Presumably listed in the encoder's output order, one group per feature in
# `cats` with each feature's first level absent - verify against ohe.categories_.
ohe_cols = (
    ['stand_r1']
    + ['if_standard', 'if_strategic']
    + ['of_strategic']
    # '0_1' ... '3_2': every a_b pair with a in 0-3 and b in 0-2, minus '0_0'
    + [f'{a}_{b}' for a in range(4) for b in range(3)][1:]
    # every fb/sb/tb 0-1 combination, minus the all-zero one
    + [f'fb:{fb}_sb:{sb}_tb:{tb}'
       for fb in (0, 1) for sb in (0, 1) for tb in (0, 1)][1:]
)

In [39]:
# Label the encoded array, then let reset_index() add the default row index
# as an extra 'index' column.
ohe_df = pd.DataFrame(X_ohe_ar, columns=ohe_cols)
ohe_df = ohe_df.reset_index()

## Fit StandardScaler

In [40]:
# StandardScaler with default settings; fitted on the `standard` columns.
ss = StandardScaler()

In [42]:
# Fit the scaler on the `standard` columns and scale them in the same call.
standard_features = X_train2[standard]
X_ss = ss.fit_transform(standard_features)

In [43]:
# Put the scaled values back under their column names; reset_index() adds the
# default row index as an 'index' column.
ss_df = pd.DataFrame(X_ss, columns=standard)
ss_df = ss_df.reset_index()

## Fit MinMax Scaler

In [44]:
# MinMaxScaler with default settings; fitted on the `minmax` columns.
mm = MinMaxScaler()

In [45]:
# Fit the min-max scaler on the `minmax` columns and scale them in the same call.
minmax_features = X_train2[minmax]
X_mm = mm.fit_transform(minmax_features)

In [46]:
# Put the min-max scaled values back under their column names; reset_index()
# adds the default row index as an 'index' column.
mm_df = pd.DataFrame(X_mm, columns=minmax)
mm_df = mm_df.reset_index()

## Create DF for features that are already standardized

In [47]:
# Copy of the `formatted` columns, with X_train2's own index moved into an
# 'index' column.
# NOTE(review): the concat cell in this notebook does not reference
# formatted_df - confirm whether it was meant to be included.
formatted_df = X_train2[formatted].reset_index(drop=False)

## Combine Processed X_train2

In [48]:
# Shapes of the pieces; the first three include the 'index' column added by reset_index().
tuple(piece.shape for piece in (ohe_df, ss_df, mm_df, X_train2[formatted]))

((1939, 23), (1939, 21), (1939, 9), (1939, 1))

In [49]:
# Total column count across the pieces. This still counts the 'index' column
# that each of the three reset frames carries.
sum(piece.shape[1] for piece in (ohe_df, ss_df, mm_df, X_train2[formatted]))

54

In [50]:
# Join the min-max, standard-scaled and one-hot columns side by side, then
# discard the 'index' column each piece carries from reset_index().
# NOTE(review): formatted_df is not part of this concat; 'nats_home1_away0'
# arrives via ss_df (standard-scaled) instead - confirm that is intended.
X_train2_trans = pd.concat([mm_df, ss_df, ohe_df], axis=1).drop(columns='index')
X_train2_trans

Unnamed: 0,outs_when_up,inning,ba,slg,iso,babip,pitch_number,pitch_bat_gm,opp_score,nats_score,...,3_0,3_1,3_2,fb:0_sb:0_tb:1,fb:0_sb:1_tb:0,fb:0_sb:1_tb:1,fb:1_sb:0_tb:0,fb:1_sb:0_tb:1,fb:1_sb:1_tb:0,fb:1_sb:1_tb:1
0,0.0,0.714286,0.604478,0.389552,0.211087,0.307692,0.090909,0.30,-0.011533,0.274116,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.5,0.714286,0.750000,0.450000,0.214286,1.000000,0.090909,0.25,-0.011533,0.274116,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.5,0.142857,0.616548,0.406762,0.228775,0.343137,0.181818,0.10,-0.839770,-0.816163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.428571,0.585799,0.543195,0.441251,0.274510,0.000000,0.25,-0.839770,-0.271023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.5,0.714286,0.679245,0.356604,0.121294,0.313725,0.000000,0.35,2.473179,0.819255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1934,0.0,0.428571,0.585799,0.543195,0.441251,0.274510,0.181818,0.35,-0.839770,-0.271023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1935,0.5,0.142857,0.571942,0.399281,0.243577,0.290850,0.454545,0.25,-0.011533,0.819255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1936,0.5,0.285714,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,-0.011533,-0.816163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1937,1.0,0.285714,0.557432,0.379054,0.222973,0.298077,0.090909,0.30,-0.011533,-0.816163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Process y_train data

In [51]:
# Reshape the target into a single column, the 2-D layout handed to the
# encoder in the next cell. Re-running is harmless: an (n, 1) array reshapes
# to itself.
y_train = np.array(y_train).reshape(-1, 1)
# Show the class counts as the cell's last expression so they are actually
# rendered. Going through a Series keeps this working now that y_train is an
# ndarray (calling .value_counts() on it directly would raise on a re-run).
pd.Series(y_train.ravel()).value_counts()

In [52]:
# Encode the target column with an OrdinalEncoder fitted on the training labels.
# NOTE(review): scikit-learn documents LabelEncoder as the encoder for targets;
# switching would change the shape of y_train_trans, so confirm downstream use first.
o = OrdinalEncoder()
y_train_trans = o.fit_transform(y_train)

## Export data

In [55]:
# Persist the transformed features and encoded target for later steps.
# Context managers flush and close each file even if pickling raises.
with open('../data/processed/X_train2_trans.pickle', 'wb') as pickle_out:
    pickle.dump(X_train2_trans, pickle_out)

with open('../data/processed/y_train_trans.pickle', 'wb') as pickle_out:
    pickle.dump(y_train_trans, pickle_out)