# Steb04a Combined Transformations and Train, Test, Split

## Import packages

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

## Import cleaned data

In [2]:
infile = open('../data/processed/scherzer/scherzer_with_batters_2019.pickle','rb')
pb = pickle.load(infile)
infile.close()

## Let's review all columns for final decisions of inclusion

In [3]:
pb.columns

Index(['pitch_type', 'game_date_x', 'sv_id', 'batter_id', 'pitch_number',
       'release_speed', 'zone', 'stand', 'home_team', 'on_3b', 'on_2b',
       'on_1b', 'outs_when_up', 'inning', 'release_spin_rate', 'opp_score',
       'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'nats_home1_away0', 'balls_strikes', 'all_runners', 'pitch_season',
       'pitch_game', 'pitch_bat_gm', 'game_date_y', 'shift_date',
       'player_name', 'total_pitches', 'hits', 'abs', 'whiffs', 'swings',
       'takes', 'k', 'walk', 'single', 'double', 'triple', 'hr', 'line_drive',
       'ground_ball', 'fly_ball', 'popup', 'rbi', 'sac', 'ba', 'slg', 'iso',
       'babip'],
      dtype='object')

## Two null values in if and of alignment
Will replace with standard

In [4]:
pb.if_fielding_alignment = pb.if_fielding_alignment.fillna('Standard')
pb.of_fielding_alignment = pb.of_fielding_alignment.fillna('Standard')

### Drop columns that are either duplicative or are not known before the pitch is thrown

In [5]:
pb = pb.drop(columns = ['game_date_x', 'sv_id', 'batter_id', 'home_team', 'release_speed', 'zone', 
                        'on_3b', 'on_2b', 'on_1b', 'release_spin_rate', 'game_date_y', 'shift_date', 
                        'player_name', 'hits'])

In [6]:
X = pb.drop(columns = 'pitch_type')

In [7]:
y = pb.pitch_type

## Define Categorical Variables

In [8]:
cats = ['stand', 'if_fielding_alignment', 'of_fielding_alignment', 'balls_strikes', 'all_runners']

## Define Features that are already standardized

In [9]:
formatted = ['nats_home1_away0', 'ba', 'slg', 'iso', 'babip']

## Define MinMax Numeric Variables

In [10]:
minmax = []
non_nums = cats + formatted

for c in X.columns:
    if c not in non_nums:
        minmax.append(c)

## Fit OHE

In [11]:
ohe = OneHotEncoder(drop='first', handle_unknown='error', sparse=False)

In [12]:
X_ohe_ar = ohe.fit_transform(X[cats])

In [13]:
ohe_cols = ['stand_r1', 'if_standard', 'if_strategic', 'of_strategic', 
        '0_1', '0_2', '1_0', '1_1', '1_2', '2_0', '2_1', '2_2', '3_0', '3_1', '3_2',
        'fb:0_sb:0_tb:1', 'fb:0_sb:1_tb:0', 'fb:0_sb:1_tb:1', 'fb:1_sb:0_tb:0', 
        'fb:1_sb:0_tb:1', 'fb:1_sb:1_tb:0', 'fb:1_sb:1_tb:1']

In [14]:
ohe_df = pd.DataFrame(data = X_ohe_ar, columns = ohe_cols).reset_index()

## Fit StandardScaler

In [15]:
mm = MinMaxScaler()

In [16]:
X_mm = mm.fit_transform(X[minmax])

In [17]:
mm_df = pd.DataFrame(columns = minmax, data = X_mm).reset_index()

## Create DF for features that are already standardized

In [18]:
formatted_df = X[formatted].reset_index()

## Combine Processed X_test

In [19]:
ohe_df.shape, mm_df.shape, X[formatted].shape

((2770, 23), (2770, 26), (2770, 5))

In [20]:
ohe_df.shape[1] + mm_df.shape[1] + X[formatted].shape[1]

54

In [21]:
# X_trans = pd.DataFrame()
X_trans = pd.concat([mm_df, ohe_df], axis = 1)
X_trans = X_trans.drop(columns = 'index')
X_trans

Unnamed: 0,pitch_number,outs_when_up,inning,opp_score,nats_score,pitch_season,pitch_game,pitch_bat_gm,total_pitches,abs,...,3_0,3_1,3_2,fb:0_sb:0_tb:1,fb:0_sb:1_tb:0,fb:0_sb:1_tb:1,fb:1_sb:0_tb:0,fb:1_sb:0_tb:1,fb:1_sb:1_tb:0,fb:1_sb:1_tb:1
0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.090909,0.0,0.000000,0.0,0.000000,0.000361,0.008403,0.047619,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.181818,0.0,0.000000,0.0,0.000000,0.000722,0.016807,0.095238,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.5,0.000000,0.0,0.000000,0.001083,0.025210,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.090909,0.5,0.000000,0.0,0.000000,0.001445,0.033613,0.047619,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2765,0.090909,1.0,0.714286,1.0,0.166667,0.998555,0.806723,0.523810,0.667940,0.747475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2766,0.181818,1.0,0.714286,1.0,0.166667,0.998917,0.815126,0.571429,0.667940,0.747475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2767,0.272727,1.0,0.714286,1.0,0.166667,0.999278,0.823529,0.619048,0.667940,0.747475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2768,0.363636,1.0,0.714286,1.0,0.166667,0.999639,0.831933,0.666667,0.667940,0.747475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Process y data where each pitch type is its own class (all classes or ac)

In [22]:
y = np.array(y).reshape(-1, 1)

In [23]:
pitch_type_dict = {'FF': 0, 'FC': 1, 'SL': 2, 'CU': 3, 'CH': 4}

In [24]:
y_ord_ac = y

for pitch in pitch_type_dict:
#     print(pitch, pitch_type_dict[pitch])
    y_ord_ac = np.where(y_ord_ac == pitch, pitch_type_dict[pitch], y_ord_ac)

In [25]:
y_ord_ac[:10], y[:10];

## Process y data grouping pitches into three classes (or 3c)

In [26]:
pitch_3_types = {'FF': 0, 'FC': 1, 'SL': 1, 'CU': 1, 'CH':2}

In [27]:
y_ord_3c = y

for pitch in pitch_3_types:
#     print(pitch, pitch_type_dict[pitch])
    y_ord_3c = np.where(y == pitch, pitch_3_types[pitch], y_ord_3c)

In [28]:
y_ord_3c[:10], y[:10];

## Process y data grouping pitches into two classes (or 2c)

In [29]:
pitch_2_types = {'FF': 0, 'FC': 1, 'SL': 1, 'CU': 1, 'CH':1}

In [30]:
y_ord_2c = y

for pitch in pitch_3_types:
#     print(pitch, pitch_type_dict[pitch])
    y_ord_2c = np.where(y == pitch, pitch_2_types[pitch], y_ord_2c)

In [31]:
y_ord_2c[:10], y[:10];

## Train, Test, Split

In [32]:
# All classes flavor
X_train_ac, X_test_ac, y_train_ac, y_test_ac = train_test_split(X_trans,
                                                                y_ord_ac,
                                                                test_size = .3,
                                                                random_state = 31,
                                                                shuffle = True,
                                                                stratify = y)

In [33]:
X_train_3c, X_test_3c, y_train_3c, y_test_3c = train_test_split(X_trans,
                                                                y_ord_3c,
                                                                test_size = .3,
                                                                random_state = 31,
                                                                shuffle = True,
                                                                stratify = y)

In [34]:
X_train_2c, X_test_2c, y_train_2c, y_test_2c = train_test_split(X_trans,
                                                                y_ord_2c,
                                                                test_size = .3,
                                                                random_state = 31,
                                                                shuffle = True,
                                                                stratify = y)

## Export Data

In [35]:
# All classes flavor
filepath = '../data/train_test_split/scherzer/2019_all_classes/'

pickle_out = open(filepath + 'X_train_ac.pickle', 'wb')
pickle.dump(X_train_ac, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'X_test_ac.pickle', 'wb')
pickle.dump(X_test_ac, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'y_train_ac.pickle', 'wb')
pickle.dump(y_train_ac, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'y_test_ac.pickle', 'wb')
pickle.dump(y_test_ac, pickle_out)
pickle_out.close()

In [36]:
# Three(3) classes flavor
filepath = '../data/train_test_split/scherzer/2019_three_classes/'

pickle_out = open(filepath + 'X_train_3c.pickle', 'wb')
pickle.dump(X_train_3c, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'X_test_3c.pickle', 'wb')
pickle.dump(X_test_3c, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'y_train_3c.pickle', 'wb')
pickle.dump(y_train_3c, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'y_test_3c.pickle', 'wb')
pickle.dump(y_test_3c, pickle_out)
pickle_out.close()

In [37]:
# Three(3) classes flavor
filepath = '../data/train_test_split/scherzer/2019_two_classes/'

pickle_out = open(filepath + 'X_train_2c.pickle', 'wb')
pickle.dump(X_train_2c, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'X_test_2c.pickle', 'wb')
pickle.dump(X_test_2c, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'y_train_2c.pickle', 'wb')
pickle.dump(y_train_2c, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'y_test_2c.pickle', 'wb')
pickle.dump(y_test_2c, pickle_out)
pickle_out.close()