# Step04a Combined Transformations and Train, Test, Split

## Import packages

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

## Import cleaned data

In [None]:
# Load the cleaned 2019 pitcher/batter data. The context manager closes the
# file handle even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code from the file; only use it on
# trusted, locally produced data.
with open('../data/processed/pitchers/pitchers_with_batters_2019.pickle', 'rb') as infile:
    pb = pickle.load(infile)

## Let's review all columns for final decisions of inclusion

In [None]:
# Display every column name so the keep/drop decisions below can be made.
pb.columns

## Two null values in `if_fielding_alignment` and `of_fielding_alignment`
These will be replaced with 'Standard'.

In [None]:
# Missing infield/outfield alignment entries are filled with 'Standard'.
for alignment_col in ('if_fielding_alignment', 'of_fielding_alignment'):
    pb[alignment_col] = pb[alignment_col].fillna('Standard')

### Drop columns that are either duplicative or are not known before the pitch is thrown

In [None]:
# Columns removed because they either duplicate other fields or are not
# known before the pitch is thrown.
cols_to_drop = [
    'game_date_x', 'sv_id', 'batter_id', 'home_team',
    'release_speed', 'zone', 'on_3b', 'on_2b', 'on_1b',
    'release_spin_rate', 'game_date_y', 'shift_date',
    'player_name', 'hits',
]
pb = pb.drop(columns=cols_to_drop)

In [None]:
# Feature frame: every remaining column except the target.
X = pb.drop('pitch_type', axis=1)

In [None]:
# Target: the pitch type label for each row.
y = pb['pitch_type']

## Define Categorical Variables

In [None]:
# Features that are one-hot encoded further down.
cats = [
    'stand',
    'if_fielding_alignment',
    'of_fielding_alignment',
    'balls_strikes',
    'all_runners',
]

## Define Features that are already standardized

In [None]:
# Features treated as already standardized; no scaler is applied to them.
formatted = [
    'nats_home1_away0',
    'ba',
    'slg',
    'iso',
    'babip',
]

## Define MinMax Numeric Variables

In [None]:
# Every feature that is neither categorical nor pre-standardized is min-max
# scaled; column order follows X.columns.
non_nums = cats + formatted
minmax = [col for col in X.columns if col not in non_nums]

## Fit OHE

In [None]:
# Dense one-hot encoder: the first level of every feature is dropped and
# categories not seen during fit raise an error at transform time.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older releases.
try:
    ohe = OneHotEncoder(drop='first', handle_unknown='error', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop='first', handle_unknown='error', sparse=False)

In [None]:
# Learn the category levels and encode the categorical columns in one pass.
categorical_frame = X[cats]
X_ohe_ar = ohe.fit_transform(categorical_frame)

In [None]:
# Inspect the levels the encoder learned for each categorical feature; the
# hand-written column names in the next cell have to follow this order.
ohe.categories_

In [None]:
# Hand-written names for the encoder's output columns.
# NOTE(review): these must line up with ohe.categories_ minus each feature's
# dropped first level - confirm after any refit on new data.
ohe_cols = [
    # presumably `stand`
    'stand_r1',
    # presumably infield / outfield alignment
    'if_standard', 'if_strategic', 'of_strategic',
    # presumably `balls_strikes` counts
    '0_1', '0_2',
    '1_0', '1_1', '1_2',
    '2_0', '2_1', '2_2',
    '3_0', '3_1', '3_2',
    # presumably `all_runners` base states
    'fb:0_sb:0_tb:1',
    'fb:0_sb:1_tb:0',
    'fb:0_sb:1_tb:1',
    'fb:1_sb:0_tb:0',
    'fb:1_sb:0_tb:1',
    'fb:1_sb:1_tb:0',
    'fb:1_sb:1_tb:1',
]

In [None]:
# Wrap the encoded array in a labelled frame; reset_index() then adds the
# positional index as an 'index' column.
ohe_df = pd.DataFrame(X_ohe_ar, columns=ohe_cols)
ohe_df = ohe_df.reset_index()

## Fit MinMaxScaler

In [None]:
# Min-max scaler for the numeric features; (0, 1) is the library's default
# feature_range, spelled out here for clarity.
mm = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Fit the scaler on the numeric columns and scale them in one pass.
numeric_frame = X[minmax]
X_mm = mm.fit_transform(numeric_frame)

In [None]:
# Wrap the scaled array in a labelled frame; reset_index() then adds the
# positional index as an 'index' column.
mm_df = pd.DataFrame(X_mm, columns=minmax)
mm_df = mm_df.reset_index()

## Create DF for features that are already standardized

In [None]:
# Pull the pre-standardized columns out unchanged; reset_index() moves the
# existing row index into a column.
formatted_df = X.loc[:, formatted].reset_index()

## Combine Processed X

In [None]:
# Compare the (rows, columns) of the three pieces before concatenating them.
ohe_df.shape, mm_df.shape, X[formatted].shape

In [None]:
# Total column count across the three pieces (ohe_df and mm_df each include
# the extra 'index' column added by reset_index()).
ohe_df.shape[1] + mm_df.shape[1] + X[formatted].shape[1]

In [None]:
# Assemble the final feature frame from all three pieces: min-max scaled
# numerics, one-hot encoded categoricals, and the pre-standardized columns.
# The pre-standardized columns must be part of the concat, otherwise they are
# silently missing from every exported train/test set.
# reset_index(drop=True) gives them a positional index so concat lines the
# rows up with mm_df and ohe_df.
X_trans = pd.concat(
    [mm_df, ohe_df, X[formatted].reset_index(drop=True)],
    axis=1,
)
# Remove the helper 'index' columns that reset_index() added to mm_df and ohe_df.
X_trans = X_trans.drop(columns='index')
X_trans

## Process y data where each pitch type is its own class (all classes or ac)

In [None]:
# Turn the target into a single-column 2-D array.
y = np.reshape(np.array(y), (-1, 1))

In [None]:
# Pitch code -> class id, one class per pitch type.
pitch_type_dict = {
    'FF': 0,
    'FC': 1,
    'SL': 2,
    'CU': 3,
    'CH': 4,
}

In [None]:
# Replace each pitch code with its class id, one code per pass; entries
# that match no key are left unchanged.
y_ord_ac = y
for pitch_code, class_id in pitch_type_dict.items():
    y_ord_ac = np.where(y_ord_ac == pitch_code, class_id, y_ord_ac)

In [None]:
# Spot-check the first ten encoded labels against the originals; the trailing
# ';' suppresses the echoed output in Jupyter.
y_ord_ac[:10], y[:10];

## Process y data grouping pitches into three classes (or 3c)

In [None]:
# Pitch code -> class id for the three-class grouping:
# FF is class 0, FC/SL/CU share class 1, CH is class 2.
pitch_3_types = {
    'FF': 0,
    'FC': 1,
    'SL': 1,
    'CU': 1,
    'CH': 2,
}

In [None]:
# Replace each pitch code with its three-class id; the comparison is always
# against the original labels in y, and unmatched entries are left unchanged.
y_ord_3c = y
for pitch_code, class_id in pitch_3_types.items():
    y_ord_3c = np.where(y == pitch_code, class_id, y_ord_3c)

In [None]:
# Spot-check the first ten three-class labels against the originals; the
# trailing ';' suppresses the echoed output in Jupyter.
y_ord_3c[:10], y[:10];

## Process y data grouping pitches into two classes (or 2c)

In [None]:
# Pitch code -> class id for the two-class grouping:
# FF is class 0, every other listed pitch is class 1.
pitch_2_types = {
    'FF': 0,
    'FC': 1,
    'SL': 1,
    'CU': 1,
    'CH': 1,
}

In [None]:
# Replace each pitch code with its two-class id; the comparison is always
# against the original labels in y, and unmatched entries are left unchanged.
# Iterate pitch_2_types itself (not pitch_3_types) so the keys and the values
# always come from the same mapping.
y_ord_2c = y
for pitch_code, class_id in pitch_2_types.items():
    y_ord_2c = np.where(y == pitch_code, class_id, y_ord_2c)

In [None]:
# Spot-check the first ten two-class labels against the originals; the
# trailing ';' suppresses the echoed output in Jupyter.
y_ord_2c[:10], y[:10];

## Train, Test, Split

In [None]:
# All classes flavor
# 70/30 shuffled split with a fixed seed, stratified on the original pitch
# labels in y rather than on the encoded target.
split_opts = dict(test_size=.3, random_state=31, shuffle=True, stratify=y)
X_train_ac, X_test_ac, y_train_ac, y_test_ac = train_test_split(
    X_trans, y_ord_ac, **split_opts
)

In [None]:
# Three classes flavor
# 70/30 shuffled split with a fixed seed, stratified on the original pitch
# labels in y rather than on the grouped target.
split_opts = dict(test_size=.3, random_state=31, shuffle=True, stratify=y)
X_train_3c, X_test_3c, y_train_3c, y_test_3c = train_test_split(
    X_trans, y_ord_3c, **split_opts
)

In [None]:
# Two classes flavor
# 70/30 shuffled split with a fixed seed, stratified on the original pitch
# labels in y rather than on the grouped target.
split_opts = dict(test_size=.3, random_state=31, shuffle=True, stratify=y)
X_train_2c, X_test_2c, y_train_2c, y_test_2c = train_test_split(
    X_trans, y_ord_2c, **split_opts
)

## Export Data

In [None]:
# All classes flavor
filepath = '../data/train_test_split/scherzer/2019_all_classes/'

# Each split is written to '<filepath><name>.pickle'. The context manager
# closes the handle even if pickling fails part-way.
for name, split in (('X_train_ac', X_train_ac),
                    ('X_test_ac', X_test_ac),
                    ('y_train_ac', y_train_ac),
                    ('y_test_ac', y_test_ac)):
    with open(filepath + name + '.pickle', 'wb') as pickle_out:
        pickle.dump(split, pickle_out)

In [None]:
# Three(3) classes flavor
filepath = '../data/train_test_split/scherzer/2019_three_classes/'

# Each split is written to '<filepath><name>.pickle'. The context manager
# closes the handle even if pickling fails part-way.
for name, split in (('X_train_3c', X_train_3c),
                    ('X_test_3c', X_test_3c),
                    ('y_train_3c', y_train_3c),
                    ('y_test_3c', y_test_3c)):
    with open(filepath + name + '.pickle', 'wb') as pickle_out:
        pickle.dump(split, pickle_out)

In [None]:
# Two(2) classes flavor
filepath = '../data/train_test_split/scherzer/2019_two_classes/'

# Each split is written to '<filepath><name>.pickle'. The context manager
# closes the handle even if pickling fails part-way.
for name, split in (('X_train_2c', X_train_2c),
                    ('X_test_2c', X_test_2c),
                    ('y_train_2c', y_train_2c),
                    ('y_test_2c', y_test_2c)):
    with open(filepath + name + '.pickle', 'wb') as pickle_out:
        pickle.dump(split, pickle_out)