# Step04a_v2: Final feature selection and train/test split for LSTM

## Import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [2]:
# Load the processed pitch-by-pitch DataFrame produced by an earlier step.
# The context manager guarantees the file handle is closed even if
# unpickling fails part-way through.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../data/processed/scherzer_with_batters.pickle', 'rb') as infile:
    pb = pickle.load(infile)

## Review all columns to make final inclusion decisions

In [3]:
# List every column currently in pb so each one can be reviewed for inclusion.
pb.columns

Index(['pitch_type', 'game_date_x', 'sv_id', 'batter_id', 'pitch_number',
       'release_speed', 'zone', 'stand', 'home_team', 'on_3b', 'on_2b',
       'on_1b', 'outs_when_up', 'inning', 'release_spin_rate', 'opp_score',
       'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'nats_home1_away0', 'balls_strikes', 'all_runners', 'pitch_season',
       'pitch_game', 'pitch_bat_gm', 'game_date_y', 'shift_date',
       'player_name', 'total_pitches', 'hits', 'abs', 'whiffs', 'swings',
       'takes', 'k', 'walk', 'single', 'double', 'triple', 'hr', 'line_drive',
       'ground_ball', 'fly_ball', 'popup', 'rbi', 'sac', 'ba', 'slg', 'iso',
       'babip'],
      dtype='object')

## Two null values in `if_fielding_alignment` and `of_fielding_alignment`
These will be replaced with `'Standard'`.

In [4]:
# Replace missing fielding-alignment values with the 'Standard' label.
# Bracket access is used so the assignment always targets a DataFrame column.
pb['if_fielding_alignment'] = pb['if_fielding_alignment'].fillna('Standard')
pb['of_fielding_alignment'] = pb['of_fielding_alignment'].fillna('Standard')

## For the LSTM, we need to retain some form of temporal information
- Plan to look at sequences per batter
- Need to find the maximum number of batters faced in a game
- Plan to append fixed second or minute values to the dates

In [None]:
# Maximum number of distinct batters faced in a single game.
# The original cell referenced an undefined name (`columns`) and had no
# aggregation, so it could not run.
# NOTE(review): assumes each game_date_x value identifies exactly one game and
# that distinct batter_id values (not plate appearances) are what is wanted —
# confirm before using this to size LSTM sequences.
pb.groupby(by='game_date_x')['batter_id'].nunique().max()

In [6]:
# Preview the first three rows of pb.
pb.head(3)

Unnamed: 0,pitch_type,game_date_x,sv_id,batter_id,pitch_number,release_speed,zone,stand,home_team,on_3b,...,line_drive,ground_ball,fly_ball,popup,rbi,sac,ba,slg,iso,babip
0,FF,2019-03-28,190328_170717,607043,0,93.7,6.0,L,WSH,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1,FF,2019-03-28,190328_170732,607043,1,94.2,5.0,L,WSH,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
2,FF,2019-03-28,190328_170752,607043,2,96.3,5.0,L,WSH,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


### Drop columns that are either duplicative or are not known before the pitch is thrown

In [5]:
# Columns judged duplicative or not known before the pitch is thrown.
columns_to_drop = [
    'game_date_x',
    'sv_id',
    'batter_id',
    'home_team',
    'release_speed',
    'zone',
    'on_3b',
    'on_2b',
    'on_1b',
    'release_spin_rate',
    'game_date_y',
    'shift_date',
    'player_name',
    'hits',
]

# Rebind pb to a copy without those columns. Re-running this cell on the
# already-reduced frame raises KeyError because the labels are gone.
pb = pb.drop(columns=columns_to_drop)

In [6]:
# Feature matrix: every remaining column except the target, pitch_type.
X = pb.drop(columns=['pitch_type'])

In [7]:
# Target vector: the pitch_type column.
y = pb['pitch_type']

In [8]:
# Hold out 30% of the rows for testing, stratified on y, with a fixed seed so
# the split is reproducible.
# NOTE(review): shuffle=True discards the original row order; the notebook
# targets an LSTM, so confirm that sequence order is not needed downstream.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=31,
    shuffle=True,
    stratify=y,
)

## Export Data

In [9]:
# Persist each split to ../data/processed/<name>.pickle.
# Every file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes (the handles were previously never closed).
# Only the original y_train is exported; no SMOTE-resampled target is created
# in this notebook.
splits_to_export = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}

for split_name, split_data in splits_to_export.items():
    with open(f'../data/processed/{split_name}.pickle', 'wb') as pickle_out:
        pickle.dump(split_data, pickle_out)