# Step04a_v1: Final feature selection and train, test, split for Classification
## All pitch types in 2019

## Import packages

In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [11]:
infile = open('../data/processed/scherzer/scherzer_with_batters_2015_to_2019.pickle','rb')
pb = pickle.load(infile)
infile.close()

## Let's review all columns for final decisions of inclusion

In [12]:
pb.columns

Index(['pitch_type', 'game_date_x', 'sv_id', 'batter_id', 'pitch_number',
       'release_speed', 'zone', 'stand', 'home_team', 'on_3b', 'on_2b',
       'on_1b', 'outs_when_up', 'inning', 'release_spin_rate', 'opp_score',
       'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'nats_home1_away0', 'balls_strikes', 'all_runners', 'pitch_season',
       'pitch_game', 'pitch_bat_gm', 'game_date_y', 'shift_date',
       'player_name', 'total_pitches', 'hits', 'abs', 'whiffs', 'swings',
       'takes', 'k', 'walk', 'single', 'double', 'triple', 'hr', 'line_drive',
       'ground_ball', 'fly_ball', 'popup', 'rbi', 'sac', 'ba', 'slg', 'iso',
       'babip'],
      dtype='object')

## Two null values in if and of alignment
Will replace with standard

In [13]:
pb.if_fielding_alignment = pb.if_fielding_alignment.fillna('Standard')
pb.of_fielding_alignment = pb.of_fielding_alignment.fillna('Standard')

### Drop columns that are either duplicative or are not known before the pitch is thrown

In [14]:
pb = pb.drop(columns = ['game_date_x', 'sv_id', 'batter_id', 'home_team', 'release_speed', 'zone', 
                        'on_3b', 'on_2b', 'on_1b', 'release_spin_rate', 'game_date_y', 'shift_date', 
                        'player_name', 'hits'])

In [15]:
X = pb.drop(columns = 'pitch_type')

In [16]:
y = pb.pitch_type

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = .3, 
                                                    random_state = 31, 
                                                    shuffle = True,
                                                    stratify = y)

## Export all types of pitches

In [19]:
pickle_out = open('../data/train_test_split/scherzer/X_train_2015.pickle', 'wb')
pickle.dump(X_train, pickle_out)
pickle_out.close()

pickle_out = open('../data/train_test_split/scherzer/X_test_2015.pickle', 'wb')
pickle.dump(X_test, pickle_out)
pickle_out.close()

pickle_out = open('../data/train_test_split/scherzer/y_train_2015_all_types.pickle', 'wb')
pickle.dump(y_train, pickle_out)
pickle_out.close()

pickle_out = open('../data/train_test_split/scherzer/y_test_2015_all_types.pickle', 'wb')
pickle.dump(y_test, pickle_out)
pickle_out.close()

## Add a second version of y_train that groups pitch types:
- Fourseam fastball becomes 'fastball'
- Changeup becomes 'changeup'
- All other pitches are grouped as 'movement' pitches

In [20]:
y_train_v2 = y_train
y_train_v2.value_counts()

FF    5924
SL    2251
CH    1581
CU     930
FC     633
FT      62
Name: pitch_type, dtype: int64

In [21]:
type(y_train_v2)

pandas.core.series.Series

In [22]:
y_train_v2 = y_train_v2.replace({'FF': 'fastball', 
                                 'CH': 'changeup', 
                                 'SL': 'movement', 
                                 'CU': 'movement', 
                                 'FC': 'movement'})
y_train_v2.value_counts()

fastball    5924
movement    3814
changeup    1581
FT            62
Name: pitch_type, dtype: int64

In [23]:
y_test_v2 = y_test
y_test_v2 = y_train_v2.replace({'FF': 'fastball', 
                                 'CH': 'changeup', 
                                 'SL': 'movement', 
                                 'CU': 'movement', 
                                 'FC': 'movement'})
y_test_v2.value_counts()

fastball    5924
movement    3814
changeup    1581
FT            62
Name: pitch_type, dtype: int64

## Export y data with three cases

In [24]:
pickle_out = open('../data/train_test_split/scherzer/y_train_2015_3_cases.pickle', 'wb')
pickle.dump(y_train_v2, pickle_out)
pickle_out.close()

pickle_out = open('../data/train_test_split/scherzer/y_test_2015_3_cases.pickle', 'wb')
pickle.dump(y_test_v2, pickle_out)
pickle_out.close()

## Add third version that has two cases (fastball or not fastball)

In [25]:
y_train_v3 = y_train
y_train_v3 = y_train_v3.replace({'FF': 'fastball', 
                                 'CH': 'other', 
                                 'SL': 'other', 
                                 'CU': 'other', 
                                 'FC': 'other'})
y_train_v3.value_counts()

fastball    5924
other       5395
FT            62
Name: pitch_type, dtype: int64

In [26]:
y_test_v3 = y_test
y_test_v3 = y_test_v3.replace({'FF': 'fastball', 
                                 'CH': 'other', 
                                 'SL': 'other', 
                                 'CU': 'other', 
                                 'FC': 'other'})
y_test_v3.value_counts()

fastball    2539
other       2312
FT            27
Name: pitch_type, dtype: int64

## Export y data with two cases

In [27]:
pickle_out = open('../data/train_test_split/scherzer/y_train_2015_2_cases.pickle', 'wb')
pickle.dump(y_train_v3, pickle_out)
pickle_out.close()

pickle_out = open('../data/train_test_split/scherzer/y_test_2015_2_cases.pickle', 'wb')
pickle.dump(y_test_v3, pickle_out)
pickle_out.close()