# Step04a_v1: Final feature selection and train, test, split for Classification
## All pitch types in 2019

## Import packages

In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [11]:
infile = open('../data/processed/scherzer/scherzer_with_batters_2019.pickle','rb')
pb = pickle.load(infile)
infile.close()

## Let's review all columns for final decisions of inclusion

In [12]:
pb.columns

Index(['pitch_type', 'game_date_x', 'sv_id', 'batter_id', 'pitch_number',
       'release_speed', 'zone', 'stand', 'home_team', 'on_3b', 'on_2b',
       'on_1b', 'outs_when_up', 'inning', 'release_spin_rate', 'opp_score',
       'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'nats_home1_away0', 'balls_strikes', 'all_runners', 'pitch_season',
       'pitch_game', 'pitch_bat_gm', 'game_date_y', 'shift_date',
       'player_name', 'total_pitches', 'hits', 'abs', 'whiffs', 'swings',
       'takes', 'k', 'walk', 'single', 'double', 'triple', 'hr', 'line_drive',
       'ground_ball', 'fly_ball', 'popup', 'rbi', 'sac', 'ba', 'slg', 'iso',
       'babip'],
      dtype='object')

## Two null values in if and of alignment
Will replace with standard

In [13]:
pb.if_fielding_alignment = pb.if_fielding_alignment.fillna('Standard')
pb.of_fielding_alignment = pb.of_fielding_alignment.fillna('Standard')

### Drop columns that are either duplicative or are not known before the pitch is thrown

In [14]:
pb = pb.drop(columns = ['game_date_x', 'sv_id', 'batter_id', 'home_team', 'release_speed', 'zone', 
                        'on_3b', 'on_2b', 'on_1b', 'release_spin_rate', 'game_date_y', 'shift_date', 
                        'player_name', 'hits'])

In [6]:
X = pb.drop(columns = 'pitch_type')

In [7]:
y = pb.pitch_type

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = .3, 
                                                    random_state = 31, 
                                                    shuffle = True,
                                                    stratify = y)

## Export all types of pitches

In [9]:
filepath = '../data/train_test_split/scherzer/raw/'

pickle_out = open(filepath + 'X_train_2019.pickle', 'wb')
pickle.dump(X_train, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'X_test_2019.pickle', 'wb')
pickle.dump(X_test, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'y_train_2019_all_types.pickle', 'wb')
pickle.dump(y_train, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'y_test_2019_all_types.pickle', 'wb')
pickle.dump(y_test, pickle_out)
pickle_out.close()

## Add a second version of y_train that groups pitch types:
- Fourseam fastball becomes 'fastball'
- Changeup becomes 'changeup'
- All other pitches are grouped as 'movement' pitches

In [51]:
y_train_2c = y_train
y_train_2c.value_counts()

FF    939
SL    401
CH    280
CU    169
FC    150
Name: pitch_type, dtype: int64

In [52]:
type(y_train_2c)

pandas.core.series.Series

In [53]:
y_train_2c = y_train_2c.replace({'FF': 'fastball', 
                                 'CH': 'changeup', 
                                 'SL': 'movement', 
                                 'CU': 'movement', 
                                 'FC': 'movement'})
y_train_2c.value_counts()

fastball    939
movement    720
changeup    280
Name: pitch_type, dtype: int64

In [54]:
y_test_2c = y_test
y_test_2c = y_train_2c.replace({'FF': 'fastball', 
                                 'CH': 'changeup', 
                                 'SL': 'movement', 
                                 'CU': 'movement', 
                                 'FC': 'movement'})
y_test_2c.value_counts()

fastball    939
movement    720
changeup    280
Name: pitch_type, dtype: int64

## Export y data with three cases

In [55]:
filepath = '../data/train_test_split/scherzer/raw/'

pickle_out = open(filepath + 'y_train_2019_3_cases.pickle', 'wb')
pickle.dump(y_train_2c, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'y_test_2019_3_cases.pickle', 'wb')
pickle.dump(y_test_2c, pickle_out)
pickle_out.close()

## Add third version that has two cases (fastball or not fastball)

In [56]:
y_train_3c = y_train
y_train_3c = y_train_3c.replace({'FF': 'fastball', 
                                 'CH': 'other', 
                                 'SL': 'other', 
                                 'CU': 'other', 
                                 'FC': 'other'})
y_train_3c.value_counts()

other       1000
fastball     939
Name: pitch_type, dtype: int64

In [57]:
y_test_3c = y_test
y_test_3c = y_test_3c.replace({'FF': 'fastball', 
                                 'CH': 'other', 
                                 'SL': 'other', 
                                 'CU': 'other', 
                                 'FC': 'other'})
y_test_3c.value_counts()

other       429
fastball    402
Name: pitch_type, dtype: int64

## Export y data with two cases

In [58]:
filepath = '../data/train_test_split/scherzer/raw/'

pickle_out = open(filepath + 'y_train_2019_2_cases.pickle', 'wb')
pickle.dump(y_train_3c, pickle_out)
pickle_out.close()

pickle_out = open(filepath + 'y_test_2019_2_cases.pickle', 'wb')
pickle.dump(y_test_3c, pickle_out)
pickle_out.close()