In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import itertools

In [2]:
# Load the three pickled inputs. The context managers guarantee that each
# file handle is closed even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# come from a trusted source.
with open('../data/processed/batters/2019_batters_date_shifted.pickle', 'rb') as infile:
    batters = pickle.load(infile)

with open('../data/processed/batters/hitter_clusters.pickle', 'rb') as infile:
    clusters = pickle.load(infile)

with open('../data/processed/pitchers/cleaned.pickle', 'rb') as infile:
    pitchers = pickle.load(infile)

## Clean up batters a bit

In [3]:
# Replace every missing value in the batter frame with 0.
batters = batters.fillna(value=0)

## Add batter cluster values

In [4]:
# Cast player ids to strings; the merges further down join on this column.
batters['player_id'] = batters['player_id'].map(str)

In [5]:
# Inner join on player_id: batters without a row in `clusters` are dropped.
batters = pd.merge(
    batters,
    clusters,
    how='inner',
    left_on='player_id',
    right_on='player_id',
)

## Merge batter data with pitcher data

In [6]:
# Cast both id columns to strings so `batter` can be joined against the
# string-typed batter player_id.
for id_col in ('batter', 'pitcher'):
    pitchers[id_col] = pitchers[id_col].map(str)

In [7]:
# Pair each pitch with the batter row whose shift_date equals the pitch's
# game_date; the inner join drops pitches with no matching batter row.
pb = pd.merge(
    pitchers,
    batters,
    how='inner',
    left_on=['game_date', 'batter'],
    right_on=['shift_date', 'player_id'],
)

In [8]:
# Matchup key: pitcher id immediately followed by batter id (no separator).
pb['pitcher_batter'] = pb['pitcher'] + pb['batter']

In [9]:
# Keep the pitcher ids aside; the `pitcher` column is dropped from pb below.
pitcher_ids = pb['pitcher']

## Fix up column names and drop those not needed

In [10]:
# Remove date, name, id and bookkeeping columns that are not model inputs.
pb = pb.drop(
    columns=[
        'game_date_x',
        'game_year',
        'shift_date',
        'player_id',
        'game_date_y',
        'player_name_x',
        'pitcher',
        'player_name_y',
        'batter',
        'pitch_season',
        'total_pitches',
    ]
)

In [11]:
# Summary statistics, one row per column; the trailing ';' hides the output.
pb.describe().transpose();

## Assign columns to transformation types

In [12]:
# Target is the pitch type; every remaining column is a candidate feature.
y = pb['pitch_type']
X = pb.drop(columns='pitch_type')

In [13]:
# Columns that are one-hot encoded (OneHotEncoder)
cats = [
    'balls', 'strikes', 'outs_when_up', 'inning',
    'home_score', 'away_score', 'cluster',
]

# Columns that are rescaled with MinMaxScaler
minmax = [
    'pitch_game', 'pitch_bat_gm', 'pitch_ab',
    'hits', 'abs', 'whiffs', 'swings', 'takes',
    'k', 'walk', 'single', 'double', 'triple', 'hr',
    'line_drive', 'ground_ball', 'fly_ball', 'popup',
    'rbi', 'sac', 'slg', 'iso',
]

# Columns passed through unchanged (treated as already standardized)
standardized = [
    'on_3b', 'on_2b', 'on_1b',
    'stand_0right', 'pitcher_home0', 'p_throws_0right',
    'ba', 'babip',
]

## Fit OHE

In [14]:
# One-hot encode the categorical columns into a dense array, dropping the
# first level of every feature.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old spelling, so try the new name first and fall back.
ohe_kwargs = dict(drop='first', handle_unknown='error')
try:
    ohe = OneHotEncoder(sparse_output=False, **ohe_kwargs)
except TypeError:
    # Older scikit-learn releases only accept `sparse`.
    ohe = OneHotEncoder(sparse=False, **ohe_kwargs)

X_ohe_ar = ohe.fit_transform(X[cats])

In [15]:
# Lookup from the encoder's positional feature prefix ('x0', 'x1', ...) to the
# source column name; kept because later cells may still refer to it.
ohe_dict = {'x{}'.format(position): col for position, col in enumerate(cats)}

# Build the "<column>_<category>" names straight from the fitted encoder
# instead of rewriting the output of get_feature_names(): that method was
# removed in scikit-learn 1.2, and slicing a two-character 'xN' prefix breaks
# once there are more than ten encoded columns.
ohe_cols = []

for col, categories, dropped_idx in zip(cats, ohe.categories_, ohe.drop_idx_):
    for idx, category in enumerate(categories):
        # Skip the level removed by drop='first'; it has no output column.
        if dropped_idx is not None and idx == dropped_idx:
            continue
        ohe_cols.append('{}_{}'.format(col, category))

In [16]:
# Wrap the encoded array in a frame labelled with the readable column names.
ohe_df = pd.DataFrame(X_ohe_ar, columns=ohe_cols)

## Fit MinMax

In [17]:
# Rescale the numeric columns with MinMaxScaler.
# NOTE(review): the scaler is fit on all rows before the train/test split
# further down, so test rows influence the fitted min/max. Confirm this is
# acceptable for the downstream evaluation.
mm = MinMaxScaler()
mm.fit(X[minmax])
X_mm = mm.transform(X[minmax])
mm_df = pd.DataFrame(X_mm, columns=minmax)

## DF for already standardized data

In [18]:
# Pass-through columns that need no further scaling or encoding.
formatted_df = X.loc[:, standardized]

## Recombine X data

In [19]:
# Row counts must agree across the three pieces before they are recombined.
tuple(frame.shape for frame in (ohe_df, mm_df, formatted_df))

((595412, 71), (595412, 22), (595412, 8))

In [20]:
# Recombine the three feature groups side by side.
# pd.concat(axis=1) aligns rows on the index. mm_df and ohe_df were built from
# bare arrays and carry a fresh 0..n-1 index, while formatted_df inherits X's
# index, so reset it to make the alignment positional rather than relying on
# X happening to have a default index.
X_trans = pd.concat(
    [mm_df, ohe_df, formatted_df.reset_index(drop=True)],
    axis=1,
)
assert len(X_trans) == len(X), 'row count changed while recombining features'
X_trans

Unnamed: 0,pitch_game,pitch_bat_gm,pitch_ab,hits,abs,whiffs,swings,takes,k,walk,...,cluster_2,cluster_3,on_3b,on_2b,on_1b,stand_0right,pitcher_home0,p_throws_0right,ba,babip
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0,0,0,1,0,0,0.000000,0.00
1,0.007692,0.037037,0.066667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0,0,0,1,0,0,0.000000,0.00
2,0.015385,0.074074,0.133333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0,0,0,1,0,0,0.000000,0.00
3,0.253846,0.111111,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0,0,0,1,0,0,0.000000,0.00
4,0.261538,0.148148,0.066667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0,0,0,1,0,0,0.000000,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595407,0.061538,0.222222,0.400000,0.019417,0.023564,0.013187,0.023158,0.024315,0.015873,0.026087,...,0.0,1.0,0,1,1,0,1,0,0.250000,0.25
595408,0.053846,0.000000,0.000000,0.009709,0.008837,0.004396,0.006316,0.008277,0.005291,0.008696,...,0.0,1.0,0,0,0,0,0,0,0.333333,0.40
595409,0.061538,0.037037,0.066667,0.009709,0.008837,0.004396,0.006316,0.008277,0.005291,0.008696,...,0.0,1.0,0,0,0,0,0,0,0.333333,0.40
595410,0.069231,0.074074,0.133333,0.009709,0.008837,0.004396,0.006316,0.008277,0.005291,0.008696,...,0.0,1.0,0,0,0,0,0,0,0.333333,0.40


## Encode y data (FF = 0, CH = 1, all other pitch types = 2)

In [21]:
# Class balance of the raw pitch types (y is still the pitch_type Series here).
y.value_counts()

FF    213059
SL    101181
CH     67135
CU     53940
FT     53141
SI     41952
FC     41409
KC     14055
FS      9540
Name: pitch_type, dtype: int64

In [22]:
# Collapse the pitch types into three integer classes:
# FF -> 0, CH -> 1, every other listed type -> 2.
pitch_type_dict = {'FF': 0, 'CH': 1, 'SL': 2, 'CU': 2, 'FT': 2, 'SI': 2, 'FC': 2, 'KC': 2, 'FS': 2}

# Column vector of the raw pitch-type labels.
y = np.array(y).reshape(-1, 1)

# Map every label in a single pass. The previous np.where loop produced an
# object-dtype array of Python ints and silently left any label missing from
# the dict as a string; mapping and validating gives a clean int64 target.
y_codes = pd.Series(y.ravel()).map(pitch_type_dict)

unmapped_mask = y_codes.isna().to_numpy()
if unmapped_mask.any():
    raise ValueError(
        'Unmapped pitch types in y: {}'.format(pd.unique(y.ravel()[unmapped_mask]).tolist())
    )

y_ord_ac = y_codes.to_numpy(dtype=np.int64).reshape(-1, 1)

## Train/test split

In [23]:
# 70/30 split, shuffled with a fixed seed and stratified on the encoded target
# so both partitions keep the same class proportions.
split_options = dict(
    test_size=.3,
    random_state=31,
    shuffle=True,
    stratify=y_ord_ac,
)
X_train_ac, X_test_ac, y_train_ac, y_test_ac = train_test_split(
    X_trans, y_ord_ac, **split_options
)

In [24]:
# Export train/test data
filepath = '../data/train_test_split/'

# (file name, object) pairs; one loop replaces four copy-pasted stanzas and
# the context manager closes each file even if pickling raises.
split_exports = [
    ('X_train_ac.pickle', X_train_ac),
    ('X_test_ac.pickle', X_test_ac),
    ('y_train_ac.pickle', y_train_ac),
    ('y_test_ac.pickle', y_test_ac),
]

for filename, obj in split_exports:
    with open(filepath + filename, 'wb') as pickle_out:
        pickle.dump(obj, pickle_out)

In [26]:
# Export data for model assessment
filepath = '../data/model/'

# (file name, object) pairs; the context manager closes each file even if
# pickling raises. Note that y_trans.pickle holds the encoded target y_ord_ac.
model_exports = [
    ('X_trans.pickle', X_trans),
    ('y_trans.pickle', y_ord_ac),
    ('pitcher_ids.pickle', pitcher_ids),
]

for filename, obj in model_exports:
    with open(filepath + filename, 'wb') as pickle_out:
        pickle.dump(obj, pickle_out)