# Step01b: Feature Engineering
In this notebook, we create new features: zero-based running pitch counts for the season, for each game, and for each batter within a game.

## Import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import pickle
%matplotlib inline

In [2]:
# Read the cleaned pitch data; the CSV's first column becomes the row index.
pitches = pd.read_csv(
    '../../data/processed/ross/ross_cleaned.csv',
    index_col=0,
)
# All feature work happens on a copy, so `pitches` stays as loaded.
ross = pitches.copy()

In [3]:
# List the columns of the loaded frame before any features are added.
ross.columns

Index(['pitch_type', 'game_date', 'batter', 'release_speed', 'zone', 'stand',
       'home_team', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
       'sv_id', 'release_spin_rate', 'pitch_number', 'opp_score', 'nats_score',
       'if_fielding_alignment', 'of_fielding_alignment', 'nats_home1_away0',
       'balls_strikes', 'all_runners'],
      dtype='object')

In [4]:
# Rearrange the columns so that sv_id, batter and pitch_number sit right after
# game_date; every other column keeps its relative position.
ross = ross.loc[:, [
    'pitch_type', 'game_date', 'sv_id', 'batter', 'pitch_number',
    'release_speed', 'zone', 'stand', 'home_team',
    'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
    'release_spin_rate', 'opp_score', 'nats_score',
    'if_fielding_alignment', 'of_fielding_alignment',
    'nats_home1_away0', 'balls_strikes', 'all_runners',
]]

In [5]:
# Order the rows by sv_id; the running counters computed later depend on row order.
# NOTE(review): sv_id looks like a YYMMDD_HHMMSS timestamp in the displayed rows — confirm.
ross = ross.sort_values(by=['sv_id'])

In [6]:
# Helper column of ones: one row is one pitch, so cumulative sums of it count pitches.
ross = ross.assign(pitch=1)

In [7]:
# Display the sorted frame with the helper `pitch` column appended.
ross

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,release_speed,zone,stand,home_team,on_3b,...,inning,release_spin_rate,opp_score,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners,pitch
1137,FF,2019-04-07,190407_203550,643446,1,95.9,8,L,NYM,0,...,9,2249.0,6,12,Standard,Standard,0,0_0,fb:0_sb:0_tb:0,1
1136,FF,2019-04-07,190407_203609,643446,2,95.3,3,L,NYM,0,...,9,2215.0,6,12,Standard,Standard,0,0_1,fb:0_sb:0_tb:0,1
1135,SL,2019-04-07,190407_203643,643446,3,88.4,14,L,NYM,0,...,9,1906.0,6,12,Standard,Standard,0,0_2,fb:0_sb:0_tb:0,1
1134,FF,2019-04-07,190407_203706,643446,4,94.4,11,L,NYM,0,...,9,2085.0,6,12,Standard,Standard,0,1_2,fb:0_sb:0_tb:0,1
1133,SL,2019-04-07,190407_203726,643446,5,88.2,14,L,NYM,0,...,9,2075.0,6,12,Standard,Standard,0,2_2,fb:0_sb:0_tb:0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,FF,2019-09-29,190929_204035,502273,4,94.7,3,L,WSH,0,...,6,1979.0,1,3,Infield shift,Standard,1,1_2,fb:0_sb:0_tb:0,1
3,SI,2019-09-29,190929_204117,644374,1,93.4,12,R,WSH,0,...,6,1958.0,1,3,Standard,Standard,1,0_0,fb:1_sb:0_tb:0,1
2,SL,2019-09-29,190929_204138,644374,2,86.3,1,R,WSH,0,...,6,1775.0,1,3,Standard,Standard,1,1_0,fb:1_sb:0_tb:0,1
1,SI,2019-09-29,190929_204203,644374,3,92.9,14,R,WSH,0,...,6,1998.0,1,3,Standard,Standard,1,2_0,fb:1_sb:0_tb:0,1


## Create running total of pitches thrown in the season

In [8]:
# Cumulative sum of the all-ones helper column, shifted to start at 0:
# each row gets the number of rows that precede it in the current row order.
ross['pitch_season'] = ross['pitch'].cumsum().sub(1)

## Create running total of pitches thrown in each game

In [9]:
# Same zero-based counter, restarted for every distinct game_date.
ross['pitch_game'] = (
    ross.groupby('game_date')['pitch']
    .cumsum()
    .sub(1)
)

In [10]:
# Disabled spot check: show the rows of a single game_date to eyeball pitch_game.
# NOTE(review): the sorted output above starts on 2019-04-07, so this date
# probably matches no rows — pick a date that exists before re-enabling.
# ross.loc[ross.game_date == '2019-03-28']

## Create running total of pitches thrown to each batter per game

In [11]:
# Zero-based counter restarted for every (game_date, batter) pair, so it keeps
# counting across all of a batter's rows that share one game_date.
ross['pitch_bat_gm'] = (
    ross.groupby(['game_date', 'batter'])['pitch']
    .cumsum()
    .sub(1)
)

In [12]:
# Shift pitch_number down by one so it lines up with the zero-based counters.
# NOTE: this cell is not idempotent — every re-run subtracts another 1.
ross['pitch_number'] = ross['pitch_number'].sub(1)

In [13]:
# Spot-check one batter (id 607043): compare pitch_number with the new counters.
ross[ross['batter'] == 607043]

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,release_speed,zone,stand,home_team,on_3b,...,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners,pitch,pitch_season,pitch_game,pitch_bat_gm
208,FF,2019-09-02,190902_172535,607043,0,94.0,2,L,WSH,0,...,0,Standard,Standard,1,0_0,fb:1_sb:0_tb:0,1,929,18,0
207,FF,2019-09-02,190902_172607,607043,1,94.4,12,L,WSH,0,...,0,Standard,Standard,1,0_1,fb:1_sb:0_tb:0,1,930,19,1
206,FF,2019-09-02,190902_172645,607043,2,94.2,12,L,WSH,0,...,0,Standard,Standard,1,0_2,fb:1_sb:0_tb:0,1,931,20,2
205,SL,2019-09-02,190902_172712,607043,3,87.4,14,L,WSH,0,...,0,Standard,Standard,1,1_2,fb:1_sb:0_tb:0,1,932,21,3
204,FF,2019-09-02,190902_172748,607043,4,93.4,2,L,WSH,0,...,0,Standard,Standard,1,2_2,fb:1_sb:0_tb:0,1,933,22,4
203,CH,2019-09-02,190902_172825,607043,5,89.3,13,L,WSH,0,...,0,Standard,Standard,1,2_2,fb:1_sb:0_tb:0,1,934,23,5
202,SL,2019-09-02,190902_172859,607043,6,88.6,4,L,WSH,0,...,0,Standard,Standard,1,3_2,fb:1_sb:0_tb:0,1,935,24,6
201,SI,2019-09-02,190902_172937,607043,7,94.8,11,L,WSH,0,...,0,Standard,Standard,1,3_2,fb:1_sb:0_tb:0,1,936,25,7
200,CU,2019-09-02,190902_173032,607043,8,80.9,11,L,WSH,0,...,0,Standard,Standard,1,3_2,fb:1_sb:0_tb:0,1,937,26,8
199,FF,2019-09-02,190902_173139,607043,9,94.7,1,L,WSH,0,...,0,Standard,Standard,1,3_2,fb:1_sb:0_tb:0,1,938,27,9


## Reorder columns

In [14]:
# The all-ones helper column has served its purpose; remove it from the frame.
ross = ross.drop(columns='pitch')

In [15]:
# Confirm that `pitch` is gone and the three new counter columns are present.
ross.columns

Index(['pitch_type', 'game_date', 'sv_id', 'batter', 'pitch_number',
       'release_speed', 'zone', 'stand', 'home_team', 'on_3b', 'on_2b',
       'on_1b', 'outs_when_up', 'inning', 'release_spin_rate', 'opp_score',
       'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'nats_home1_away0', 'balls_strikes', 'all_runners', 'pitch_season',
       'pitch_game', 'pitch_bat_gm'],
      dtype='object')

In [16]:
# Move the three new counters next to pitch_number.
# The selection is assigned back to `ross`: a bare selection would only display
# the reordered view and leave the frame (and the pickle written afterwards)
# in the old column order.
ross = ross[[
    'pitch_type', 'game_date', 'sv_id', 'batter',
    'pitch_number', 'pitch_bat_gm', 'pitch_game', 'pitch_season',
    'release_speed', 'zone', 'stand', 'home_team',
    'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
    'release_spin_rate', 'opp_score', 'nats_score',
    'if_fielding_alignment', 'of_fielding_alignment',
    'nats_home1_away0', 'balls_strikes', 'all_runners',
]]
# Keep the frame as the cell's last expression so it is still displayed.
ross

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,pitch_bat_gm,pitch_game,pitch_season,release_speed,zone,...,outs_when_up,inning,release_spin_rate,opp_score,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners
1137,FF,2019-04-07,190407_203550,643446,0,0,0,0,95.9,8,...,0,9,2249.0,6,12,Standard,Standard,0,0_0,fb:0_sb:0_tb:0
1136,FF,2019-04-07,190407_203609,643446,1,1,1,1,95.3,3,...,0,9,2215.0,6,12,Standard,Standard,0,0_1,fb:0_sb:0_tb:0
1135,SL,2019-04-07,190407_203643,643446,2,2,2,2,88.4,14,...,0,9,1906.0,6,12,Standard,Standard,0,0_2,fb:0_sb:0_tb:0
1134,FF,2019-04-07,190407_203706,643446,3,3,3,3,94.4,11,...,0,9,2085.0,6,12,Standard,Standard,0,1_2,fb:0_sb:0_tb:0
1133,SL,2019-04-07,190407_203726,643446,4,4,4,4,88.2,14,...,0,9,2075.0,6,12,Standard,Standard,0,2_2,fb:0_sb:0_tb:0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,FF,2019-09-29,190929_204035,502273,3,11,73,1133,94.7,3,...,2,6,1979.0,1,3,Infield shift,Standard,1,1_2,fb:0_sb:0_tb:0
3,SI,2019-09-29,190929_204117,644374,0,6,74,1134,93.4,12,...,2,6,1958.0,1,3,Standard,Standard,1,0_0,fb:1_sb:0_tb:0
2,SL,2019-09-29,190929_204138,644374,1,7,75,1135,86.3,1,...,2,6,1775.0,1,3,Standard,Standard,1,1_0,fb:1_sb:0_tb:0
1,SI,2019-09-29,190929_204203,644374,2,8,76,1136,92.9,14,...,2,6,1998.0,1,3,Standard,Standard,1,2_0,fb:1_sb:0_tb:0


In [17]:
# Write the engineered frame to disk as a pickle. The context manager closes
# the file handle even if pickle.dump raises part-way through.
with open('../../data/processed/ross/ross_clean_new_features.pickle', 'wb') as outfile:
    pickle.dump(ross, outfile)