# Step01b: Feature Engineering
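In this notebook, we create new features from the cleaned pitch data: running pitch counts for the season, for each game, and for each batter within a game. The result is saved as a pickle file.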

## Import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import pickle
%matplotlib inline

In [2]:
# Read the cleaned pitch-level CSV; its first column is used as the row index.
pitches = pd.read_csv(
    '../../data/processed/strasburg/stras_cleaned.csv',
    index_col=0,
)
# Work on a deep copy so `pitches` keeps the data exactly as loaded.
stras = pitches.copy(deep=True)

In [3]:
# Inspect the column names of the freshly loaded frame.
stras.columns

Index(['pitch_type', 'game_date', 'batter', 'release_speed', 'zone', 'stand',
       'home_team', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
       'sv_id', 'release_spin_rate', 'pitch_number', 'opp_score', 'nats_score',
       'if_fielding_alignment', 'of_fielding_alignment', 'nats_home1_away0',
       'balls_strikes', 'all_runners'],
      dtype='object')

In [4]:
# Keep the same columns but put the identifying fields (`sv_id`, `batter`,
# `pitch_number`) right after `game_date`.
stras = stras.loc[:, [
    'pitch_type', 'game_date', 'sv_id', 'batter', 'pitch_number',
    'release_speed', 'zone', 'stand', 'home_team',
    'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
    'release_spin_rate', 'opp_score', 'nats_score',
    'if_fielding_alignment', 'of_fielding_alignment',
    'nats_home1_away0', 'balls_strikes', 'all_runners',
]]

In [5]:
# Order rows by `sv_id` so the cumulative counters below follow pitch order.
# NOTE(review): assumes every sv_id follows the YYMMDD_HHMMSS pattern seen in
# the preview, so that string order equals chronological order - confirm.
stras = stras.sort_values('sv_id', ascending=True)

In [6]:
# Helper column: every row counts as one pitch, so cumulative sums over it
# give running pitch totals.
stras = stras.assign(pitch=1)

In [7]:
# Display the frame after sorting by `sv_id` and adding the constant `pitch` helper column.
stras

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,release_speed,zone,stand,home_team,on_3b,...,inning,release_spin_rate,opp_score,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners,pitch
3383,FF,2019-03-30,190330_170528,607043,1,93.5,4.0,L,WSH,0,...,1,2082.0,0,0,Infield shift,Standard,1,0_0,fb:0_sb:0_tb:0,1
3382,FF,2019-03-30,190330_170545,607043,2,94.1,2.0,L,WSH,0,...,1,2171.0,0,0,Infield shift,Standard,1,0_1,fb:0_sb:0_tb:0,1
3381,FF,2019-03-30,190330_170605,607043,3,94.0,11.0,L,WSH,0,...,1,2194.0,0,0,Infield shift,Standard,1,0_2,fb:0_sb:0_tb:0,1
3380,CH,2019-03-30,190330_170626,607043,4,88.2,13.0,L,WSH,0,...,1,1874.0,0,0,Infield shift,Standard,1,1_2,fb:0_sb:0_tb:0,1
3379,CU,2019-03-30,190330_170700,624413,1,80.4,3.0,R,WSH,0,...,1,2717.0,0,0,Infield shift,Standard,1,0_0,fb:0_sb:0_tb:0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,FF,2019-09-26,190926_221702,547180,2,94.7,8.0,L,WSH,0,...,6,2143.0,1,5,Standard,Standard,1,1_0,fb:0_sb:1_tb:0,1
3,FT,2019-09-26,190926_221737,547180,3,95.2,4.0,L,WSH,0,...,6,2111.0,1,5,Standard,Standard,1,1_1,fb:0_sb:1_tb:0,1
2,CH,2019-09-26,190926_221816,547180,4,88.6,13.0,L,WSH,0,...,6,1596.0,1,5,Standard,Standard,1,1_2,fb:0_sb:1_tb:0,1
1,CH,2019-09-26,190926_221852,547180,5,89.5,14.0,L,WSH,0,...,6,1623.0,1,5,Standard,Standard,1,2_2,fb:0_sb:1_tb:0,1


## Create running total of pitches thrown in the season

In [8]:
# Zero-based count of pitches thrown so far in the season: running sum of the
# all-ones `pitch` column, shifted down by one.
stras['pitch_season'] = stras['pitch'].cumsum().sub(1)

## Create running total of pitches thrown in each game

In [9]:
# Zero-based count of pitches thrown so far within each `game_date`.
stras['pitch_game'] = (
    stras
    .groupby('game_date')['pitch']
    .cumsum()
    .sub(1)
)

In [10]:
# Disabled spot check: filters the frame down to the rows of a single game date.
# stras.loc[stras.game_date == '2019-03-28']

## Create running total of pitches thrown to each batter per game

In [11]:
# Zero-based count of pitches thrown to each batter within a game. The count
# keeps increasing across that batter's later plate appearances in the same
# game; it does not reset when `pitch_number` does.
stras['pitch_bat_gm'] = (
    stras
    .groupby(['game_date', 'batter'])['pitch']
    .cumsum()
    .sub(1)
)

In [12]:
# Shift `pitch_number` down by one so it starts at 0 like the new counters.
# Caution: this cell is not idempotent - re-running it subtracts one again.
stras['pitch_number'] = stras['pitch_number'].sub(1)

In [13]:
# Spot check: show every pitch thrown to batter 607043 to eyeball the counters.
stras.loc[stras['batter'].eq(607043)]

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,release_speed,zone,stand,home_team,on_3b,...,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners,pitch,pitch_season,pitch_game,pitch_bat_gm
3383,FF,2019-03-30,190330_170528,607043,0,93.5,4.0,L,WSH,0,...,0,Infield shift,Standard,1,0_0,fb:0_sb:0_tb:0,1,0,0,0
3382,FF,2019-03-30,190330_170545,607043,1,94.1,2.0,L,WSH,0,...,0,Infield shift,Standard,1,0_1,fb:0_sb:0_tb:0,1,1,1,1
3381,FF,2019-03-30,190330_170605,607043,2,94.0,11.0,L,WSH,0,...,0,Infield shift,Standard,1,0_2,fb:0_sb:0_tb:0,1,2,2,2
3380,CH,2019-03-30,190330_170626,607043,3,88.2,13.0,L,WSH,0,...,0,Infield shift,Standard,1,1_2,fb:0_sb:0_tb:0,1,3,3,3
3347,CU,2019-03-30,190330_173714,607043,0,79.4,14.0,L,WSH,0,...,1,Standard,Standard,1,0_0,fb:1_sb:0_tb:0,1,36,36,4
3322,FF,2019-03-30,190330_181758,607043,0,92.0,11.0,L,WSH,0,...,3,Infield shift,Standard,1,0_0,fb:0_sb:0_tb:0,1,61,61,5
3321,FF,2019-03-30,190330_181814,607043,1,90.4,11.0,L,WSH,0,...,3,Infield shift,Standard,1,1_0,fb:0_sb:0_tb:0,1,62,62,6
3320,CU,2019-03-30,190330_181836,607043,2,79.2,2.0,L,WSH,0,...,3,Infield shift,Standard,1,2_0,fb:0_sb:0_tb:0,1,63,63,7
3319,FF,2019-03-30,190330_181856,607043,3,93.1,8.0,L,WSH,0,...,3,Infield shift,Standard,1,2_1,fb:0_sb:0_tb:0,1,64,64,8
3318,FF,2019-03-30,190330_181916,607043,4,92.5,12.0,L,WSH,0,...,3,Infield shift,Standard,1,2_2,fb:0_sb:0_tb:0,1,65,65,9


## Reorder columns

In [14]:
# The constant `pitch` helper column has served its purpose; remove it.
stras = stras.drop(columns='pitch')

In [15]:
# Check the column names after dropping the `pitch` helper column.
stras.columns

Index(['pitch_type', 'game_date', 'sv_id', 'batter', 'pitch_number',
       'release_speed', 'zone', 'stand', 'home_team', 'on_3b', 'on_2b',
       'on_1b', 'outs_when_up', 'inning', 'release_spin_rate', 'opp_score',
       'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'nats_home1_away0', 'balls_strikes', 'all_runners', 'pitch_season',
       'pitch_game', 'pitch_bat_gm'],
      dtype='object')

In [16]:
# Move the three new pitch counters next to `pitch_number`.
# The selection is assigned back to `stras`: a bare `stras[[...]]` expression
# only displays a reordered copy, leaving the frame that gets saved below in
# its old column order.
stras = stras[['pitch_type', 'game_date', 'sv_id', 'batter',
               'pitch_number', 'pitch_bat_gm', 'pitch_game', 'pitch_season',
               'release_speed', 'zone', 'stand', 'home_team',
               'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
               'release_spin_rate', 'opp_score', 'nats_score',
               'if_fielding_alignment', 'of_fielding_alignment',
               'nats_home1_away0', 'balls_strikes', 'all_runners']]
# Display the reordered frame.
stras

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,pitch_bat_gm,pitch_game,pitch_season,release_speed,zone,...,outs_when_up,inning,release_spin_rate,opp_score,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners
3383,FF,2019-03-30,190330_170528,607043,0,0,0,0,93.5,4.0,...,0,1,2082.0,0,0,Infield shift,Standard,1,0_0,fb:0_sb:0_tb:0
3382,FF,2019-03-30,190330_170545,607043,1,1,1,1,94.1,2.0,...,0,1,2171.0,0,0,Infield shift,Standard,1,0_1,fb:0_sb:0_tb:0
3381,FF,2019-03-30,190330_170605,607043,2,2,2,2,94.0,11.0,...,0,1,2194.0,0,0,Infield shift,Standard,1,0_2,fb:0_sb:0_tb:0
3380,CH,2019-03-30,190330_170626,607043,3,3,3,3,88.2,13.0,...,0,1,1874.0,0,0,Infield shift,Standard,1,1_2,fb:0_sb:0_tb:0
3379,CU,2019-03-30,190330_170700,624413,0,0,4,4,80.4,3.0,...,1,1,2717.0,0,0,Infield shift,Standard,1,0_0,fb:0_sb:0_tb:0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,FF,2019-09-26,190926_221702,547180,1,14,87,3378,94.7,8.0,...,2,6,2143.0,1,5,Standard,Standard,1,1_0,fb:0_sb:1_tb:0
3,FT,2019-09-26,190926_221737,547180,2,15,88,3379,95.2,4.0,...,2,6,2111.0,1,5,Standard,Standard,1,1_1,fb:0_sb:1_tb:0
2,CH,2019-09-26,190926_221816,547180,3,16,89,3380,88.6,13.0,...,2,6,1596.0,1,5,Standard,Standard,1,1_2,fb:0_sb:1_tb:0
1,CH,2019-09-26,190926_221852,547180,4,17,90,3381,89.5,14.0,...,2,6,1623.0,1,5,Standard,Standard,1,2_2,fb:0_sb:1_tb:0


In [17]:
# Persist the engineered frame as a pickle file.
# The `with` block guarantees the file handle is closed even if
# `pickle.dump` raises part-way through writing.
with open('../../data/processed/strasburg/stras_clean_new_features.pickle', 'wb') as outfile:
    pickle.dump(stras, outfile)