# Step01b: Feature Engineering
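In this notebook, we create new running-count features from the cleaned pitch data: the number of pitches already thrown in the season (`pitch_season`), in the current game (`pitch_game`), and to the current batter in the current game (`pitch_bat_gm`). All counts are zero-based, and `pitch_number` is shifted to be zero-based as well. The resulting frame is saved as a pickle file.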

## Import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import pickle
%matplotlib inline

In [2]:
# Read the cleaned pitch-level CSV; its first column becomes the index.
pitches = pd.read_csv('../../data/processed/madmax_cleaned.csv', index_col=0)

# All feature engineering happens on a separate copy, so `pitches` keeps the
# data exactly as it was loaded.
madmax = pitches.copy(deep=True)

In [3]:
# Inspect the column names of the freshly loaded frame.
madmax.columns

Index(['pitch_type', 'game_date', 'batter', 'release_speed', 'zone', 'stand',
       'home_team', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
       'sv_id', 'release_spin_rate', 'pitch_number', 'opp_score', 'nats_score',
       'if_fielding_alignment', 'of_fielding_alignment', 'nats_home1_away0',
       'balls_strikes', 'all_runners'],
      dtype='object')

In [4]:
# Keep the same columns, but move the identifying fields (pitch type, date,
# sv_id, batter, pitch number) to the front of the frame.
madmax = madmax.loc[:, [
    'pitch_type', 'game_date', 'sv_id', 'batter', 'pitch_number',
    'release_speed', 'zone', 'stand', 'home_team',
    'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
    'release_spin_rate', 'opp_score', 'nats_score',
    'if_fielding_alignment', 'of_fielding_alignment',
    'nats_home1_away0', 'balls_strikes', 'all_runners',
]]

In [5]:
# Order the rows by `sv_id` (ascending); the running totals computed later
# depend on this row order.
madmax = madmax.sort_values(by=['sv_id'], ascending=True)

In [6]:
# Helper column: every row counts as one pitch, so cumulative sums of this
# column give running pitch counts.
madmax = madmax.assign(pitch=1)

In [7]:
# Display the sorted frame, which now ends with the constant helper column `pitch`.
madmax

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,release_speed,zone,stand,home_team,on_3b,...,inning,release_spin_rate,opp_score,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners,pitch
2769,FF,2019-03-28,190328_170717,607043,1,93.7,6.0,L,WSH,0,...,1,2584.0,0,0,Infield shift,Standard,1,0_0,fb:0_sb:0_tb:0,1
2768,FF,2019-03-28,190328_170732,607043,2,94.2,5.0,L,WSH,0,...,1,2545.0,0,0,Infield shift,Standard,1,0_1,fb:0_sb:0_tb:0,1
2767,FF,2019-03-28,190328_170752,607043,3,96.3,5.0,L,WSH,0,...,1,2738.0,0,0,Infield shift,Standard,1,0_2,fb:0_sb:0_tb:0,1
2766,SL,2019-03-28,190328_170825,624413,1,85.6,6.0,R,WSH,0,...,1,2370.0,0,0,Infield shift,Standard,1,0_0,fb:0_sb:0_tb:0,1
2765,FF,2019-03-28,190328_170842,624413,2,95.5,12.0,R,WSH,0,...,1,2632.0,0,0,Infield shift,Standard,1,0_1,fb:0_sb:0_tb:0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,CU,2019-09-24,190925_005705,664068,2,79.1,4.0,R,WSH,0,...,6,2818.0,4,2,Standard,Standard,1,0_1,fb:0_sb:0_tb:0,1
3,SL,2019-09-24,190925_005726,664068,3,84.8,14.0,R,WSH,0,...,6,2295.0,4,2,Standard,Standard,1,0_2,fb:0_sb:0_tb:0,1
2,FF,2019-09-24,190925_005750,664068,4,97.4,8.0,R,WSH,0,...,6,2465.0,4,2,Standard,Standard,1,1_2,fb:0_sb:0_tb:0,1
1,CH,2019-09-24,190925_005823,664068,5,84.4,9.0,R,WSH,0,...,6,1494.0,4,2,Standard,Standard,1,1_2,fb:0_sb:0_tb:0,1


## Create running total of pitches thrown in the season

In [8]:
# Zero-based running count over the whole frame: the number of pitches in
# earlier rows (cumulative sum of the helper column, minus the current pitch).
madmax['pitch_season'] = madmax['pitch'].cumsum().sub(1)

## Create running total of pitches thrown in each game

In [9]:
# Zero-based running count that restarts for every `game_date`.
pitches_by_game = madmax.groupby('game_date')['pitch']
madmax['pitch_game'] = pitches_by_game.cumsum().sub(1)

In [10]:
# madmax.loc[madmax.game_date == '2019-03-28']

## Create running total of pitches thrown to each batter per game

In [11]:
# Zero-based running count that restarts for every (game_date, batter) pair,
# i.e. it keeps counting across a batter's repeated appearances in one game.
pitches_by_game_batter = madmax.groupby(by=['game_date', 'batter'])['pitch']
madmax['pitch_bat_gm'] = pitches_by_game_batter.cumsum().sub(1)

In [12]:
# Shift pitch_number so it starts at 0, like the running-count columns above.
# The value is recomputed from the untouched `pitches` frame (pandas aligns it
# to `madmax` on the shared index) rather than from `madmax` itself, so
# re-running this cell does not subtract 1 a second time.
madmax['pitch_number'] = pitches['pitch_number'] - 1

In [13]:
# Spot-check the new counters on the rows of one batter (id 607043).
is_sample_batter = madmax['batter'] == 607043
madmax[is_sample_batter]

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,release_speed,zone,stand,home_team,on_3b,...,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners,pitch,pitch_season,pitch_game,pitch_bat_gm
2769,FF,2019-03-28,190328_170717,607043,0,93.7,6.0,L,WSH,0,...,0,Infield shift,Standard,1,0_0,fb:0_sb:0_tb:0,1,0,0,0
2768,FF,2019-03-28,190328_170732,607043,1,94.2,5.0,L,WSH,0,...,0,Infield shift,Standard,1,0_1,fb:0_sb:0_tb:0,1,1,1,1
2767,FF,2019-03-28,190328_170752,607043,2,96.3,5.0,L,WSH,0,...,0,Infield shift,Standard,1,0_2,fb:0_sb:0_tb:0,1,2,2,2
2736,FF,2019-03-28,190328_174239,607043,0,96.5,14.0,L,WSH,0,...,0,Infield shift,Standard,1,0_0,fb:0_sb:0_tb:0,1,33,33,3
2735,CU,2019-03-28,190328_174257,607043,1,79.5,11.0,L,WSH,0,...,0,Infield shift,Standard,1,1_0,fb:0_sb:0_tb:0,1,34,34,4
2734,FF,2019-03-28,190328_174315,607043,2,96.3,11.0,L,WSH,0,...,0,Infield shift,Standard,1,2_0,fb:0_sb:0_tb:0,1,35,35,5
2733,FF,2019-03-28,190328_174331,607043,3,94.0,7.0,L,WSH,0,...,0,Infield shift,Standard,1,3_0,fb:0_sb:0_tb:0,1,36,36,6
2732,CH,2019-03-28,190328_174350,607043,4,85.8,7.0,L,WSH,0,...,0,Infield shift,Standard,1,3_1,fb:0_sb:0_tb:0,1,37,37,7
2731,FC,2019-03-28,190328_174416,607043,5,91.8,14.0,L,WSH,0,...,0,Infield shift,Standard,1,3_2,fb:0_sb:0_tb:0,1,38,38,8
2702,CH,2019-03-28,190328_182744,607043,0,83.8,13.0,L,WSH,0,...,0,Infield shift,Standard,1,0_0,fb:0_sb:0_tb:0,1,67,67,9


## Drop the helper column and reorder columns

In [14]:
# The constant helper column was only needed for the cumulative sums; remove it.
madmax = madmax.drop('pitch', axis=1)

In [15]:
# Check the column names after removing the helper column `pitch`.
madmax.columns

Index(['pitch_type', 'game_date', 'sv_id', 'batter', 'pitch_number',
       'release_speed', 'zone', 'stand', 'home_team', 'on_3b', 'on_2b',
       'on_1b', 'outs_when_up', 'inning', 'release_spin_rate', 'opp_score',
       'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'nats_home1_away0', 'balls_strikes', 'all_runners', 'pitch_season',
       'pitch_game', 'pitch_bat_gm'],
      dtype='object')

In [16]:
# Place the pitch counters next to `pitch_number`, ahead of the remaining
# columns. The result is assigned back to `madmax`: without the assignment the
# reordered frame would only be displayed, and the frame saved below would
# keep the previous column order.
madmax = madmax[['pitch_type', 'game_date', 'sv_id', 'batter', 
                 'pitch_number', 'pitch_bat_gm', 'pitch_game', 'pitch_season', 
                 'release_speed', 'zone', 'stand', 'home_team', 
                 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 
                 'release_spin_rate', 'opp_score', 'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
                 'nats_home1_away0', 'balls_strikes', 'all_runners']]
madmax

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,pitch_bat_gm,pitch_game,pitch_season,release_speed,zone,...,outs_when_up,inning,release_spin_rate,opp_score,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners
2769,FF,2019-03-28,190328_170717,607043,0,0,0,0,93.7,6.0,...,0,1,2584.0,0,0,Infield shift,Standard,1,0_0,fb:0_sb:0_tb:0
2768,FF,2019-03-28,190328_170732,607043,1,1,1,1,94.2,5.0,...,0,1,2545.0,0,0,Infield shift,Standard,1,0_1,fb:0_sb:0_tb:0
2767,FF,2019-03-28,190328_170752,607043,2,2,2,2,96.3,5.0,...,0,1,2738.0,0,0,Infield shift,Standard,1,0_2,fb:0_sb:0_tb:0
2766,SL,2019-03-28,190328_170825,624413,0,0,3,3,85.6,6.0,...,1,1,2370.0,0,0,Infield shift,Standard,1,0_0,fb:0_sb:0_tb:0
2765,FF,2019-03-28,190328_170842,624413,1,1,4,4,95.5,12.0,...,1,1,2632.0,0,0,Infield shift,Standard,1,0_1,fb:0_sb:0_tb:0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,CU,2019-09-24,190925_005705,664068,1,11,96,2765,79.1,4.0,...,2,6,2818.0,4,2,Standard,Standard,1,0_1,fb:0_sb:0_tb:0
3,SL,2019-09-24,190925_005726,664068,2,12,97,2766,84.8,14.0,...,2,6,2295.0,4,2,Standard,Standard,1,0_2,fb:0_sb:0_tb:0
2,FF,2019-09-24,190925_005750,664068,3,13,98,2767,97.4,8.0,...,2,6,2465.0,4,2,Standard,Standard,1,1_2,fb:0_sb:0_tb:0
1,CH,2019-09-24,190925_005823,664068,4,14,99,2768,84.4,9.0,...,2,6,1494.0,4,2,Standard,Standard,1,1_2,fb:0_sb:0_tb:0


In [17]:
# Save the engineered frame as a pickle file. The `with` block closes the file
# even if `pickle.dump` raises, which the manual open()/close() pair did not
# guarantee.
with open('../../data/processed/scherzer/scherzer_clean_new_features.pickle', 'wb') as outfile:
    pickle.dump(madmax, outfile)