# Step01b: Feature Engineering
In this notebook, we create new features: zero-based running pitch counts for the season, for each game, and for each batter within a game.

## Import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import pickle
%matplotlib inline

In [2]:
# Load the cleaned pitch-level data; the first CSV column becomes the index.
pitches = pd.read_csv(
    '../../data/processed/corbin/corbin_cleaned.csv',
    index_col=0,
)
# Work on a copy so the frame as loaded stays available unchanged.
corbin = pitches.copy()

In [3]:
# Show the column labels of the freshly loaded frame.
corbin.columns

Index(['pitch_type', 'game_date', 'batter', 'release_speed', 'zone', 'stand',
       'home_team', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
       'sv_id', 'release_spin_rate', 'pitch_number', 'opp_score', 'nats_score',
       'if_fielding_alignment', 'of_fielding_alignment', 'nats_home1_away0',
       'balls_strikes', 'all_runners'],
      dtype='object')

In [4]:
# Keep the same 22 columns, but move the identifiers (sv_id, batter,
# pitch_number) up next to pitch_type and game_date.
leading_columns = ['pitch_type', 'game_date', 'sv_id', 'batter', 'pitch_number']
remaining_columns = [
    'release_speed', 'zone', 'stand', 'home_team',
    'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
    'release_spin_rate', 'opp_score', 'nats_score',
    'if_fielding_alignment', 'of_fielding_alignment',
    'nats_home1_away0', 'balls_strikes', 'all_runners',
]
corbin = corbin[leading_columns + remaining_columns]

In [5]:
# Order the pitches by sv_id. In the rows displayed in this notebook it looks
# like a YYMMDD_HHMMSS stamp, so this presumably yields chronological order.
corbin = corbin.sort_values('sv_id', ascending=True)

In [6]:
# Constant helper column: its cumulative sums give the running pitch counts.
corbin = corbin.assign(pitch=1)

In [7]:
# Display the sorted frame, now including the constant `pitch` helper column.
corbin

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,release_speed,zone,stand,home_team,on_3b,...,inning,release_spin_rate,opp_score,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners,pitch
3298,FF,2019-03-31,190331_173531,607043,1,90.9,13,L,WSH,0,...,1,2164.0,0,0,Standard,Standard,1,0_0,fb:0_sb:0_tb:0,1
3297,FF,2019-03-31,190331_173544,607043,2,90.5,5,L,WSH,0,...,1,2238.0,0,0,Standard,Standard,1,1_0,fb:0_sb:0_tb:0,1
3296,SL,2019-03-31,190331_173559,607043,3,80.3,5,L,WSH,0,...,1,2300.0,0,0,Standard,Standard,1,1_1,fb:0_sb:0_tb:0,1
3295,FF,2019-03-31,190331_173625,607043,4,93.0,7,L,WSH,0,...,1,2241.0,0,0,Standard,Standard,1,1_2,fb:0_sb:0_tb:0,1
3294,FF,2019-03-31,190331_173656,624413,1,92.9,5,R,WSH,0,...,1,2241.0,0,0,Standard,Standard,1,0_0,fb:0_sb:0_tb:0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,FT,2019-09-28,190928_215241,614177,1,93.6,14,R,WSH,0,...,5,2287.0,4,10,Standard,Standard,1,0_0,fb:1_sb:0_tb:0,1
3,FF,2019-09-28,190928_215257,614177,2,93.8,13,R,WSH,0,...,5,2353.0,4,10,Standard,Standard,1,1_0,fb:1_sb:0_tb:0,1
2,CH,2019-09-28,190928_215315,614177,3,84.0,14,R,WSH,0,...,5,1700.0,4,10,Standard,Standard,1,1_1,fb:1_sb:0_tb:0,1
1,SL,2019-09-28,190928_215338,614177,4,83.1,14,R,WSH,0,...,5,2538.0,4,10,Standard,Standard,1,2_1,fb:1_sb:0_tb:0,1


## Create running total of pitches thrown in the season

In [8]:
# Zero-based running count over the whole sorted frame: cumulative sum of the
# constant `pitch` column, minus one so the first pitch is numbered 0.
corbin['pitch_season'] = corbin['pitch'].cumsum().sub(1)

## Create running total of pitches thrown in each game

In [9]:
# Zero-based running count that restarts for every game_date.
corbin['pitch_game'] = (
    corbin.groupby('game_date')['pitch']
    .cumsum()
    .sub(1)
)

In [10]:
# corbin.loc[corbin.game_date == '2019-03-28']

## Create running total of pitches thrown to each batter per game

In [11]:
# Zero-based running count that restarts for every (game_date, batter) pair,
# so it keeps counting across a batter's separate plate appearances in a game.
corbin['pitch_bat_gm'] = (
    corbin.groupby(['game_date', 'batter'])['pitch']
    .cumsum()
    .sub(1)
)

In [12]:
# Shift pitch_number down by one so it starts at 0, matching the zero-based
# running counts computed above.
# NOTE: not idempotent - re-running this cell subtracts 1 again.
corbin['pitch_number'] = corbin['pitch_number'].sub(1)

In [13]:
# Spot-check the new counters on the pitches thrown to a single batter.
corbin.loc[corbin['batter'].eq(607043)]

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,release_speed,zone,stand,home_team,on_3b,...,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners,pitch,pitch_season,pitch_game,pitch_bat_gm
3298,FF,2019-03-31,190331_173531,607043,0,90.9,13,L,WSH,0,...,0,Standard,Standard,1,0_0,fb:0_sb:0_tb:0,1,0,0,0
3297,FF,2019-03-31,190331_173544,607043,1,90.5,5,L,WSH,0,...,0,Standard,Standard,1,1_0,fb:0_sb:0_tb:0,1,1,1,1
3296,SL,2019-03-31,190331_173559,607043,2,80.3,5,L,WSH,0,...,0,Standard,Standard,1,1_1,fb:0_sb:0_tb:0,1,2,2,2
3295,FF,2019-03-31,190331_173625,607043,3,93.0,7,L,WSH,0,...,0,Standard,Standard,1,1_2,fb:0_sb:0_tb:0,1,3,3,3
3269,SL,2019-03-31,190331_181336,607043,0,78.3,13,L,WSH,0,...,0,Standard,Standard,1,0_0,fb:0_sb:0_tb:0,1,29,29,4
3268,FF,2019-03-31,190331_181400,607043,1,91.1,5,L,WSH,0,...,0,Standard,Standard,1,1_0,fb:0_sb:0_tb:0,1,30,30,5
3240,FF,2019-03-31,190331_185053,607043,0,89.6,14,L,WSH,0,...,3,Standard,Standard,1,0_0,fb:0_sb:0_tb:0,1,58,58,6
3239,FF,2019-03-31,190331_185105,607043,1,90.5,7,L,WSH,0,...,3,Standard,Standard,1,1_0,fb:0_sb:0_tb:0,1,59,59,7
3238,FF,2019-03-31,190331_185121,607043,2,89.3,2,L,WSH,0,...,3,Standard,Standard,1,2_0,fb:0_sb:0_tb:0,1,60,60,8
3237,FF,2019-03-31,190331_185149,607043,3,91.0,4,L,WSH,0,...,3,Standard,Standard,1,2_1,fb:0_sb:0_tb:0,1,61,61,9


## Drop the helper column and reorder columns

In [14]:
# The constant helper column is only needed for the cumulative sums; remove it.
corbin = corbin.drop('pitch', axis='columns')

In [15]:
# Show the column labels after dropping the `pitch` helper column.
corbin.columns

Index(['pitch_type', 'game_date', 'sv_id', 'batter', 'pitch_number',
       'release_speed', 'zone', 'stand', 'home_team', 'on_3b', 'on_2b',
       'on_1b', 'outs_when_up', 'inning', 'release_spin_rate', 'opp_score',
       'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'nats_home1_away0', 'balls_strikes', 'all_runners', 'pitch_season',
       'pitch_game', 'pitch_bat_gm'],
      dtype='object')

In [16]:
# Place the three running pitch counts directly after pitch_number.
# The selection is assigned back to `corbin` so the new column order is what
# gets pickled below; a bare selection only displays a reordered copy and
# leaves `corbin` itself in the old order.
corbin = corbin[['pitch_type', 'game_date', 'sv_id', 'batter',
                 'pitch_number', 'pitch_bat_gm', 'pitch_game', 'pitch_season',
                 'release_speed', 'zone', 'stand', 'home_team',
                 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
                 'release_spin_rate', 'opp_score', 'nats_score',
                 'if_fielding_alignment', 'of_fielding_alignment',
                 'nats_home1_away0', 'balls_strikes', 'all_runners']]
# Display the reordered frame (same view the bare selection used to show).
corbin

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,pitch_bat_gm,pitch_game,pitch_season,release_speed,zone,...,outs_when_up,inning,release_spin_rate,opp_score,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners
3298,FF,2019-03-31,190331_173531,607043,0,0,0,0,90.9,13,...,0,1,2164.0,0,0,Standard,Standard,1,0_0,fb:0_sb:0_tb:0
3297,FF,2019-03-31,190331_173544,607043,1,1,1,1,90.5,5,...,0,1,2238.0,0,0,Standard,Standard,1,1_0,fb:0_sb:0_tb:0
3296,SL,2019-03-31,190331_173559,607043,2,2,2,2,80.3,5,...,0,1,2300.0,0,0,Standard,Standard,1,1_1,fb:0_sb:0_tb:0
3295,FF,2019-03-31,190331_173625,607043,3,3,3,3,93.0,7,...,0,1,2241.0,0,0,Standard,Standard,1,1_2,fb:0_sb:0_tb:0
3294,FF,2019-03-31,190331_173656,624413,0,0,4,4,92.9,5,...,1,1,2241.0,0,0,Standard,Standard,1,0_0,fb:0_sb:0_tb:0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,FT,2019-09-28,190928_215241,614177,0,4,89,3294,93.6,14,...,1,5,2287.0,4,10,Standard,Standard,1,0_0,fb:1_sb:0_tb:0
3,FF,2019-09-28,190928_215257,614177,1,5,90,3295,93.8,13,...,1,5,2353.0,4,10,Standard,Standard,1,1_0,fb:1_sb:0_tb:0
2,CH,2019-09-28,190928_215315,614177,2,6,91,3296,84.0,14,...,1,5,1700.0,4,10,Standard,Standard,1,1_1,fb:1_sb:0_tb:0
1,SL,2019-09-28,190928_215338,614177,3,7,92,3297,83.1,14,...,1,5,2538.0,4,10,Standard,Standard,1,2_1,fb:1_sb:0_tb:0


In [17]:
# Persist the engineered frame to disk as a pickle.
# The context manager closes the file even if pickle.dump raises, which the
# manual open()/close() pair did not guarantee.
with open('../../data/processed/corbin/corbin_clean_new_features.pickle', 'wb') as outfile:
    pickle.dump(corbin, outfile)