# Step01b: Feature Engineering
In this notebook, we create new features: zero-based running pitch counts for the season, for each game, and for each batter within a game.

## Import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import pickle
%matplotlib inline

In [2]:
# Load the cleaned pitch data; the first CSV column becomes the index.
pitches = pd.read_csv(
    '../../data/processed/sanchez/sanchez_cleaned.csv',
    index_col=0,
)

# Work on a copy so `pitches` keeps the data exactly as loaded.
sanchez = pitches.copy()

In [3]:
# List the columns available right after loading.
sanchez.columns

Index(['pitch_type', 'game_date', 'batter', 'release_speed', 'zone', 'stand',
       'home_team', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
       'sv_id', 'release_spin_rate', 'pitch_number', 'opp_score', 'nats_score',
       'if_fielding_alignment', 'of_fielding_alignment', 'nats_home1_away0',
       'balls_strikes', 'all_runners'],
      dtype='object')

In [4]:
# Keep the same 22 columns, but move sv_id, batter and pitch_number up
# so they sit right after game_date.
sanchez = sanchez[[
    'pitch_type', 'game_date', 'sv_id', 'batter', 'pitch_number',
    'release_speed', 'zone', 'stand', 'home_team',
    'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
    'release_spin_rate', 'opp_score', 'nats_score',
    'if_fielding_alignment', 'of_fielding_alignment',
    'nats_home1_away0', 'balls_strikes', 'all_runners',
]]

In [5]:
# Order the rows by sv_id. In the displayed rows sv_id looks like a
# YYMMDD_HHMMSS stamp, so this is presumably chronological order; the
# running counts computed later depend on this row order.
sanchez = sanchez.sort_values('sv_id')

In [6]:
# Helper column: every row counts as one pitch, so cumulative sums over it
# give running pitch totals.
sanchez = sanchez.assign(pitch=1)

In [7]:
# Display the sorted frame, now including the helper 'pitch' column.
sanchez

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,release_speed,zone,stand,home_team,on_3b,...,inning,release_spin_rate,opp_score,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners,pitch
2689,FF,2019-04-03,190403_170525,457705,1,89.1,14.0,R,WSH,0,...,1,2207.0,0,0,Standard,Standard,1,0_0,fb:0_sb:0_tb:0,1
2688,FC,2019-04-03,190403_170539,457705,2,86.2,8.0,R,WSH,0,...,1,2402.0,0,0,Standard,Standard,1,0_1,fb:0_sb:0_tb:0,1
2687,FF,2019-04-03,190403_170602,457705,3,91.2,12.0,R,WSH,0,...,1,2269.0,0,0,Standard,Standard,1,0_2,fb:0_sb:0_tb:0,1
2686,FC,2019-04-03,190403_170619,457705,4,88.5,7.0,R,WSH,0,...,1,2361.0,0,0,Standard,Standard,1,1_2,fb:0_sb:0_tb:0,1
2685,FF,2019-04-03,190403_170648,516416,1,91.6,9.0,R,WSH,0,...,1,2241.0,0,0,Standard,Standard,1,0_0,fb:0_sb:0_tb:0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,FF,2019-09-25,190926_004925,595284,2,88.6,7.0,L,WSH,0,...,7,2324.0,2,1,Infield shift,Standard,1,0_1,fb:1_sb:0_tb:0,1
3,CH,2019-09-25,190926_004953,595284,3,71.7,13.0,L,WSH,0,...,7,,2,1,Infield shift,Standard,1,0_2,fb:0_sb:1_tb:0,1
2,FS,2019-09-25,190926_005029,595284,4,83.7,13.0,L,WSH,0,...,7,1736.0,2,1,Infield shift,Standard,1,1_2,fb:0_sb:1_tb:0,1
1,FC,2019-09-25,190926_005103,595284,5,90.3,11.0,L,WSH,0,...,7,2220.0,2,1,Infield shift,Standard,1,2_2,fb:0_sb:1_tb:0,1


## Create running total of pitches thrown in the season

In [8]:
# Zero-based running count of pitches over the whole frame, in current row order.
sanchez['pitch_season'] = sanchez['pitch'].cumsum().sub(1)

## Create running total of pitches thrown in each game

In [9]:
# Zero-based running count of pitches within each game_date.
sanchez['pitch_game'] = (
    sanchez.groupby('game_date')['pitch']
    .cumsum()
    .sub(1)
)

In [10]:
# sanchez.loc[sanchez.game_date == '2019-03-28']

## Create running total of pitches thrown to each batter per game

In [11]:
# Zero-based running count of pitches per (game_date, batter) pair. The count
# keeps accumulating over every row for that batter on that date.
sanchez['pitch_bat_gm'] = (
    sanchez.groupby(['game_date', 'batter'])['pitch']
    .cumsum()
    .sub(1)
)

In [12]:
# Shift pitch_number down by one so it is zero-based like the other counters.
# NOTE: this subtracts 1 each time the cell runs, so run it only once per kernel.
sanchez['pitch_number'] = sanchez['pitch_number'] - 1

In [13]:
# Spot-check the new counters on the rows for a single batter id.
sanchez.loc[sanchez['batter'].eq(607043)]

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,release_speed,zone,stand,home_team,on_3b,...,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners,pitch,pitch_season,pitch_game,pitch_bat_gm
1928,FF,2019-05-16,190516_173622,607043,0,89.3,6.0,L,WSH,0,...,4,Standard,Standard,1,0_0,fb:1_sb:0_tb:0,1,761,23,0
1927,FF,2019-05-16,190516_173700,607043,1,88.3,12.0,L,WSH,0,...,4,Standard,Standard,1,0_1,fb:1_sb:0_tb:0,1,762,24,1
1926,FF,2019-05-16,190516_173734,607043,2,89.6,6.0,L,WSH,0,...,4,Standard,Standard,1,1_1,fb:1_sb:0_tb:0,1,763,25,2
1925,FF,2019-05-16,190516_173805,607043,3,91.3,5.0,L,WSH,0,...,4,Infield shift,Standard,1,1_2,fb:1_sb:0_tb:0,1,764,26,3
1924,FS,2019-05-16,190516_173842,607043,4,83.7,13.0,L,WSH,0,...,4,Infield shift,Standard,1,1_2,fb:1_sb:0_tb:0,1,765,27,4
1923,FF,2019-05-16,190516_173904,607043,5,89.2,13.0,L,WSH,0,...,4,Infield shift,Standard,1,2_2,fb:1_sb:0_tb:0,1,766,28,5
1922,FS,2019-05-16,190516_173929,607043,6,84.8,4.0,L,WSH,0,...,4,Infield shift,Standard,1,3_2,fb:1_sb:0_tb:0,1,767,29,6
1921,FC,2019-05-16,190516_174007,607043,7,87.8,11.0,L,WSH,0,...,4,Infield shift,Standard,1,3_2,fb:1_sb:0_tb:0,1,768,30,7


## Reorder columns

In [14]:
# The helper 'pitch' column was only needed for the running totals; remove it.
sanchez = sanchez.drop(columns='pitch')

In [15]:
# Check the column list after dropping the helper 'pitch' column.
sanchez.columns

Index(['pitch_type', 'game_date', 'sv_id', 'batter', 'pitch_number',
       'release_speed', 'zone', 'stand', 'home_team', 'on_3b', 'on_2b',
       'on_1b', 'outs_when_up', 'inning', 'release_spin_rate', 'opp_score',
       'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'nats_home1_away0', 'balls_strikes', 'all_runners', 'pitch_season',
       'pitch_game', 'pitch_bat_gm'],
      dtype='object')

In [16]:
# Group the pitch counters (pitch_number, pitch_bat_gm, pitch_game,
# pitch_season) together right after the identifying columns.
# Selecting with a list of names returns a new DataFrame, so the result must
# be assigned back; otherwise the reordering is only displayed and the frame
# pickled at the end of the notebook keeps the old column order.
sanchez = sanchez[[
    'pitch_type', 'game_date', 'sv_id', 'batter',
    'pitch_number', 'pitch_bat_gm', 'pitch_game', 'pitch_season',
    'release_speed', 'zone', 'stand', 'home_team',
    'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
    'release_spin_rate', 'opp_score', 'nats_score',
    'if_fielding_alignment', 'of_fielding_alignment',
    'nats_home1_away0', 'balls_strikes', 'all_runners',
]]

# Show the reordered frame.
sanchez

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,pitch_bat_gm,pitch_game,pitch_season,release_speed,zone,...,outs_when_up,inning,release_spin_rate,opp_score,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners
2689,FF,2019-04-03,190403_170525,457705,0,0,0,0,89.1,14.0,...,0,1,2207.0,0,0,Standard,Standard,1,0_0,fb:0_sb:0_tb:0
2688,FC,2019-04-03,190403_170539,457705,1,1,1,1,86.2,8.0,...,0,1,2402.0,0,0,Standard,Standard,1,0_1,fb:0_sb:0_tb:0
2687,FF,2019-04-03,190403_170602,457705,2,2,2,2,91.2,12.0,...,0,1,2269.0,0,0,Standard,Standard,1,0_2,fb:0_sb:0_tb:0
2686,FC,2019-04-03,190403_170619,457705,3,3,3,3,88.5,7.0,...,0,1,2361.0,0,0,Standard,Standard,1,1_2,fb:0_sb:0_tb:0
2685,FF,2019-04-03,190403_170648,516416,0,0,4,4,91.6,9.0,...,1,1,2241.0,0,0,Standard,Standard,1,0_0,fb:0_sb:0_tb:0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,FF,2019-09-25,190926_004925,595284,1,8,85,2685,88.6,7.0,...,2,7,2324.0,2,1,Infield shift,Standard,1,0_1,fb:1_sb:0_tb:0
3,CH,2019-09-25,190926_004953,595284,2,9,86,2686,71.7,13.0,...,2,7,,2,1,Infield shift,Standard,1,0_2,fb:0_sb:1_tb:0
2,FS,2019-09-25,190926_005029,595284,3,10,87,2687,83.7,13.0,...,2,7,1736.0,2,1,Infield shift,Standard,1,1_2,fb:0_sb:1_tb:0
1,FC,2019-09-25,190926_005103,595284,4,11,88,2688,90.3,11.0,...,2,7,2220.0,2,1,Infield shift,Standard,1,2_2,fb:0_sb:1_tb:0


In [17]:
# Persist the engineered frame for later steps. The `with` block closes the
# file even if pickling raises, which a manual open()/close() pair does not.
with open('../../data/processed/sanchez/sanchez_clean_new_features.pickle', 'wb') as outfile:
    pickle.dump(sanchez, outfile)