# Step01b: Feature Engineering
In this notebook, we create new features.

## Import packages

In [52]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import pickle
%matplotlib inline

In [53]:
pitches = pd.read_csv('../../data/processed/scherzer/madmax_cleaned_2015_to_2019.csv', 
                      index_col=0)
madmax = pitches.copy()

In [54]:
madmax.columns

Index(['pitch_type', 'game_date', 'batter', 'release_speed', 'zone', 'stand',
       'home_team', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
       'sv_id', 'release_spin_rate', 'pitch_number', 'opp_score', 'nats_score',
       'if_fielding_alignment', 'of_fielding_alignment', 'nats_home1_away0',
       'balls_strikes', 'all_runners'],
      dtype='object')

In [55]:
madmax = madmax[['pitch_type', 'game_date', 'sv_id', 'batter', 'pitch_number', 'release_speed', 'zone', 
                 'stand', 'home_team', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 
                 'release_spin_rate', 'opp_score', 'nats_score','if_fielding_alignment', 
                 'of_fielding_alignment', 'nats_home1_away0', 'balls_strikes', 'all_runners']]

In [56]:
madmax.game_date = pd.to_datetime(madmax.game_date)

In [60]:
madmax = madmax.sort_values(by=['game_date', 'sv_id'])

In [61]:
madmax['pitch'] = 1

In [62]:
madmax

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,release_speed,zone,stand,home_team,on_3b,...,inning,release_spin_rate,opp_score,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners,pitch
16295,FF,2015-04-06,150406_160913,434158,1,91.2,13.0,L,WSH,0,...,1,2338.0,0,0,Strategic,Standard,1,0_0,fb:0_sb:0_tb:0,1
16294,FF,2015-04-06,150406_160925,434158,2,94.2,11.0,L,WSH,0,...,1,2613.0,0,0,Strategic,Standard,1,0_1,fb:0_sb:0_tb:0,1
16293,FF,2015-04-06,150406_160940,434158,3,95.6,11.0,L,WSH,0,...,1,2639.0,0,0,Strategic,Standard,1,0_2,fb:0_sb:0_tb:0,1
16292,CH,2015-04-06,150406_160954,434158,4,87.2,14.0,L,WSH,0,...,1,1765.0,0,0,Strategic,Standard,1,1_2,fb:0_sb:0_tb:0,1
16291,FF,2015-04-06,150406_161010,434158,5,94.7,11.0,L,WSH,0,...,1,2486.0,0,0,Strategic,Standard,1,2_2,fb:0_sb:0_tb:0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,CU,2019-09-24,190925_005705,664068,2,79.1,4.0,R,WSH,0,...,6,2818.0,4,2,Standard,Standard,1,0_1,fb:0_sb:0_tb:0,1
3,SL,2019-09-24,190925_005726,664068,3,84.8,14.0,R,WSH,0,...,6,2295.0,4,2,Standard,Standard,1,0_2,fb:0_sb:0_tb:0,1
2,FF,2019-09-24,190925_005750,664068,4,97.4,8.0,R,WSH,0,...,6,2465.0,4,2,Standard,Standard,1,1_2,fb:0_sb:0_tb:0,1
1,CH,2019-09-24,190925_005823,664068,5,84.4,9.0,R,WSH,0,...,6,1494.0,4,2,Standard,Standard,1,1_2,fb:0_sb:0_tb:0,1


In [63]:
# madmax.info();

# drop rows with not pitch_type, disregard other nulls as these are addressed later on.

In [64]:
madmax = madmax.loc[madmax.pitch_type.isnull() == False]

## Create running total of pitches thrown in the season

In [65]:
madmax['pitch_season'] = madmax.groupby(madmax.game_date.dt.year).pitch.cumsum() - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Create running total of pitches thrown in each game

In [66]:
madmax['pitch_game'] = madmax.groupby(['game_date'])['pitch'].cumsum() - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [67]:
# madmax.loc[madmax.game_date == '2019-03-28']

## Create running total of pitches thrown to each batter per game

In [68]:
madmax['pitch_bat_gm'] = madmax.groupby(['game_date', 'batter'])['pitch'].cumsum() - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [69]:
madmax.pitch_number = madmax.pitch_number - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [70]:
madmax[['pitch_type', 'game_date','batter', 'pitch_number', 'pitch_bat_gm']].loc[madmax.batter == 607043]

Unnamed: 0,pitch_type,game_date,batter,pitch_number,pitch_bat_gm
11238,FF,2016-06-29,607043,0,0
11237,FF,2016-06-29,607043,1,1
11236,CH,2016-06-29,607043,2,2
11235,FC,2016-06-29,607043,3,3
11197,CU,2016-06-29,607043,0,4
...,...,...,...,...,...
417,CU,2019-09-03,607043,1,12
416,FC,2019-09-03,607043,2,13
415,FF,2019-09-03,607043,3,14
414,FC,2019-09-03,607043,4,15


## Reorder columns

In [71]:
madmax = madmax.drop(columns = ['pitch'])

In [72]:
madmax.columns

Index(['pitch_type', 'game_date', 'sv_id', 'batter', 'pitch_number',
       'release_speed', 'zone', 'stand', 'home_team', 'on_3b', 'on_2b',
       'on_1b', 'outs_when_up', 'inning', 'release_spin_rate', 'opp_score',
       'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'nats_home1_away0', 'balls_strikes', 'all_runners', 'pitch_season',
       'pitch_game', 'pitch_bat_gm'],
      dtype='object')

In [73]:
madmax[['pitch_type', 'game_date', 'sv_id', 'batter', 
        'pitch_number', 'pitch_bat_gm', 'pitch_game', 'pitch_season', 
        'release_speed', 'zone', 'stand', 'home_team', 
        'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 
        'release_spin_rate', 'opp_score', 'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
        'nats_home1_away0', 'balls_strikes', 'all_runners']]

Unnamed: 0,pitch_type,game_date,sv_id,batter,pitch_number,pitch_bat_gm,pitch_game,pitch_season,release_speed,zone,...,outs_when_up,inning,release_spin_rate,opp_score,nats_score,if_fielding_alignment,of_fielding_alignment,nats_home1_away0,balls_strikes,all_runners
16295,FF,2015-04-06,150406_160913,434158,0,0,0,0,91.2,13.0,...,0,1,2338.0,0,0,Strategic,Standard,1,0_0,fb:0_sb:0_tb:0
16294,FF,2015-04-06,150406_160925,434158,1,1,1,1,94.2,11.0,...,0,1,2613.0,0,0,Strategic,Standard,1,0_1,fb:0_sb:0_tb:0
16293,FF,2015-04-06,150406_160940,434158,2,2,2,2,95.6,11.0,...,0,1,2639.0,0,0,Strategic,Standard,1,0_2,fb:0_sb:0_tb:0
16292,CH,2015-04-06,150406_160954,434158,3,3,3,3,87.2,14.0,...,0,1,1765.0,0,0,Strategic,Standard,1,1_2,fb:0_sb:0_tb:0
16291,FF,2015-04-06,150406_161010,434158,4,4,4,4,94.7,11.0,...,0,1,2486.0,0,0,Strategic,Standard,1,2_2,fb:0_sb:0_tb:0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,CU,2019-09-24,190925_005705,664068,1,11,96,2765,79.1,4.0,...,2,6,2818.0,4,2,Standard,Standard,1,0_1,fb:0_sb:0_tb:0
3,SL,2019-09-24,190925_005726,664068,2,12,97,2766,84.8,14.0,...,2,6,2295.0,4,2,Standard,Standard,1,0_2,fb:0_sb:0_tb:0
2,FF,2019-09-24,190925_005750,664068,3,13,98,2767,97.4,8.0,...,2,6,2465.0,4,2,Standard,Standard,1,1_2,fb:0_sb:0_tb:0
1,CH,2019-09-24,190925_005823,664068,4,14,99,2768,84.4,9.0,...,2,6,1494.0,4,2,Standard,Standard,1,1_2,fb:0_sb:0_tb:0


In [74]:
outfile = open('../../data/processed/scherzer/scherzer_clean_new_features_2015_to_2019.pickle','wb')
pickle.dump(madmax ,outfile)
outfile.close()