# loading in the Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../mats/2023-nwbc-reds.csv')

# Checking data characteristics

In [3]:
df.shape

(49038, 95)

In [4]:
sorted(df.columns)

['at_bat_number',
 'attack_zone',
 'away_score',
 'away_team',
 'ax',
 'ay',
 'az',
 'babip_value',
 'balls',
 'bat_score',
 'batter',
 'batter_name',
 'bb_type',
 'break_angle_deprecated',
 'break_length_deprecated',
 'delta_home_win_exp',
 'delta_run_exp',
 'des',
 'description',
 'effective_speed',
 'estimated_ba_using_speedangle',
 'estimated_woba_using_speedangle',
 'events',
 'fielder_2',
 'fielder_2.1',
 'fielder_3',
 'fielder_4',
 'fielder_5',
 'fielder_6',
 'fielder_7',
 'fielder_8',
 'fielder_9',
 'fld_score',
 'game_date',
 'game_pk',
 'game_type',
 'game_year',
 'hc_x',
 'hc_y',
 'hit_distance_sc',
 'hit_location',
 'home_score',
 'home_team',
 'if_fielding_alignment',
 'inning',
 'inning_topbot',
 'iso_value',
 'launch_angle',
 'launch_speed',
 'launch_speed_angle',
 'of_fielding_alignment',
 'on_1b',
 'on_2b',
 'on_3b',
 'outs_when_up',
 'p_throws',
 'pfx_x',
 'pfx_z',
 'pitch_name',
 'pitch_number',
 'pitch_type',
 'pitcher',
 'pitcher.1',
 'pitcher_name',
 'plate_x',
 '

In [5]:
df.describe()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,batter,pitcher,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,...,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,spin_axis,delta_home_win_exp,delta_run_exp,swing
count,48004.0,48005.0,48005.0,49038.0,49038.0,0.0,0.0,0.0,0.0,48005.0,...,49038.0,49038.0,49038.0,49038.0,49038.0,49038.0,47245.0,49038.0,47990.0,49038.0
mean,89.489322,-0.9373,5.771255,603472.688201,617183.213487,,,,,9.072451,...,2.220604,2.222827,2.22295,2.251988,2.252111,2.222827,175.738724,9.8e-05,0.00237,0.0
std,6.01186,1.871269,0.47003,67092.014911,61535.933227,,,,,4.227762,...,2.540002,2.611473,2.539729,2.626396,2.555069,2.611473,72.048866,0.027262,0.242729,0.0
min,39.1,-4.62,1.14,405395.0,424144.0,,,,,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.697,-1.11,0.0
25%,85.1,-2.22,5.53,571657.0,592716.0,,,,,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,122.0,0.0,-0.066,0.0
50%,90.3,-1.57,5.82,608703.0,641816.0,,,,,11.0,...,1.0,1.0,1.0,2.0,2.0,1.0,204.0,0.0,-0.017,0.0
75%,94.3,-0.63,6.05,663697.0,665665.0,,,,,13.0,...,3.0,3.0,3.0,3.0,3.0,3.0,224.0,0.0,0.033,0.0
max,103.4,4.46,7.53,703715.0,699479.0,,,,,14.0,...,20.0,20.0,15.0,20.0,20.0,20.0,360.0,0.858,3.354,0.0


In [6]:
df['total_score'] = df['home_score'] + df['away_score']
df.head(1)

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,pitcher_name,batter,pitcher,events,description,...,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,attack_zone,swing,batter_name,total_score
0,FF,2022-10-05,93.8,2.39,5.44,"Hughes, Brandon",605244,676714,field_out,hit_into_play,...,15,Standard,Standard,133.0,0.0,-0.083,heart,0,"garcia, aramis",17


In [7]:
df['description'].unique()

array(['hit_into_play', 'ball', 'called_strike', 'swinging_strike',
       'foul', 'blocked_ball', 'swinging_strike_blocked', 'hit_by_pitch',
       'foul_tip', 'foul_bunt', 'missed_bunt', 'bunt_foul_tip',
       'pitchout'], dtype=object)

# feature engineering
creating new columns

In [8]:
strike_events = ['called_strike', 'swinging_strike',
       'foul', 'blocked_ball', 'swinging_strike_blocked', 'hit_by_pitch',
       'foul_tip', 'foul_bunt', 'missed_bunt', 'bunt_foul_tip']

In [9]:
df['is_strike'] = 0
df.loc[df['description'].isin(strike_events), 'is_strike'] = 1

In [10]:
df['is_strike'].value_counts()

0    24800
1    24238
Name: is_strike, dtype: int64

In [11]:
df['is_2_strikes'] = 0
df.loc[df['strikes'] == 2, 'is_2_strikes'] = 1

In [12]:
df['is_high_velo'] = 0
df.loc[df['release_speed'] > 94, 'is_high_velo'] = 1

In [13]:
strike_list =  ['called_strike', 'swinging_strike']
df['strike_heart'] = 0
df.loc[(df['description'].isin(strike_list)) & (df['attack_zone'] == 'heart'), 'strike_heart'] = 1
df.iloc[:, -4:].head(5)

Unnamed: 0,is_strike,is_2_strikes,is_high_velo,strike_heart
0,0,1,0,0
1,0,1,0,0
2,0,1,0,0
3,1,0,0,1
4,1,0,1,1


In [29]:
#converted original to a function, so we can get info on other teams if needed
def team_find(df, team):
    _df = df
    _df[f'is_{team}_pitcher'] = 0
    _df.loc[((_df['home_team'] == f'{team}')
        & (_df['inning_topbot'] == 'Top')) 
        | ((_df['away_team'] == f'{team}') 
        & (_df['inning_topbot'] == 'Bot'))
        , f'is_{team}_pitcher'] = 1
    return _df.loc[df[f'is_{team}_pitcher'] == 1, ['pitcher_name', 'pitch_name','is_strike']].groupby(['pitcher_name', 'pitch_name'], as_index = False).mean()

In [26]:
df['home_team'].unique()

array(['CIN', 'CHC', 'PIT', 'STL', 'MIL', 'WSH', 'PHI', 'NYM', 'MIA',
       'NYY', 'SF', 'AZ', 'BOS', 'TOR', 'CLE', 'COL', 'SD', 'LAD', 'ATL',
       'LAA'], dtype=object)

# Reds pitcher data frame

In [31]:
team_find(df, 'CIN')

Unnamed: 0,pitcher_name,pitch_name,is_strike
0,"Anderson, Chase",4-Seam Fastball,0.437500
1,"Anderson, Chase",Changeup,0.578571
2,"Anderson, Chase",Curveball,0.447368
3,"Anderson, Chase",Cutter,0.483146
4,"Anderson, Chase",Sinker,0.500000
...,...,...,...
141,"Zeuch, T.J.",Slider,0.388060
142,"Zimmer, Kyle",4-Seam Fastball,0.400000
143,"Zimmer, Kyle",Changeup,1.000000
144,"Zimmer, Kyle",Curveball,1.000000
