# Dependencies

In [1]:
import pandas as pd
import numpy as np
import warnings

from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [2]:
warnings.filterwarnings('ignore')

In [3]:
batting_df = pd.read_csv('lahman_data/Batting.csv')
batting_df = batting_df.loc[batting_df.yearID >= 2017]

# Batting 2020 csv adjusted to 162-game season
batting_df2020 = pd.read_csv('lahman_data/Batting2020.csv')

pitching_df = pd.read_csv('lahman_data/Pitching.csv')
pitching_df = pitching_df.loc[pitching_df.yearID >= 2017]

people_df = pd.read_csv('lahman_data/People.csv')
cols = ['playerID', 'birthYear', 'nameFirst', 'nameLast']
people_df = people_df[cols]

In [4]:
batting_df = batting_df.append(batting_df2020, ignore_index=True)

# Remove Pitchers from Batting_df

In [5]:
# Find pitchers in batting

pitchers = list(pitching_df.playerID)
mask = batting_df.loc[batting_df.playerID.isin(pitchers)].index

# Drop pitchers with at bats
batting_df = batting_df.drop(mask, axis=0).reset_index(drop=True)

## Filter to Qualifying Batters

In [6]:
batting_df['PA'] = (batting_df['AB'] + batting_df['BB'] + batting_df['SH'] + 
                       batting_df['SF'] + batting_df['HBP'])

In [7]:
# Qualifying PA = 502
batting_df = batting_df.loc[batting_df.PA >= 502]
batting_df

Unnamed: 0,playerID,yearID,stint,TeamID,lgID,G,AB,R,H,2B,...,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,PA
0,abreujo02,2017,1,CHA,AL,156,621,95,189,43,...,3.0,0.0,35,119.0,6.0,15.0,0.0,4.0,21.0,675.0
17,altuvjo01,2017,1,HOU,AL,153,590,112,204,39,...,32.0,6.0,58,84.0,3.0,9.0,1.0,4.0,19.0,662.0
21,anderti01,2017,1,CHA,AL,146,587,72,151,26,...,15.0,1.0,13,162.0,0.0,3.0,2.0,1.0,13.0,606.0
22,andruel01,2017,1,TEX,AL,158,643,100,191,44,...,25.0,10.0,38,101.0,0.0,3.0,1.0,4.0,18.0,689.0
24,arciaor01,2017,1,MIL,NL,153,506,56,140,17,...,14.0,7.0,36,100.0,9.0,1.0,2.0,3.0,10.0,548.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2287,walkech02,2020,1,ARI,NL,154,589,95,159,49,...,3.0,3.0,51,135.0,0.0,8.0,0.0,8.0,16.0,656.0
2294,whiteev01,2020,1,SEA,AL,146,491,51,86,19,...,3.0,5.0,49,227.0,0.0,3.0,0.0,3.0,8.0,546.0
2302,wongko01,2020,1,STL,NL,143,489,70,130,11,...,14.0,5.0,54,81.0,3.0,11.0,5.0,3.0,3.0,562.0
2304,yastrmi01,2020,1,SFG,NL,146,518,105,154,38,...,5.0,3.0,81,149.0,5.0,8.0,0.0,0.0,5.0,607.0


# Inspect Data

In [8]:
batting_df.head()

Unnamed: 0,playerID,yearID,stint,TeamID,lgID,G,AB,R,H,2B,...,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,PA
0,abreujo02,2017,1,CHA,AL,156,621,95,189,43,...,3.0,0.0,35,119.0,6.0,15.0,0.0,4.0,21.0,675.0
17,altuvjo01,2017,1,HOU,AL,153,590,112,204,39,...,32.0,6.0,58,84.0,3.0,9.0,1.0,4.0,19.0,662.0
21,anderti01,2017,1,CHA,AL,146,587,72,151,26,...,15.0,1.0,13,162.0,0.0,3.0,2.0,1.0,13.0,606.0
22,andruel01,2017,1,TEX,AL,158,643,100,191,44,...,25.0,10.0,38,101.0,0.0,3.0,1.0,4.0,18.0,689.0
24,arciaor01,2017,1,MIL,NL,153,506,56,140,17,...,14.0,7.0,36,100.0,9.0,1.0,2.0,3.0,10.0,548.0


In [9]:
batting_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 515 entries, 0 to 2305
Data columns (total 23 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   playerID  515 non-null    object 
 1   yearID    515 non-null    int64  
 2   stint     515 non-null    int64  
 3   TeamID    515 non-null    object 
 4   lgID      515 non-null    object 
 5   G         515 non-null    int64  
 6   AB        515 non-null    int64  
 7   R         515 non-null    int64  
 8   H         515 non-null    int64  
 9   2B        515 non-null    int64  
 10  3B        515 non-null    int64  
 11  HR        515 non-null    int64  
 12  RBI       515 non-null    float64
 13  SB        515 non-null    float64
 14  CS        515 non-null    float64
 15  BB        515 non-null    int64  
 16  SO        515 non-null    float64
 17  IBB       515 non-null    float64
 18  HBP       515 non-null    float64
 19  SH        515 non-null    float64
 20  SF        515 non-null    float

In [10]:
batting_df.shape

(515, 23)

In [11]:
pitching_df.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
44967,abadfe01,2017,1,BOS,AL,2,1,48,0,0,...,1.0,0,1.0,1,182.0,15,18,0.0,2.0,4.0
44968,adamsau02,2017,1,WAS,NL,0,0,6,0,0,...,0.0,1,1.0,0,29.0,3,4,0.0,1.0,0.0
44969,adlemti01,2017,1,CIN,NL,5,11,30,20,0,...,1.0,1,6.0,1,531.0,4,79,1.0,4.0,8.0
44970,alberan01,2017,1,SEA,AL,5,1,9,6,0,...,0.0,1,2.0,0,178.0,2,22,0.0,1.0,3.0
44971,alberma01,2017,1,WAS,NL,7,2,63,0,0,...,0.0,0,4.0,0,233.0,23,12,0.0,1.0,6.0


In [12]:
batting_df.shape

(515, 23)

In [13]:
batting_df.playerID.nunique()

246

# Determine Additional Stats

In [14]:
# Determine Each Players League Average

avgs = []
slgs = []

for index,row in batting_df.iterrows():
    if row.AB == 0:
        avg = 0
        slg = 0
        avgs.append(avg)
        slgs.append(slg)
    else:
        avg = (row.H)/(row.AB)
        avgs.append(avg)

        slg = ((row.H) + (row['2B']) + (2 * row['3B']) + (3 * row.HR))/(row.AB)
        slgs.append(slg)
    
batting_df['AVG'] = avgs
batting_df['SLG'] = slgs

batting_df.tail()

Unnamed: 0,playerID,yearID,stint,TeamID,lgID,G,AB,R,H,2B,...,BB,SO,IBB,HBP,SH,SF,GIDP,PA,AVG,SLG
2287,walkech02,2020,1,ARI,NL,154,589,95,159,49,...,51,135.0,0.0,8.0,0.0,8.0,16.0,656.0,0.269949,0.460102
2294,whiteev01,2020,1,SEA,AL,146,491,51,86,19,...,49,227.0,0.0,3.0,0.0,3.0,8.0,546.0,0.175153,0.348269
2302,wongko01,2020,1,STL,NL,143,489,70,130,11,...,54,81.0,3.0,11.0,5.0,3.0,3.0,562.0,0.265849,0.327198
2304,yastrmi01,2020,1,SFG,NL,146,518,105,154,38,...,81,149.0,5.0,8.0,0.0,0.0,5.0,607.0,0.297297,0.569498
2305,yelicch01,2020,1,MIL,NL,157,540,105,111,19,...,124,205.0,5.0,3.0,0.0,0.0,11.0,667.0,0.205556,0.42963


# Weight Each Season

In [15]:
def marcelBatTable(df, yr, weight, PA_weight):

    df = pd.DataFrame()

    players = []
    years = []
    ABs = []
    Runs = []
    Hits = []
    Doubles = []
    Triples = []
    HRs = []
    Rbis = []
    SBs = []
    CSs = []
    BBs = []
    SOs = []
    IBBs = []
    HBPs = []
    SHs = []
    SFs = []
    GDPs = []
    AVGs = []
    PAs = []

    for index, row in batting_df.iterrows():
        if row[1] == yr:
            player = (row[0])
            year = (row[1])
            AB = (row[6] * weight)
            R = (row[7] * weight)
            H = (row[8] * weight)
            D = (row[9] * weight)
            T = (row[10] * weight)
            HR = (row[11] * weight)
            RBI = (row[12] * weight)
            SB = (row[13] * weight)
            CS = (row[14] * weight)
            BB = (row[15] * weight)
            SO = (row[16] * weight)
            IBB = (row[17] * weight)
            HBP = (row[18] * weight)
            SH = (row[19] * weight)
            SF = (row[20] * weight)
            GDP = (row[21] * weight)
            AVG = (row[23] * weight)
            if row[1] == yr3:
                PA = (200)
            else:
                PA = (row[22] * PA_weight)

            players.append(player)
            years.append(year)
            ABs.append(AB)
            Runs.append(R)
            Hits.append(H)
            Doubles.append(D)
            Triples.append(T)
            HRs.append(HR)
            Rbis.append(RBI)
            SBs.append(SB)
            CSs.append(CS)
            BBs.append(BB)
            SOs.append(SO)
            IBBs.append(IBB)
            HBPs.append(HBP)
            SHs.append(SH)
            SFs.append(SF)
            GDPs.append(GDP)
            AVGs.append(AVG)
            PAs.append(PA)

    df['playerID'] = players
    df['yearID'] = years
    df['AB'] = ABs
    df['R'] = Runs
    df['H'] = Hits
    df['2B'] = Doubles
    df['3B'] = Triples
    df['HR'] = HRs
    df['RBI'] = Rbis
    df['SB'] = SBs
    df['CS'] = CSs
    df['BB'] = BBs
    df['SO'] = SOs
    df['IBB'] = IBBs
    df['HBP'] = HBPs
    df['SH'] = SHs
    df['SF'] = SFs
    df['GDP'] = GDPs
    df['AVG'] = AVGs
    df['w_PA'] = PAs

    return df

        

In [16]:
df_proj_2018 = pd.DataFrame()
df_proj_2019 = pd.DataFrame()
df_proj_2020 = pd.DataFrame()

yr = 2021
yr1 = 2020
yr2 = 2019
yr3 = 2018

df_proj_yr3 = marcelBatTable(df_proj_2020, 2018, 3, 1)
df_proj_yr2 = marcelBatTable(df_proj_2018, 2019, 4, 0.1)
df_proj_yr1 = marcelBatTable(df_proj_2019, 2020, 5, 0.5)


In [17]:
df_list = [df_proj_yr3, df_proj_yr2, df_proj_yr1]

#total plate appearance
for df in df_list:
    df['t_PA'] = (df['AB'] + df['BB'] + df['SH'] + df['SF'] + df['HBP'])

### ADD Aging Adjustments

In [18]:
def ageAdjustment(df):
    df = df.merge(people_df, how='left', on='playerID')
    df['Age'] = df['yearID'] - df['birthYear']
    
    ageAdjusts = []

    for index, row in df.iterrows():
        if row.Age > 29:
            ageAdj = 1/(1 + ((row.Age)*0.003))
        elif row.Age < 29:
            ageAdj = 1 + ((29 - row.Age) * 0.006)
        else:
            ageAdj = 1

        ageAdjusts.append(ageAdj)

    df['AgeAdj'] = ageAdjusts

    for stat in df[cols]:
        df[stat] = (df['AgeAdj'] * df[stat])
    
    return df
    

In [19]:
cols = ['AB','R','H','2B','3B','HR','RBI','SB','CS','BB','SO','IBB','HBP','SH','SF','GDP','AVG','w_PA','t_PA']

df_proj_yr1 = ageAdjustment(df_proj_yr1)
df_proj_yr2 = ageAdjustment(df_proj_yr2)
df_proj_yr3 = ageAdjustment(df_proj_yr3)

# Expected Home Runs

In [20]:
def getEHR(yr):
    x = batting_df.loc[batting_df.yearID == yr]

    league_average = x.mean()
    league_totals = x.sum()

    totalPA = league_totals.PA
    totalHR = league_totals.HR

    EHR = (totalHR/totalPA)
    
    return EHR

In [21]:
EHR_yr1 = getEHR(2020)
EHR_yr2 = getEHR(2019)
EHR_yr3 = getEHR(2018)

In [22]:
def addEHR(df, EHR):
    e_hrs = []
    
    for index, row in df.iterrows():
        e_hr = (EHR) * (row['t_PA'])
        e_hrs.append(e_hr)
    df['e_hr'] = e_hrs
    return df

In [23]:
df_proj_yr1 = addEHR(df_proj_yr1, EHR_yr1)
df_proj_yr2 = addEHR(df_proj_yr2, EHR_yr2)
df_proj_yr3 = addEHR(df_proj_yr3, EHR_yr3)

# Regress to mean

#### Set up DataFrame

In [24]:
# Combine 3-years data

marcel_df = pd.DataFrame()

players = []
expect_hrs = []
expect_pas = []
years = []
weight_hrs = []
weight_pas = []

df_list = [df_proj_yr3, df_proj_yr2, df_proj_yr1]

for df in df_list:
    for index, row in df.iterrows():
        guy = row.playerID
        players.append(guy)
        
        expect_hr = row['e_hr']
        expect_hrs.append(expect_hr)
        
        expect_pa = row['t_PA']
        expect_pas.append(expect_pa)
        
        year = row['yearID']
        years.append(year)
        
        weight_hr = row['HR']
        weight_hrs.append(weight_hr)
        
        weight_pa = row['w_PA']
        weight_pas.append(weight_pa)
    
        
marcel_df['playerID'] = players
marcel_df['yearID'] = years
marcel_df['e_hr'] = expect_hrs
marcel_df['e_pa'] = expect_pas
marcel_df['w_hr'] = weight_hrs
marcel_df['w_pa'] = weight_pas

marcel_df

Unnamed: 0,playerID,yearID,e_hr,e_pa,w_hr,w_pa
0,abreujo02,2018,52.844894,1517.840805,60.384263,182.982617
1,aguilje01,2018,59.471990,1708.188000,105.630000,201.200000
2,ahmedni01,2018,59.261842,1702.152000,48.288000,201.200000
3,albieoz01,2018,74.871312,2150.496000,75.456000,209.600000
4,alonsyo01,2018,54.851662,1575.480329,63.129003,182.982617
...,...,...,...,...,...,...
383,walkech02,2020,131.842207,3280.000000,95.000000,328.000000
384,whiteev01,2020,113.026555,2811.900000,113.300000,281.190000
385,wongko01,2020,103.624021,2577.981651,13.761468,257.798165
386,yastrmi01,2020,111.921318,2784.403670,123.853211,278.440367


In [25]:
marcel_df_sub = pd.DataFrame()

total_ehrs = []
total_epas = []
guys = []
weights_hrs = []
weight_pas = []

for player in marcel_df.playerID:
    
    mask = marcel_df.loc[marcel_df.playerID == player]
    
    guy = mask.playerID.values[0]
    total_epa = mask['e_pa'].sum()
    total_ehr = mask['e_hr'].sum()
    total_w_hr = mask['w_hr'].sum()
    total_w_pa = mask['w_pa'].sum()
    
    total_epas.append(total_epa)
    total_ehrs.append(total_ehr)
    weights_hrs.append(total_w_hr)
    weight_pas.append(total_w_pa)
    guys.append(guy)
    

marcel_df_sub['playerID'] = guys
marcel_df_sub['e_hr'] = total_ehrs
marcel_df_sub['e_pa'] = total_epas
marcel_df_sub['w_hr'] = weights_hrs
marcel_df_sub['w_pa'] = weight_pas

marcel_df_sub

Unnamed: 0,playerID,e_hr,e_pa,w_hr,w_pa
0,abreujo02,290.314677,7268.147986,412.851337,568.323554
1,aguilje01,167.152468,4387.087083,206.547431,469.089908
2,ahmedni01,274.059099,6890.225394,188.508183,532.507339
3,albieoz01,199.806451,5076.432000,175.488000,282.748400
4,alonsyo01,54.851662,1575.480329,63.129003,182.982617
...,...,...,...,...,...
383,walkech02,235.450631,5706.472000,211.696000,388.661800
384,whiteev01,113.026555,2811.900000,113.300000,281.190000
385,wongko01,197.391474,4773.981651,57.761468,312.698165
386,yastrmi01,111.921318,2784.403670,123.853211,278.440367


In [26]:
marcel_df_sub.drop_duplicates()

Unnamed: 0,playerID,e_hr,e_pa,w_hr,w_pa
0,abreujo02,290.314677,7268.147986,412.851337,568.323554
1,aguilje01,167.152468,4387.087083,206.547431,469.089908
2,ahmedni01,274.059099,6890.225394,188.508183,532.507339
3,albieoz01,199.806451,5076.432000,175.488000,282.748400
4,alonsyo01,54.851662,1575.480329,63.129003,182.982617
...,...,...,...,...,...
373,tatisfe02,146.385045,3641.800000,241.040000,364.180000
376,tuckeky01,128.259958,3190.880000,124.320000,319.088000
379,verdual01,123.583981,3074.550000,82.400000,307.455000
384,whiteev01,113.026555,2811.900000,113.300000,281.190000


## HR per PA

In [27]:
hr_perPAs = []

for index, row in marcel_df_sub.iterrows():
    if int(row['e_pa'] == 0):
        per_pa = 0
        hr_perPAs.append(per_pa)
    else:
        regress_hr = (int(row['e_hr']) * 1200) / (int(row['e_pa']))
        per_pa = (regress_hr + int(row['w_hr'])) / (1200 + (int(row['e_pa'])))

        hr_perPAs.append(per_pa)
    
marcel_df_sub['hr_perPA'] = hr_perPAs
marcel_df_sub.head()

Unnamed: 0,playerID,e_hr,e_pa,w_hr,w_pa,hr_perPA
0,abreujo02,290.314677,7268.147986,412.851337,568.323554,0.054308
1,aguilje01,167.152468,4387.087083,206.547431,469.089908,0.045048
2,ahmedni01,274.059099,6890.225394,188.508183,532.507339,0.029137
3,albieoz01,199.806451,5076.432,175.488,282.7484,0.03538
4,alonsyo01,54.851662,1575.480329,63.129003,182.982617,0.037529


# Projected HR 2021

In [28]:
hrs_2021 = []

for index, row in marcel_df_sub.iterrows():
    ex_hr2021 = ((int(row['w_pa'])) * (float(row['hr_perPA'])))
    hrs_2021.append(ex_hr2021)
    
marcel_df_sub['proj_HR_2021'] = hrs_2021

marcel_df_sub.head()

Unnamed: 0,playerID,e_hr,e_pa,w_hr,w_pa,hr_perPA,proj_HR_2021
0,abreujo02,290.314677,7268.147986,412.851337,568.323554,0.054308,30.84701
1,aguilje01,167.152468,4387.087083,206.547431,469.089908,0.045048,21.127281
2,ahmedni01,274.059099,6890.225394,188.508183,532.507339,0.029137,15.501082
3,albieoz01,199.806451,5076.432,175.488,282.7484,0.03538,9.977162
4,alonsyo01,54.851662,1575.480329,63.129003,182.982617,0.037529,6.83027


In [29]:
#Convert to int

proj_2021 = marcel_df_sub.drop_duplicates()
final_df = proj_2021.sort_values('proj_HR_2021', ascending=False).reset_index(drop=True)

final_df['proj_HR_2021'] = final_df['proj_HR_2021'].astype(int)
final_df.head(10)

Unnamed: 0,playerID,e_hr,e_pa,w_hr,w_pa,hr_perPA,proj_HR_2021
0,troutmi01,298.0907,7512.252,529.484,588.26,0.066185,38
1,suareeu01,303.211506,7623.704,505.4,580.9972,0.062643,36
2,yelicch01,302.52077,7645.356,446.352,594.248,0.055783,33
3,ramirjo01,306.598178,7776.288,443.59,603.5084,0.054615,32
4,cruzne02,238.822008,6038.514075,438.427064,484.568624,0.067049,32
5,olsonma02,302.114438,7649.492,430.486,598.9708,0.053947,32
6,bellico01,319.501307,8042.176,435.18,611.052,0.052218,31
7,bettsmo01,321.565403,8072.964,431.41,609.0392,0.051631,31
8,ozunama01,292.77449,7400.212606,410.184642,587.292661,0.05318,31
9,abreujo02,290.314677,7268.147986,412.851337,568.323554,0.054308,30


# Merge w/ People & Clean

In [30]:
people_df = pd.read_csv('lahman_data/People.csv')

cols = ['playerID', 'nameFirst', 'nameLast']

people_df = people_df[cols]

people_df

Unnamed: 0,playerID,nameFirst,nameLast
0,aardsda01,David,Aardsma
1,aaronha01,Hank,Aaron
2,aaronto01,Tommie,Aaron
3,aasedo01,Don,Aase
4,abadan01,Andy,Abad
...,...,...,...
20085,zupofr01,Frank,Zupo
20086,zuvelpa01,Paul,Zuvella
20087,zuverge01,George,Zuverink
20088,zwilldu01,Dutch,Zwilling


In [31]:
final_df = final_df.merge(people_df, how='left', on='playerID')

cols = ['nameFirst', 'nameLast', 'proj_HR_2021', 'playerID']

# final_df[cols].head()
final_df = final_df[cols]

final_df

Unnamed: 0,nameFirst,nameLast,proj_HR_2021,playerID
0,Mike,Trout,38,troutmi01
1,Eugenio,Suarez,36,suareeu01
2,Christian,Yelich,33,yelicch01
3,Jose,Ramirez,32,ramirjo01
4,Nelson,Cruz,32,cruzne02
...,...,...,...,...
212,Kevin,Newman,1,newmake01
213,Elvis,Andrus,1,andruel01
214,Wilson,Ramos,1,ramoswi01
215,Jose,Iglesias,1,iglesjo01


# Marcel Top 10 HR for 2021

In [32]:
final_df.head(10)

Unnamed: 0,nameFirst,nameLast,proj_HR_2021,playerID
0,Mike,Trout,38,troutmi01
1,Eugenio,Suarez,36,suareeu01
2,Christian,Yelich,33,yelicch01
3,Jose,Ramirez,32,ramirjo01
4,Nelson,Cruz,32,cruzne02
5,Matt,Olson,32,olsonma02
6,Cody,Bellinger,31,bellico01
7,Mookie,Betts,31,bettsmo01
8,Marcell,Ozuna,31,ozunama01
9,Jose,Abreu,30,abreujo02


## Import Statcast Data

In [33]:
stat_2017 = pd.read_csv('statcast/statcast2017.csv')
stat_2018 = pd.read_csv('statcast/statcast2018.csv')
stat_2019 = pd.read_csv('statcast/statcast2019.csv')
stat_2020 = pd.read_csv('statcast/statcast2020.csv')

In [34]:
df_2017 = stat_2017.loc[stat_2017['b_total_pa'] >= 502]
df_2018 = stat_2018.loc[stat_2018['b_total_pa'] >= 502]
df_2019 = stat_2019.loc[stat_2019['b_total_pa'] >= 502]
df_2020 = stat_2020.loc[stat_2020['b_total_pa'] >= 186]

df_list = [df_2017, df_2018, df_2019, df_2020]

In [35]:
df_merged = df_2017.append([df_2018, df_2019, df_2020], ignore_index=True)
df_merged.head()

Unnamed: 0,last_name,first_name,year,player_age,b_ab,b_total_pa,b_total_hits,b_single,b_double,b_triple,...,woba,xwoba,xobp,xiso,exit_velocity_avg,launch_angle_avg,sweet_spot_percent,barrel_batted_rate,whiff_percent,swing_percent
0,Beltran,Carlos,2017,40,467,509,108,65,29,0,...,0.283,0.293,0.291,0.143,87.3,13.4,29.1,4.6,19.8,47.8
1,Pujols,Albert,2017,37,593,636,143,103,17,0,...,0.286,0.322,0.305,0.19,88.7,13.4,31.5,5.4,20.6,47.1
2,Mauer,Joe,2017,34,525,597,160,116,36,1,...,0.349,0.379,0.391,0.17,90.6,6.2,35.7,4.5,12.9,36.4
3,Cabrera,Miguel,2017,34,469,529,117,79,22,0,...,0.313,0.377,0.361,0.247,91.3,12.5,41.7,10.5,23.1,50.3
4,Phillips,Brandon,2017,36,572,604,163,115,34,1,...,0.316,0.307,0.308,0.115,84.7,8.3,31.3,2.4,20.2,56.1


In [36]:
df_merged.corr()['b_home_run']

year                      -0.388722
player_age                -0.057934
b_ab                       0.589536
b_total_pa                 0.627654
b_total_hits               0.573490
b_single                   0.307887
b_double                   0.523215
b_triple                   0.041461
b_home_run                 1.000000
b_strikeout                0.663013
b_walk                     0.646984
b_k_percent                0.153395
b_bb_percent               0.267394
batting_avg                0.138191
slg_percent                0.658986
on_base_percent            0.296653
on_base_plus_slg           0.582644
r_total_caught_stealing   -0.019767
b_hit_by_pitch             0.288803
b_intent_walk              0.464833
b_sac_bunt                -0.222865
b_sac_fly                  0.313641
xba                        0.217817
xslg                       0.636418
woba                       0.555655
xwoba                      0.621667
xobp                       0.336279
xiso                       0

## Random Forest Regressor

In [37]:
def regression_results(y_test, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_test, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_test, y_pred) 
    mse=metrics.mean_squared_error(y_test, y_pred) 
    mean_squared_log_error=metrics.mean_squared_log_error(y_test, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_test, y_pred)
    r2=metrics.r2_score(y_test, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [38]:
features = ['b_home_run', 'player_age', 'b_total_hits', 'exit_velocity_avg', 'whiff_percent', 
            'swing_percent', 'launch_angle_avg']

ml_df = df_merged[features]

In [39]:
X = ml_df.drop(['b_home_run'], axis=1)
y = ml_df['b_home_run']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [40]:
regressor = RandomForestRegressor(random_state=5)
regressor.fit(X_train, y_train)
preds = regressor.predict(X_test)

In [41]:
regression_results(y_test, preds)

explained_variance:  0.654
mean_squared_log_error:  0.126
r2:  0.6525
MAE:  4.9597
MSE:  42.1632
RMSE:  6.4933


In [42]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': preds})
df.head()

Unnamed: 0,Actual,Predicted
505,11,7.41
408,31,25.17
66,38,19.99
338,8,17.06
233,23,29.27


In [43]:
firsts = []
lasts = []

forest_df = pd.DataFrame()

for index, row in df_merged.iterrows():
    first = row.first_name
    last = row.last_name
    
    firsts.append(first)
    lasts.append(last)
    
forest_df['first_name'] = firsts
forest_df['last_name'] = lasts

## RF Regressor Results

In [44]:
input_df = ml_df.drop(['b_home_run'], axis=1)
outputs = regressor.predict(input_df)

In [45]:
forest_df['Predicted_HR'] = outputs.astype(int)
forest_df = forest_df.sort_values(by='Predicted_HR', ascending=False)

# RF Regression for Season HR

In [46]:
forest_df.head(10).reset_index(drop=True)

Unnamed: 0,first_name,last_name,Predicted_HR
0,Giancarlo,Stanton,51
1,Aaron,Judge,48
2,Jorge,Soler,45
3,Khris,Davis,44
4,Pete,Alonso,44
5,Christian,Yelich,41
6,Cody,Bellinger,41
7,Khris,Davis,41
8,Nelson,Cruz,41
9,J.D.,Martinez,40
