In [163]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from utils import read_table

In [164]:
df_bat = read_table("PlayerBatting")
df_bat = df_bat.drop(columns=["Unnamed: 0"])

In [165]:
df_bat["PA"] = df_bat["AB"] + df_bat["BB"] + df_bat["IBB"] + df_bat["HBP"]
df_bat["1B"] = df_bat["H"] - df_bat["2B"] - df_bat["3B"] - df_bat["HR"]

In [166]:
# On base percentage: BB + IBB + HBP + H / PA
# H - 2B - 3B - HR = 1B
# slugging: (total number of bases per AB on average) 1B + 22B + 33B + 4HR / AB

# On-base + slugging: Adding them together (OPS)
df_bat["OnBaseP"] = (df_bat["BB"] + df_bat["IBB"] + df_bat["HBP"] + df_bat["H"]) / df_bat["PA"]
df_bat["Slugging"] = (df_bat["1B"] + 2*df_bat["2B"] + 3*df_bat["3B"] + 4*df_bat["HR"]) / df_bat["AB"]
df_bat["OPS"] = df_bat["OnBaseP"] + df_bat["Slugging"] 

In [167]:
# Create the NextYearOPS column that contains for every row the OPS value for next year.
# For a player's last year, the value is set to NaN and the row is deleted afterwards.

df_bat_temp = df_bat[["playerID", "yearID", "OPS"]]
OPS = df_bat_temp["OPS"].shift(-1)
players = df_bat_temp["playerID"]
for i in range(0, len(OPS) - 1):
    if players[i] != players[i+1]:
        OPS[i] = np.nan
df_bat["NextYearOPS"] = OPS
df_bat = df_bat.dropna(subset=['NextYearOPS', 'AB', 'PA', 'OPS', 'Slugging', 'OnBaseP'])
df_bat = df_bat.reset_index()
df_bat["OPS_will_increase"] = df_bat["NextYearOPS"] > df_bat["OPS"]

In [168]:
df_bat

Unnamed: 0,index,playerID,yearID,stint,teamID,lgID,G,AB,R,H,...,debut,current_age,years_of_experience,PA,1B,OnBaseP,Slugging,OPS,NextYearOPS,OPS_will_increase
0,0,abbotku01,1993,1,OAK,AL,20,61.0,11.0,15.0,...,1993-09-07,24.0,0,64.0,11.0,0.281250,0.409836,0.691086,0.688481,False
1,1,abbotku01,1994,1,FLO,NL,101,345.0,41.0,86.0,...,1993-09-07,25.0,1,367.0,57.0,0.294278,0.394203,0.688481,0.779263,True
2,2,abbotku01,1995,1,FLO,NL,120,420.0,60.0,107.0,...,1993-09-07,26.0,2,465.0,65.0,0.326882,0.452381,0.779263,0.737374,False
3,3,abbotku01,1996,1,FLO,NL,109,320.0,37.0,81.0,...,1993-09-07,27.0,3,346.0,48.0,0.309249,0.428125,0.737374,0.754762,True
4,4,abbotku01,1997,1,FLO,NL,94,252.0,35.0,69.0,...,1993-09-07,28.0,4,270.0,43.0,0.322222,0.432540,0.754762,0.718602,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12467,13749,zobribe01,2011,1,TBA,AL,156,588.0,99.0,158.0,...,2006-08-01,30.0,5,668.0,86.0,0.356287,0.469388,0.825675,0.858235,True
12468,13750,zobribe01,2012,1,TBA,AL,157,560.0,88.0,151.0,...,2006-08-01,31.0,6,667.0,85.0,0.386807,0.471429,0.858235,0.763112,False
12469,13751,zobribe01,2013,1,TBA,AL,157,612.0,77.0,168.0,...,2006-08-01,32.0,7,695.0,117.0,0.361151,0.401961,0.763112,0.756275,False
12470,13752,zobribe01,2014,1,TBA,AL,146,570.0,83.0,155.0,...,2006-08-01,33.0,8,650.0,108.0,0.361538,0.394737,0.756275,0.809771,True


In [169]:
def find_cutoff_index(playerIDlist, test_ratio):
    # returns a correct index for cutting off training and test data, must include the index in training data
    if not (test_ratio < 1 and test_ratio > 0): return Exception("Fuck you")
    approx_index = int(len(playerIDlist) * (1 - test_ratio))
    approx_index_is_correct = playerIDlist[approx_index] != playerIDlist[approx_index + 1]
    while not approx_index_is_correct:
        approx_index += 1
        approx_index_is_correct = playerIDlist[approx_index] != playerIDlist[approx_index + 1]
    return approx_index

split_index = find_cutoff_index(df_bat["playerID"].tolist(), 0.15)

In [170]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score

In [179]:
df_reg_bat = df_bat.drop(columns=["index", "playerID", "stint", "teamID", "lgID", "birthYear", "debut", "NextYearOPS", "OPS_will_increase"])
df_reg_bat = df_reg_bat.drop(columns=["OPS", "Slugging", "OnBaseP"])
targets = df_bat["NextYearOPS"]

train_df = np.array(df_reg_bat[:split_index])
test_df = np.array(df_reg_bat[split_index:])

train_targets = np.array(targets[:split_index])
test_targets = np.array(targets[split_index:])

In [180]:
# a stupid model that predicts a player will keep the same OPS as last year
class StupidRegressionBaseline():
    def predict(self, test_df):
        return test_df[:, -1]

baseline = StupidRegressionBaseline()
y_pred_baseline = baseline.predict(test_df)
mse = mean_squared_error(test_targets, y_pred_baseline)
print("mse: ", mse)

mse:  4608.341099521589


In [181]:
# the random forests performs a little better than the stupid baseline!
rf_regressor = RandomForestRegressor(n_estimators=69)
rf_regressor.fit(train_df, train_targets)
y_pred = rf_regressor.predict(test_df)
mse = mean_squared_error(test_targets, y_pred)
print("mse: ", mse)

mse:  0.01755562881860679


In [182]:
importances = rf_regressor.feature_importances_
feature_names = df_reg_bat.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df.head(10)

Unnamed: 0,Feature,Importance
7,HR,0.171232
0,yearID,0.061153
11,BB,0.058285
1,G,0.057025
12,SO,0.055523
8,RBI,0.046476
18,weight,0.045846
3,R,0.044634
20,current_age,0.044438
2,AB,0.038516


In [183]:
df_clf_bat = df_reg_bat
targets = df_bat["OPS_will_increase"]

train_df = np.array(df_clf_bat[:split_index])
test_df = np.array(df_clf_bat[split_index:])

train_targets = np.array(targets[:split_index])
test_targets = np.array(targets[split_index:])

In [184]:
class StupidClassificationBaseline():
    def __init__(self) -> None:
        majority_class = False
    def fit(self, _, train_targets):
        self.majority_class = np.bincount(train_targets).argmax()
    def predict(self, test_df):
        return np.full(len(test_df), self.majority_class)
    
    
baseline = StupidClassificationBaseline()
baseline.fit(train_df, train_targets)
y_pred_baseline = baseline.predict(test_df)
acc = accuracy_score(test_targets, y_pred_baseline)
print("Accuracy: ", acc)

Accuracy:  0.5314685314685315


In [185]:
rf_classifier = RandomForestClassifier(n_estimators=69)
rf_classifier.fit(train_df, train_targets)
y_pred = rf_classifier.predict(test_df)
acc = accuracy_score(test_targets, y_pred)
print("Accuracy: ", acc)


Accuracy:  0.6444324905863368


In [186]:
importances = rf_classifier.feature_importances_
feature_names = df_clf_bat.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df.head(10)

Unnamed: 0,Feature,Importance
2,AB,0.064415
4,H,0.062395
22,PA,0.059946
12,SO,0.054076
1,G,0.053582
8,RBI,0.052542
23,1B,0.051626
3,R,0.050065
5,2B,0.048248
0,yearID,0.047471
