In [23]:
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE 

In [24]:
batting = pd.read_csv('data/baseballdatabank-master/core/batting.csv')

In [25]:
batting['OBP'] = (batting['H'] + batting['BB'] + batting['HBP'])/(batting['AB'] + batting['BB'] + batting['HBP'] + batting['SF'])

In [26]:
batting = batting[batting['yearID'] >= 2010]

In [27]:
batting = batting.dropna()

In [28]:
batting.shape

(10862, 23)

In [29]:
batting = batting.drop(columns=['playerID','stint','teamID','lgID','CS','G'])

In [30]:
training_data = batting[batting['yearID'] < 2020]

In [31]:
testing_data = batting[batting['yearID'] == 2020]

In [32]:
X = training_data.drop(columns='OBP')
y = training_data['OBP']

In [33]:
scale = RobustScaler()
scale.fit(X)
scale.transform(X)

array([[-1.        , -0.22878229, -0.17647059, ...,  0.        ,
         0.        , -0.2       ],
       [-1.        ,  1.88191882,  2.41176471, ...,  0.        ,
         2.5       ,  2.4       ],
       [-1.        ,  0.4797048 ,  0.29411765, ...,  0.        ,
         2.        ,  1.4       ],
       ...,
       [ 0.8       ,  0.39852399,  0.41176471, ...,  0.        ,
         1.        ,  0.6       ],
       [ 0.8       ,  0.32103321,  0.52941176, ...,  0.        ,
         1.        ,  1.        ],
       [ 0.8       ,  0.74907749,  0.70588235, ...,  0.        ,
         0.        ,  0.6       ]])

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=.8)

In [37]:
lasso = Lasso()

In [38]:
lasso.fit(X_train,y_train)

Lasso()

In [39]:
y_pred = lasso.predict(X_test)

In [40]:
metrics.r2_score(y_test, y_pred)

0.13258962364561067

In [41]:
y_2020 = testing_data['OBP']

In [42]:
X_2020 = testing_data.drop(columns='OBP')

In [43]:
y_2020_pred = lasso.predict(scale.transform(X_2020))

In [44]:
metrics.r2_score(y_2020,y_2020_pred)

-1.1621076886071355

In [45]:
y_2020_pred

array([0.19205981, 0.19196698, 0.19199599, 0.19178248, 0.19183818,
       0.19192521, 0.19188459, 0.19199947, 0.19201224, 0.19196118,
       0.19203544, 0.19191824, 0.19180568, 0.19188923, 0.19179988,
       0.19179524, 0.19181729, 0.19181033, 0.19178248, 0.19179408,
       0.19181613, 0.19202268, 0.19200411, 0.19182425, 0.1917906 ,
       0.1920134 , 0.19202268, 0.19190084, 0.19185326, 0.19180452,
       0.19183586, 0.19186487, 0.19198207, 0.19199251, 0.19185558,
       0.19191128, 0.19183934, 0.19179988, 0.19183818, 0.19190432,
       0.19203893, 0.19188111, 0.19189504, 0.19179292, 0.19180104,
       0.19190084, 0.19179756, 0.19183934, 0.19202848, 0.19200759,
       0.19195422, 0.19183702, 0.19182657, 0.19192056, 0.19203544,
       0.19192405, 0.1920366 , 0.19181613, 0.19203776, 0.1917848 ,
       0.19201688, 0.19196698, 0.1917848 , 0.19188111, 0.19192637,
       0.19200295, 0.19197858, 0.1917848 , 0.19193101, 0.19195886,
       0.19178712, 0.19190432, 0.19188111, 0.19178712, 0.19189