In [113]:
pip install glmnet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [115]:
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV

In [116]:
df = pd.read_csv('Hitters.csv').dropna()
df.drop('Unnamed: 0', axis = 1, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 263 entries, 1 to 321
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AtBat      263 non-null    int64  
 1   Hits       263 non-null    int64  
 2   HmRun      263 non-null    int64  
 3   Runs       263 non-null    int64  
 4   RBI        263 non-null    int64  
 5   Walks      263 non-null    int64  
 6   Years      263 non-null    int64  
 7   CAtBat     263 non-null    int64  
 8   CHits      263 non-null    int64  
 9   CHmRun     263 non-null    int64  
 10  CRuns      263 non-null    int64  
 11  CRBI       263 non-null    int64  
 12  CWalks     263 non-null    int64  
 13  League     263 non-null    object 
 14  Division   263 non-null    object 
 15  PutOuts    263 non-null    int64  
 16  Assists    263 non-null    int64  
 17  Errors     263 non-null    int64  
 18  Salary     263 non-null    float64
 19  NewLeague  263 non-null    object 
dtypes: float64

In [117]:
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


In [118]:
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
dummies.info()
print(dummies.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 263 entries, 1 to 321
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   League_A     263 non-null    uint8
 1   League_N     263 non-null    uint8
 2   Division_E   263 non-null    uint8
 3   Division_W   263 non-null    uint8
 4   NewLeague_A  263 non-null    uint8
 5   NewLeague_N  263 non-null    uint8
dtypes: uint8(6)
memory usage: 3.6 KB
   League_A  League_N  Division_E  Division_W  NewLeague_A  NewLeague_N
1         0         1           0           1            0            1
2         1         0           0           1            1            0
3         0         1           1           0            0            1
4         0         1           1           0            0            1
5         1         0           0           1            1            0


In [119]:
from sklearn.preprocessing import StandardScaler

In [120]:
y = df.Salary

# Drop the column with the independent variable (Salary), and columns for which we created dummy variables
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
# Define the feature set X.
X_transform = pd.DataFrame(StandardScaler().fit_transform(X_),columns = X_.columns) 
X = pd.concat([X_transform.reset_index(drop = True), dummies[['League_N', 'Division_W', 'NewLeague_N']].reset_index(drop = True)], axis=1, ignore_index = True)
print(X.shape)
X.info()

(263, 19)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       263 non-null    float64
 1   1       263 non-null    float64
 2   2       263 non-null    float64
 3   3       263 non-null    float64
 4   4       263 non-null    float64
 5   5       263 non-null    float64
 6   6       263 non-null    float64
 7   7       263 non-null    float64
 8   8       263 non-null    float64
 9   9       263 non-null    float64
 10  10      263 non-null    float64
 11  11      263 non-null    float64
 12  12      263 non-null    float64
 13  13      263 non-null    float64
 14  14      263 non-null    float64
 15  15      263 non-null    float64
 16  16      263 non-null    uint8  
 17  17      263 non-null    uint8  
 18  18      263 non-null    uint8  
dtypes: float64(16), uint8(3)
memory usage: 33.8 KB


In [121]:
X.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,-0.6029,-0.595675,-0.528551,-1.206112,-0.522063,-0.097527,1.397893,0.346791,0.174373,-0.00292,-0.121671,0.258966,0.435334,1.221499,-0.523191,0.213352,1,1,1
1,0.512542,0.49226,0.729966,0.441515,0.79406,1.609373,-0.9012,-0.452865,-0.409892,-0.076054,-0.415105,-0.19959,0.010373,2.109109,-0.253863,0.819964,0,1,0
2,0.628167,0.73649,0.958788,0.402286,1.026317,-0.189792,0.770868,1.301558,1.318174,1.898565,1.412051,1.572666,0.355654,-0.324661,-0.744179,-0.848219,1,0,1
3,-0.562092,-0.462459,-0.185319,-0.617673,-0.367225,-0.512719,-1.110209,-0.990935,-0.960153,-0.697693,-0.947521,-0.881228,-0.862315,1.840678,-0.543909,-0.696566,1,0,1
4,1.294712,1.358167,-0.871783,0.755349,-0.01884,-0.282057,0.770868,0.766993,0.634985,-0.61237,0.422846,0.017294,-0.251434,-0.031177,2.087225,2.488147,0,1,0


In [122]:
from sklearn.model_selection import train_test_split

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size = 0.8)

In [124]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210 entries, 248 to 96
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       210 non-null    float64
 1   1       210 non-null    float64
 2   2       210 non-null    float64
 3   3       210 non-null    float64
 4   4       210 non-null    float64
 5   5       210 non-null    float64
 6   6       210 non-null    float64
 7   7       210 non-null    float64
 8   8       210 non-null    float64
 9   9       210 non-null    float64
 10  10      210 non-null    float64
 11  11      210 non-null    float64
 12  12      210 non-null    float64
 13  13      210 non-null    float64
 14  14      210 non-null    float64
 15  15      210 non-null    float64
 16  16      210 non-null    uint8  
 17  17      210 non-null    uint8  
 18  18      210 non-null    uint8  
dtypes: float64(16), uint8(3)
memory usage: 28.5 KB


In [125]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(210, 19)
(53, 19)
(210,)
(53,)


In [126]:
from sklearn.metrics import mean_squared_error

In [127]:
ridge2 = Ridge(alpha=5)
ridge2.fit(X_train, y_train)
pred = ridge2.predict(X_test)
mean_squared_error(y_test, pred)

68050.22593867643

In [128]:
ridge2 = Ridge(alpha=50)
ridge2.fit(X_train, y_train)
pred = ridge2.predict(X_test)
mean_squared_error(y_test, pred)

64991.11491756731

In [129]:
ridge2 = Ridge(alpha=500)
ridge2.fit(X_train, y_train)
pred = ridge2.predict(X_test)
mean_squared_error(y_test, pred)

77926.5607831179

In [130]:
ridge2 = Ridge(alpha=3)
ridge2.fit(X_train, y_train)
pred = ridge2.predict(X_test)
mean_squared_error(y_test, pred)

69428.96346944285

In [134]:
alphas = 10**np.linspace(10,-2,100)*0.5
alphas

array([5.00000000e+09, 3.78231664e+09, 2.86118383e+09, 2.16438064e+09,
       1.63727458e+09, 1.23853818e+09, 9.36908711e+08, 7.08737081e+08,
       5.36133611e+08, 4.05565415e+08, 3.06795364e+08, 2.32079442e+08,
       1.75559587e+08, 1.32804389e+08, 1.00461650e+08, 7.59955541e+07,
       5.74878498e+07, 4.34874501e+07, 3.28966612e+07, 2.48851178e+07,
       1.88246790e+07, 1.42401793e+07, 1.07721735e+07, 8.14875417e+06,
       6.16423370e+06, 4.66301673e+06, 3.52740116e+06, 2.66834962e+06,
       2.01850863e+06, 1.52692775e+06, 1.15506485e+06, 8.73764200e+05,
       6.60970574e+05, 5.00000000e+05, 3.78231664e+05, 2.86118383e+05,
       2.16438064e+05, 1.63727458e+05, 1.23853818e+05, 9.36908711e+04,
       7.08737081e+04, 5.36133611e+04, 4.05565415e+04, 3.06795364e+04,
       2.32079442e+04, 1.75559587e+04, 1.32804389e+04, 1.00461650e+04,
       7.59955541e+03, 5.74878498e+03, 4.34874501e+03, 3.28966612e+03,
       2.48851178e+03, 1.88246790e+03, 1.42401793e+03, 1.07721735e+03,
      

In [132]:
len(alphas)

100

In [135]:
ridgecv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error')
ridgecv.fit(X_train, y_train)

In [136]:
ridgecv.alpha_

1.004616501282523

In [137]:
ridge2.set_params(alpha=ridgecv.alpha_)
ridge2.fit(X_train, y_train)
mean_squared_error(y_test, ridge2.predict(X_test))

73076.96692647859

In [138]:
lassocv = LassoCV(alphas=None, cv=10, max_iter=10000)
lassocv.fit(X_train, y_train)

In [139]:
lassocv.alpha_

3.5894095693056522

In [140]:
lasso2 = Lasso()

In [141]:
lasso2.set_params(alpha=lassocv.alpha_)
lasso2.fit(X_train, y_train)
mean_squared_error(y_test, lasso2.predict(X_test))

68247.03597640192