In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV, train_test_split

In [3]:
df = pd.read_csv('Hitters.csv', index_col=0).dropna()
df.index.name = 'Player'
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, -Alan Ashby to -Willie Wilson
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AtBat      263 non-null    int64  
 1   Hits       263 non-null    int64  
 2   HmRun      263 non-null    int64  
 3   Runs       263 non-null    int64  
 4   RBI        263 non-null    int64  
 5   Walks      263 non-null    int64  
 6   Years      263 non-null    int64  
 7   CAtBat     263 non-null    int64  
 8   CHits      263 non-null    int64  
 9   CHmRun     263 non-null    int64  
 10  CRuns      263 non-null    int64  
 11  CRBI       263 non-null    int64  
 12  CWalks     263 non-null    int64  
 13  League     263 non-null    object 
 14  Division   263 non-null    object 
 15  PutOuts    263 non-null    int64  
 16  Assists    263 non-null    int64  
 17  Errors     263 non-null    int64  
 18  Salary     263 non-null    float64
 19  NewLeague  263 non-null    object 

In [4]:
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
dummies.info()
print(dummies.head())

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, -Alan Ashby to -Willie Wilson
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   League_A     263 non-null    uint8
 1   League_N     263 non-null    uint8
 2   Division_E   263 non-null    uint8
 3   Division_W   263 non-null    uint8
 4   NewLeague_A  263 non-null    uint8
 5   NewLeague_N  263 non-null    uint8
dtypes: uint8(6)
memory usage: 3.6+ KB
                   League_A  League_N  Division_E  Division_W  NewLeague_A  \
Player                                                                       
-Alan Ashby               0         1           0           1            0   
-Alvin Davis              1         0           0           1            1   
-Andre Dawson             0         1           1           0            0   
-Andres Galarraga         0         1           1           0            0   
-Alfredo Griffin          1         0           0    

In [5]:
y = df.Salary

# Drop the column with the independent variable (Salary), and columns for which we created dummy variables
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
# Define the feature set X.
X = pd.concat([X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, -Alan Ashby to -Willie Wilson
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   AtBat        263 non-null    float64
 1   Hits         263 non-null    float64
 2   HmRun        263 non-null    float64
 3   Runs         263 non-null    float64
 4   RBI          263 non-null    float64
 5   Walks        263 non-null    float64
 6   Years        263 non-null    float64
 7   CAtBat       263 non-null    float64
 8   CHits        263 non-null    float64
 9   CHmRun       263 non-null    float64
 10  CRuns        263 non-null    float64
 11  CRBI         263 non-null    float64
 12  CWalks       263 non-null    float64
 13  PutOuts      263 non-null    float64
 14  Assists      263 non-null    float64
 15  Errors       263 non-null    float64
 16  League_N     263 non-null    uint8  
 17  Division_W   263 non-null    uint8  
 18  NewLeague_N  263 non-null    uint8

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

In [8]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(210, 19)
(210,)
(53, 19)
(53,)


In [9]:
pca_model = PCA()

In [10]:
pca_model.fit(scale(X_train))

PCA()

In [15]:
pca_model.explained_variance_ratio_

array([4.07714092e-01, 1.99058739e-01, 1.02987278e-01, 8.52386531e-02,
       5.23176576e-02, 4.36364021e-02, 3.52524353e-02, 2.59050586e-02,
       1.23233967e-02, 9.66104630e-03, 7.12330964e-03, 6.13944238e-03,
       4.00418109e-03, 3.68266648e-03, 2.72616238e-03, 1.23001031e-03,
       6.42866415e-04, 3.01172776e-04, 5.54296870e-05])

In [14]:
np.cumsum(pca_model.explained_variance_ratio_)

array([0.40771409, 0.60677283, 0.70976011, 0.79499876, 0.84731642,
       0.89095282, 0.92620526, 0.95211032, 0.96443371, 0.97409476,
       0.98121807, 0.98735751, 0.99136169, 0.99504436, 0.99777052,
       0.99900053, 0.9996434 , 0.99994457, 1.        ])

In [37]:
X_train_reduced = pca_model.transform(X_train)
print(pd.DataFrame(X_train_reduced).head())

            0            1           2           3           4           5   \
0   641.786052   119.345207  159.104947  104.851940   72.061116  143.969237   
1  2187.119624  1205.955293  534.785570  626.751900  108.505552   69.060518   
2   714.056035   210.877173  207.591100  253.229767   39.081346   27.466458   
3  1660.717151   452.507036  426.645605  338.363040  173.179754  339.170273   
4   218.517726  -115.458761   87.603640  -23.916567   88.993025  248.451553   

           6           7           8           9           10          11  \
0   81.078039  219.614589  -59.915977  -27.600628  -30.222214   95.917374   
1  198.266452  740.597928 -295.476196 -152.762216 -193.348308  540.029434   
2   66.609276  234.885001    6.240831  -22.464000  -36.018962  121.878756   
3  190.600683  685.520862 -193.711467 -100.038809 -128.981486  335.402660   
4  -11.660619  145.066758   31.040160  -11.100913   -3.741916    9.537277   

            12           13          14          15          1

In [61]:
en_model = ElasticNet(max_iter = 5000)

In [None]:
lr = LinearRegression()

In [62]:
grid = {
    "l1_ratio": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "alpha" : np.linspace(0,5000, 30)
}

In [4]:
np.linspace(0,5000, 30)

array([   0.        ,  172.4137931 ,  344.82758621,  517.24137931,
        689.65517241,  862.06896552, 1034.48275862, 1206.89655172,
       1379.31034483, 1551.72413793, 1724.13793103, 1896.55172414,
       2068.96551724, 2241.37931034, 2413.79310345, 2586.20689655,
       2758.62068966, 2931.03448276, 3103.44827586, 3275.86206897,
       3448.27586207, 3620.68965517, 3793.10344828, 3965.51724138,
       4137.93103448, 4310.34482759, 4482.75862069, 4655.17241379,
       4827.5862069 , 5000.        ])

In [63]:
gcv_en_model = GridSearchCV(estimator = en_model, param_grid = grid, cv = 5, scoring = 'neg_mean_squared_error', n_jobs = -1,
                           verbose = 1)
#https://scikit-learn.org/stable/modules/model_evaluation.html #scoring-parameter
#Check the above link for all the scoring options available
#Try different scoring functions, try multiple scoring functions together

In [64]:
gcv_en_model

GridSearchCV(cv=5, estimator=ElasticNet(max_iter=5000), n_jobs=-1,
             param_grid={'alpha': array([   0.        ,  172.4137931 ,  344.82758621,  517.24137931,
        689.65517241,  862.06896552, 1034.48275862, 1206.89655172,
       1379.31034483, 1551.72413793, 1724.13793103, 1896.55172414,
       2068.96551724, 2241.37931034, 2413.79310345, 2586.20689655,
       2758.62068966, 2931.03448276, 3103.44827586, 3275.86206897,
       3448.27586207, 3620.68965517, 3793.10344828, 3965.51724138,
       4137.93103448, 4310.34482759, 4482.75862069, 4655.17241379,
       4827.5862069 , 5000.        ]),
                         'l1_ratio': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                      0.9, 1.0]},
             scoring='neg_mean_squared_error', verbose=1)

In [65]:
gcv_en_model.fit(X_train_reduced, y_train)

Fitting 5 folds for each of 330 candidates, totalling 1650 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1104 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 1650 out of 1650 | elapsed:    4.9s finished


GridSearchCV(cv=5, estimator=ElasticNet(max_iter=5000), n_jobs=-1,
             param_grid={'alpha': array([   0.        ,  172.4137931 ,  344.82758621,  517.24137931,
        689.65517241,  862.06896552, 1034.48275862, 1206.89655172,
       1379.31034483, 1551.72413793, 1724.13793103, 1896.55172414,
       2068.96551724, 2241.37931034, 2413.79310345, 2586.20689655,
       2758.62068966, 2931.03448276, 3103.44827586, 3275.86206897,
       3448.27586207, 3620.68965517, 3793.10344828, 3965.51724138,
       4137.93103448, 4310.34482759, 4482.75862069, 4655.17241379,
       4827.5862069 , 5000.        ]),
                         'l1_ratio': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                      0.9, 1.0]},
             scoring='neg_mean_squared_error', verbose=1)

In [71]:
gcv_en_model.cv_results_

{'mean_fit_time': array([0.05404634, 0.05329208, 0.05554714, 0.05292664, 0.05394893,
        0.05561795, 0.05387955, 0.05373378, 0.06055737, 0.05577245,
        0.05436082, 0.06029787, 0.05550928, 0.05303102, 0.05497909,
        0.05083241, 0.04872174, 0.04702692, 0.04273162, 0.04753556,
        0.0444088 , 0.0422966 , 0.06208382, 0.05444746, 0.05466528,
        0.04829173, 0.04625411, 0.04325171, 0.03950677, 0.04053345,
        0.03335748, 0.03592243, 0.03607764, 0.05955629, 0.05318899,
        0.044595  , 0.04629197, 0.0366992 , 0.03349266, 0.03248286,
        0.02706113, 0.02342677, 0.02020059, 0.02672606, 0.06231194,
        0.04812651, 0.03703752, 0.03259034, 0.03177991, 0.02874994,
        0.021982  , 0.01958351, 0.01984181, 0.0174026 , 0.0173954 ,
        0.06498222, 0.03804708, 0.02908945, 0.02761884, 0.02145672,
        0.02303839, 0.01818142, 0.01978636, 0.01353102, 0.01276689,
        0.01447878, 0.05909867, 0.03710093, 0.02448244, 0.02088704,
        0.01635737, 0.01638722,

In [72]:
cv_results = pd.DataFrame(gcv_en_model.cv_results_)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.054046,0.001313,0.001596,0.000488,0.0,0.0,"{'alpha': 0.0, 'l1_ratio': 0}",-118792.57922,-70971.865916,-150062.507028,-156085.133634,-95778.837184,-118338.184596,32199.26574,320
1,0.053292,0.004487,0.001339,0.000429,0.0,0.1,"{'alpha': 0.0, 'l1_ratio': 0.1}",-118792.57922,-70971.865916,-150062.507028,-156085.133634,-95778.837184,-118338.184596,32199.26574,320
2,0.055547,0.00287,0.001396,0.000489,0.0,0.2,"{'alpha': 0.0, 'l1_ratio': 0.2}",-118792.57922,-70971.865916,-150062.507028,-156085.133634,-95778.837184,-118338.184596,32199.26574,320
3,0.052927,0.006621,0.00183,0.000745,0.0,0.3,"{'alpha': 0.0, 'l1_ratio': 0.3}",-118792.57922,-70971.865916,-150062.507028,-156085.133634,-95778.837184,-118338.184596,32199.26574,320
4,0.053949,0.007782,0.001415,0.000496,0.0,0.4,"{'alpha': 0.0, 'l1_ratio': 0.4}",-118792.57922,-70971.865916,-150062.507028,-156085.133634,-95778.837184,-118338.184596,32199.26574,320


In [74]:
cv_results.loc[cv_results["rank_test_score"]<=5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
108,0.008838,0.002203,0.001305,0.000389,1551.724138,0.9,"{'alpha': 1551.7241379310346, 'l1_ratio': 0.9}",-111915.31328,-61858.657969,-148032.96576,-140201.232754,-95645.260328,-111530.686018,31222.497448,1
109,0.013433,0.00619,0.001431,0.000578,1551.724138,1.0,"{'alpha': 1551.7241379310346, 'l1_ratio': 1.0}",-112121.015825,-61763.731582,-148233.030685,-140153.105882,-95881.406779,-111630.45815,31267.406142,4
118,0.007992,0.001199,0.001496,0.000638,1724.137931,0.8,"{'alpha': 1724.1379310344828, 'l1_ratio': 0.8}",-112349.352289,-61909.486446,-148225.521292,-139239.389005,-95998.003297,-111544.350466,31043.453794,2
119,0.00868,0.001616,0.001078,0.00049,1724.137931,0.9,"{'alpha': 1724.1379310344828, 'l1_ratio': 0.9}",-112621.229264,-61833.036357,-148173.870325,-139158.6649,-96253.61648,-111608.083465,31017.520022,3
128,0.007689,0.000769,0.001397,0.000488,1896.551724,0.7,"{'alpha': 1896.5517241379312, 'l1_ratio': 0.7}",-112742.311027,-62031.506358,-148688.51166,-138380.94816,-96328.85359,-111634.426159,30933.216143,5


In [75]:
gcv_en_model.best_estimator_

ElasticNet(alpha=1551.7241379310346, l1_ratio=0.9, max_iter=5000)

In [76]:
gcv_en_model.best_score_

-111530.68601797029

In [77]:
gcv_en_model.best_params_

{'alpha': 1551.7241379310346, 'l1_ratio': 0.9}

# Predicting on the test set

In [78]:
X_test_reduced = pca_model.transform(X_test)
pd.DataFrame(X_test_reduced)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,287.672629,-128.112086,126.103972,-19.158982,122.927252,342.558277,-44.401597,221.865078,49.152489,-22.552049,-10.455385,22.457622,67.932353,-55.313746,-142.325233,-153.366289,-64.845186,-198.66091,94.557884
1,1298.073648,428.999394,343.838625,416.47987,65.34259,21.937475,151.038025,466.339939,-52.073636,-41.36723,-69.800013,228.144661,562.817167,-537.192469,-359.937818,-314.709326,-429.708238,-1227.627077,572.419475
2,1078.348858,246.400148,357.317289,425.159704,77.910365,143.317715,143.811542,426.653158,49.13234,-82.739633,-104.444793,231.586761,469.921775,-437.276305,-331.443716,-295.160545,-274.318868,-1052.252899,445.003273
3,179.795487,-70.75146,48.927874,16.882052,33.61538,65.322669,33.518225,121.354343,-13.798346,2.039371,3.06791,4.085417,39.497694,-20.133844,-115.268229,-108.493805,-41.804319,-113.485783,49.639608
4,556.070936,129.17529,126.705227,131.149079,33.927229,11.46992,81.926938,241.88661,-69.575529,-19.578364,-17.165077,75.61198,231.365972,-202.677396,-191.302859,-174.634518,-181.508695,-540.451382,245.435272
5,1395.848566,510.658702,346.226605,304.982854,124.971198,191.16298,106.546903,517.000401,-138.095397,-48.650767,-68.729101,246.60728,624.262518,-571.337218,-370.37742,-305.985094,-510.419238,-1378.418256,600.53005
6,1743.201455,459.446283,488.562086,601.50683,94.430176,32.923417,165.996459,647.215536,33.432089,-29.697741,-76.704242,279.024664,681.97083,-736.706107,-499.71612,-469.47361,-568.930028,-1660.295607,810.717595
7,843.71406,-5.418912,222.963189,145.091011,107.547402,212.231497,150.375462,429.577297,-101.066978,-29.87426,-40.985454,121.284732,292.279711,-238.885041,-386.550799,-385.160026,-191.966529,-708.859799,331.316659
8,1442.452844,314.63438,453.136914,510.35077,112.177082,209.076323,199.491347,522.455328,42.219338,-86.38508,-127.027371,295.223644,611.418973,-520.031183,-455.271512,-416.484201,-375.060937,-1362.490164,590.317935
9,819.722535,276.778859,201.813299,177.373502,71.951089,109.962122,70.251599,277.11693,-69.672388,-14.863665,-43.674615,145.110412,350.933541,-280.635975,-236.335052,-212.393235,-275.328918,-782.538229,373.174147


In [79]:
y_test_pred = gcv_en_model.predict(X_test_reduced)
y_test_pred[:10]

array([279.4413364 , 456.78985653, 414.39805518, 164.29546559,
       208.88702087, 496.6242579 , 567.95001497, 404.34116537,
       540.8813218 , 297.71882607])

In [81]:
mse =  mean_squared_error(y_test_pred, y_test)

In [82]:
mse

142298.83078340327

In [88]:
#Quiet Close

In [89]:
#Try the same methodology for a classification dataset

In [93]:
#Try and check if you can get a better model without applying PCA, rather a normal grid search CV for Elastic net

In [91]:
gcv_en_model.best_estimator_

ElasticNet(alpha=1551.7241379310346, l1_ratio=0.9, max_iter=5000)

In [92]:
gcv_en_model.best_estimator_.coef_

array([ 1.00844202, -0.        ,  0.        ,  0.        ,  0.        ,
        0.26774137,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.        ,  0.        ,  0.        , -0.4754821 , -0.        ,
       -0.        ,  0.        ,  0.69104208, -0.57297901])