In [20]:
import pandas as pd
import numpy as np

In [21]:
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression

In [22]:
from sklearn.metrics import mean_squared_error

In [23]:
df = pd.read_csv('Hitters.csv', index_col=0).dropna()
df.index.name = 'Player'
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, -Alan Ashby to -Willie Wilson
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AtBat      263 non-null    int64  
 1   Hits       263 non-null    int64  
 2   HmRun      263 non-null    int64  
 3   Runs       263 non-null    int64  
 4   RBI        263 non-null    int64  
 5   Walks      263 non-null    int64  
 6   Years      263 non-null    int64  
 7   CAtBat     263 non-null    int64  
 8   CHits      263 non-null    int64  
 9   CHmRun     263 non-null    int64  
 10  CRuns      263 non-null    int64  
 11  CRBI       263 non-null    int64  
 12  CWalks     263 non-null    int64  
 13  League     263 non-null    object 
 14  Division   263 non-null    object 
 15  PutOuts    263 non-null    int64  
 16  Assists    263 non-null    int64  
 17  Errors     263 non-null    int64  
 18  Salary     263 non-null    float64
 19  NewLeague  263 non-null    object 

In [24]:
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
dummies.info()
print(dummies.head())

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, -Alan Ashby to -Willie Wilson
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   League_A     263 non-null    uint8
 1   League_N     263 non-null    uint8
 2   Division_E   263 non-null    uint8
 3   Division_W   263 non-null    uint8
 4   NewLeague_A  263 non-null    uint8
 5   NewLeague_N  263 non-null    uint8
dtypes: uint8(6)
memory usage: 3.6+ KB
                   League_A  League_N  Division_E  Division_W  NewLeague_A  \
Player                                                                       
-Alan Ashby               0         1           0           1            0   
-Alvin Davis              1         0           0           1            1   
-Andre Dawson             0         1           1           0            0   
-Andres Galarraga         0         1           1           0            0   
-Alfredo Griffin          1         0           0    

In [27]:
y = df.Salary

# Drop the column with the independent variable (Salary), and columns for which we created dummy variables
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
# Define the feature set X.
X = pd.concat([X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, -Alan Ashby to -Willie Wilson
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   AtBat        263 non-null    float64
 1   Hits         263 non-null    float64
 2   HmRun        263 non-null    float64
 3   Runs         263 non-null    float64
 4   RBI          263 non-null    float64
 5   Walks        263 non-null    float64
 6   Years        263 non-null    float64
 7   CAtBat       263 non-null    float64
 8   CHits        263 non-null    float64
 9   CHmRun       263 non-null    float64
 10  CRuns        263 non-null    float64
 11  CRBI         263 non-null    float64
 12  CWalks       263 non-null    float64
 13  PutOuts      263 non-null    float64
 14  Assists      263 non-null    float64
 15  Errors       263 non-null    float64
 16  League_N     263 non-null    uint8  
 17  Division_W   263 non-null    uint8  
 18  NewLeague_N  263 non-null    uint8

In [26]:
pca = PCA()
X_reduced = pca.fit_transform(scale(X))

In [28]:
print(pca.components_.shape)

(19, 19)


In [29]:
pca.components_

array([[ 1.98290351e-01,  1.95861293e-01,  2.04368923e-01,
         1.98337092e-01,  2.35173803e-01,  2.08923752e-01,
         2.82575450e-01,  3.30462926e-01,  3.30741680e-01,
         3.18979493e-01,  3.38207859e-01,  3.40342839e-01,
         3.16802936e-01,  7.76971752e-02, -8.41641266e-04,
        -7.85936949e-03, -5.44708722e-02, -2.57252900e-02,
        -4.19103083e-02],
       [-3.83784030e-01, -3.77271117e-01, -2.37135612e-01,
        -3.77721344e-01, -3.14531203e-01, -2.29606097e-01,
         2.62401948e-01,  1.92903821e-01,  1.82898829e-01,
         1.26297318e-01,  1.72276112e-01,  1.68092078e-01,
         1.92314962e-01, -1.55736631e-01, -1.68651886e-01,
        -2.00759919e-01,  9.52132358e-02,  3.66795693e-02,
         7.75835646e-02],
       [ 8.86259262e-02,  7.40322605e-02, -2.16185630e-01,
        -1.71664221e-02, -7.30853444e-02,  4.56359160e-02,
         3.45809704e-02,  8.35744193e-02,  8.62510743e-02,
        -8.62723280e-02,  5.29956515e-02,  1.49927391e-02,
    

In [30]:
pd.DataFrame(pca.components_.T).loc[:4,:5]

Unnamed: 0,0,1,2,3,4,5
0,0.19829,-0.383784,0.088626,0.031967,0.028117,-0.070646
1,0.195861,-0.377271,0.074032,0.017982,-0.004652,-0.08224
2,0.204369,-0.237136,-0.216186,-0.235831,0.07766,-0.149646
3,0.198337,-0.377721,-0.017166,-0.049942,-0.038536,-0.13666
4,0.235174,-0.314531,-0.073085,-0.138985,0.024299,-0.111675


In [32]:
print(X_reduced.shape)
pd.DataFrame(X_reduced).loc[:4,:4]

(263, 19)


Unnamed: 0,0,1,2,3,4
0,-0.009649,1.870522,1.265145,-0.935481,1.109636
1,0.411434,-2.429422,-0.909193,-0.264212,1.232031
2,3.466822,0.825947,0.555469,-1.616726,-0.857488
3,-2.558317,-0.230984,0.519642,-2.176251,-0.820301
4,1.027702,-1.573537,1.331382,3.494004,0.983427


In [33]:
pca.explained_variance_ratio_

array([3.83142396e-01, 2.18410758e-01, 1.06863592e-01, 8.19251975e-02,
       5.25608081e-02, 4.34450422e-02, 3.62810828e-02, 2.70015556e-02,
       1.31964802e-02, 9.72721749e-03, 7.22341252e-03, 6.70946072e-03,
       5.03086617e-03, 3.21246472e-03, 2.73557843e-03, 1.47396692e-03,
       7.41715623e-04, 2.55915858e-04, 6.24891919e-05])

In [34]:
# Variance explained by the principal components
np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)

array([38.31, 60.15, 70.84, 79.03, 84.29, 88.63, 92.26, 94.96, 96.28,
       97.25, 97.97, 98.64, 99.14, 99.46, 99.73, 99.88, 99.95, 99.98,
       99.99])

In [36]:
pls = PLSRegression(n_components=2)
pls.fit(scale(X), y)

PLSRegression()

In [37]:
pls.x_scores_

array([[-1.09016869e-01, -8.79474174e-02],
       [ 6.67094747e-01,  8.78568677e-01],
       [ 3.47170209e+00,  5.27049567e-01],
       [-2.12985940e+00,  2.45419359e+00],
       [ 9.77084193e-01, -7.93661613e-01],
       [-4.00366856e+00,  1.49999021e-01],
       [-3.66849693e+00, -1.34396286e+00],
       [-3.42620626e+00, -3.02668072e-01],
       [ 3.51841988e+00, -1.37455643e+00],
       [ 3.29319170e+00,  1.71587394e-01],
       [-2.30123005e+00, -4.60924189e-01],
       [-6.86753352e-01,  1.27417723e+00],
       [-2.44684707e+00, -5.48789090e-01],
       [-1.20533560e+00, -2.32715786e-01],
       [ 5.48065676e+00, -7.10108781e-01],
       [-3.88520123e+00, -1.73910837e+00],
       [-3.47816827e+00, -8.21788232e-01],
       [-1.35015126e+00,  1.89883103e+00],
       [-1.98435971e+00,  6.04910877e-01],
       [ 3.06549186e-01,  1.00852303e+00],
       [ 6.05972111e+00,  9.64035082e-01],
       [ 1.06948319e+00,  1.39831018e+00],
       [-2.00864765e+00,  6.90510621e-01],
       [ 1.

In [None]:
mean_squared_error(y, pls.predict(scale(X)))