# PCR MODEL

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

In [6]:
# Read the Hitters data set; `hit` keeps the raw frame exactly as loaded.
hit = pd.read_csv("Hitters.csv")
# `df` is an independent copy with every row that contains a missing value removed.
df = hit.copy().dropna()
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


In [9]:
# Summary statistics of the numeric columns, transposed so each variable is a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AtBat,263.0,403.642586,147.307209,19.0,282.5,413.0,526.0,687.0
Hits,263.0,107.828897,45.125326,1.0,71.5,103.0,141.5,238.0
HmRun,263.0,11.619772,8.757108,0.0,5.0,9.0,18.0,40.0
Runs,263.0,54.745247,25.539816,0.0,33.5,52.0,73.0,130.0
RBI,263.0,51.486692,25.882714,0.0,30.0,47.0,71.0,121.0
Walks,263.0,41.114068,21.718056,0.0,23.0,37.0,57.0,105.0
Years,263.0,7.311787,4.793616,1.0,4.0,6.0,10.0,24.0
CAtBat,263.0,2657.543726,2286.582929,19.0,842.5,1931.0,3890.5,14053.0
CHits,263.0,722.186312,648.199644,4.0,212.0,516.0,1054.0,4256.0
CHmRun,263.0,69.239544,82.197581,0.0,15.0,40.0,92.5,548.0


In [11]:
# get_dummies converts each categorical column into dummy / indicator columns,
# one per category (e.g. League -> League_A and League_N). A 1 in League_A
# marks the rows whose League value is "A".
dms = pd.get_dummies(df.loc[:, ["League", 'Division', 'NewLeague']])
dms.head()

Unnamed: 0,League_A,League_N,Division_E,Division_W,NewLeague_A,NewLeague_N
1,0,1,0,1,0,1
2,1,0,0,1,1,0
3,0,1,1,0,0,1
4,0,1,1,0,0,1
5,1,0,0,1,1,0


In [13]:
# Dummy-variable trap: every categorical column produced two complementary
# indicator columns, so only one indicator per category may be kept.
# Here the target is split off and the raw categorical columns are removed
# from the predictors; the remaining columns are cast to float64.
y = df["Salary"]
X_ = (
    df
    .drop(columns=["Salary", "League", 'Division', 'NewLeague'])
    .astype("float64")
)

In [15]:
# Predictor frame after removing the dependent variable (Salary) and the
# three raw categorical columns.
X_.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,632.0,43.0,10.0
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,880.0,82.0,14.0
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,200.0,11.0,3.0
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,805.0,40.0,4.0
5,594.0,169.0,4.0,74.0,51.0,35.0,11.0,4408.0,1133.0,19.0,501.0,336.0,194.0,282.0,421.0,25.0


In [17]:
# Final design matrix: the numeric predictors plus one indicator column per
# categorical variable (the complementary *_A / *_E columns are left out to
# avoid the dummy-variable trap).
X = pd.concat(
    [X_, dms[["League_N", 'Division_W', 'NewLeague_N']]],
    axis="columns",
)
X.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,League_N,Division_W,NewLeague_N
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,632.0,43.0,10.0,1,1,1
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,880.0,82.0,14.0,0,1,0
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,200.0,11.0,3.0,1,0,1
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,805.0,40.0,4.0,1,0,1
5,594.0,169.0,4.0,74.0,51.0,35.0,11.0,4408.0,1133.0,19.0,501.0,336.0,194.0,282.0,421.0,25.0,0,1,0


In [20]:
# Hold out 25% of the rows as a test set; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
# Report the shape of each piece of the split.
for label, part in (
    ("X train", X_train),
    ("y train", y_train),
    ("X test", X_test),
    ("y test", y_test),
):
    print(label, part.shape)
# NOTE: despite its name, `training` is a copy of the whole cleaned frame `df`
# (all rows, Salary included), not of the training split.
training = df.copy()
print("training: ", training.shape)

X train (197, 19)
y train (197,)
X test (66, 19)
y test (66,)
training:  (263, 20)


In [35]:
# Set up the PCA estimator used to derive the principal components.
# The class is spelled `PCA`; importing the lower-case name `pca` does not bind
# the class, so the `PCA()` call below would fail on a fresh kernel.
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
pca = PCA()  # default settings: no explicit limit on the number of components

In [36]:
# Standardise the training predictors, then fit the PCA on them and project
# the same rows onto the resulting components.
X_train_scaled = scale(X_train)
X_reduce_train = pca.fit_transform(X_train_scaled)

In [37]:
# Display the PCA-transformed training matrix (component scores per row).
X_reduce_train

array([[-2.49569913e+00, -3.37762397e-01,  7.06391950e-01, ...,
         3.70733348e-03,  1.37933445e-03, -6.63814471e-03],
       [ 1.57074119e+00, -6.26413698e-01,  3.18877005e+00, ...,
         2.72401324e-02, -1.68108943e-01,  1.20560996e-01],
       [ 1.50009234e+00, -1.72148910e+00, -1.68448622e+00, ...,
         5.53398813e-02, -9.08519513e-02, -1.15464804e-02],
       ...,
       [-3.77250745e-01, -3.00549313e+00, -1.82464645e+00, ...,
         1.23657225e-03,  1.73951625e-02, -5.47217393e-04],
       [ 9.27889192e-01,  1.35566547e+00, -6.49981255e-01, ...,
        -1.94913051e-01, -4.31783126e-02,  1.73446277e-03],
       [-2.50242254e+00, -8.43529483e-01,  5.89109906e-01, ...,
         4.50587682e-02,  1.03607049e-01, -2.21172969e-02]])

In [38]:
# Component scores of the first training row, kept as a 2-D (1 x n_components) slice.
X_reduce_train[:1]

array([[-2.49569913e+00, -3.37762397e-01,  7.06391950e-01,
        -1.32791025e+00, -8.21824333e-01, -6.62790677e-01,
        -6.56764789e-01,  3.68093279e-02, -2.03665105e-01,
         1.76134815e-01, -9.20131987e-02,  2.40129020e-01,
        -3.60473661e-03, -3.41246327e-02,  4.32799605e-02,
         1.02996923e-01,  3.70733348e-03,  1.37933445e-03,
        -6.63814471e-03]])

In [40]:
# Cumulative percentage of variance explained by the first ten components.
# The 1st component alone explains about 38% of the total variability in the
# data; the 1st and 2nd together explain about 60% (59.88).
# The cumulative share keeps growing as more components are added.
(np.round(pca.explained_variance_ratio_, 4) * 100).cumsum()[:10]

array([38.18, 59.88, 70.88, 78.88, 84.18, 88.45, 92.05, 94.86, 96.34,
       97.28])

In [45]:
# By the 5th component about 84% of the variability is already explained, so
# for efficiency it would be enough to keep the components up to the 5th.
# NOTE(review): the fit below still passes every column of X_reduce_train,
# not only the first five components.
lm = LinearRegression()
lm.fit(X_reduce_train, y_train)
pcr_model = lm  # fit() returns the estimator itself, so both names refer to the fitted model
pcr_model.intercept_  # intercept (constant term)

543.4834416243655

In [46]:
pcr_model.coef_ # regression coefficients, one per principal component used in the fit

array([ 111.13977427,  -29.34209502,   26.29799759,  -38.47549852,
        -56.9200785 ,   54.44779423,   40.77493384,  -23.72746012,
          9.31198164,   13.02031672,   45.58357748,   31.97791627,
         18.93930958, -115.60940171,   24.00382778,  415.70806202,
       -449.51779543,  563.07375399,  302.53718462])

### Tahmin

In [47]:
# In-sample predictions: the model is evaluated on the same transformed
# training rows it was fitted on.
y_pred = pcr_model.predict(X_reduce_train)
# Peek at the first five predicted salaries.
y_pred[:5]

array([377.44484744, 802.19452124, 495.60987745, 112.53177731,
       426.21613066])

In [49]:
# Mean of the Salary column over the whole cleaned frame `df`.
df["Salary"].mean()

535.9258821292775