## PCA

In [66]:
import numpy as np
import pandas as pd

In [67]:
data = {
    "x1" : [0.2, 0.45, 0.33, 0.54, 0.77],
    "x2" : [5.6, 5.87, 6.37, 7.9, 7.87],
    "x3" : [3.56, 2.4, 1.95, 1.32, 0.98],
    "y" : [100, 70, 60, 45, 30]
}

In [68]:
df = pd.DataFrame(data)
df

Unnamed: 0,x1,x2,x3,y
0,0.2,5.6,3.56,100
1,0.45,5.87,2.4,70
2,0.33,6.37,1.95,60
3,0.54,7.9,1.32,45
4,0.77,7.87,0.98,30


In [69]:
X = df[["x1","x2","x3"]]
X

Unnamed: 0,x1,x2,x3
0,0.2,5.6,3.56
1,0.45,5.87,2.4
2,0.33,6.37,1.95
3,0.54,7.9,1.32
4,0.77,7.87,0.98


In [70]:
X.mean()

x1    0.458
x2    6.722
x3    2.042
dtype: float64

In [71]:
X.std()

x1    0.216264
x2    1.097073
x3    1.011197
dtype: float64

In [72]:
X = (X - X.mean())
X

Unnamed: 0,x1,x2,x3
0,-0.258,-1.122,1.518
1,-0.008,-0.852,0.358
2,-0.128,-0.352,-0.092
3,0.082,1.178,-0.722
4,0.312,1.148,-1.062


In [73]:
X = (X/X.std())
X

Unnamed: 0,x1,x2,x3
0,-1.192988,-1.022721,1.501191
1,-0.036992,-0.776612,0.354036
2,-0.59187,-0.320854,-0.090981
3,0.379167,1.073766,-0.714005
4,1.442683,1.04642,-1.05024


In [74]:
print(X.mean())
print(X.std())

x1    0.000000e+00
x2    4.884981e-16
x3   -2.664535e-16
dtype: float64
x1    1.0
x2    1.0
x3    1.0
dtype: float64


In [75]:
# 분사을 구함
X_cov = np.dot(X.T, X) / 15
print(X_cov)

[[ 0.26666667  0.22370099 -0.23573601]
 [ 0.22370099  0.26666667 -0.24311486]
 [-0.23573601 -0.24311486  0.26666667]]


In [76]:
eigenvalue, eigenvector = np.linalg.eig(X_cov)

print('고유값: ', eigenvalue)
print('고유벡터: ', eigenvector)

고유값:  [0.73512513 0.04377061 0.02110426]
고유벡터:  [[ 0.56993665  0.7754158   0.27185024]
 [ 0.57618187 -0.61301734  0.54057764]
 [-0.58582135  0.15145983  0.79616158]]


In [77]:
sum(eigenvalue)

0.7999999999999996

In [78]:
eigenvalue[0]/sum(eigenvalue)

0.9189064087143994

In [79]:
z1 = X.dot(eigenvector.T[0])
z1

0   -2.148630
1   -0.675954
2   -0.468900
3    1.253065
4    2.040419
dtype: float64

In [80]:
from sklearn import linear_model

linear_regression = linear_model.LinearRegression()

linear_regression.fit(X = X, y = df["y"])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [81]:
# 모든 독립변수로 선형회귀시 R_square
linear_regression.score(X = X, y = df["y"])

0.9978441052442628

In [82]:
linear_regression.fit(X = pd.DataFrame(z1), y = df["y"])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [83]:
# z1 값만 이용해서 선형회귀시 R_square
# 별로 차이없음
linear_regression.score(X = pd.DataFrame(z1), y = df["y"])

0.959290206663969

## sklearn 을 사용한 PCA

In [84]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)

In [85]:
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [86]:
print('고유값 : ', pca.singular_values_)
print('고유값 벡터 : \n', pca.components_.T)

고유값 :  [3.32067416 0.81028337 0.56264017]
고유값 벡터 : 
 [[-0.56993665  0.7754158  -0.27185024]
 [-0.57618187 -0.61301734 -0.54057764]
 [ 0.58582135  0.15145983 -0.79616158]]


In [92]:
z = pca.fit_transform(X)
z

array([[ 2.14863038, -0.07074582, -0.31801621],
       [ 0.6759543 ,  0.50101454,  0.1480055 ],
       [ 0.46889967, -0.27603645,  0.40678208],
       [-1.25306486, -0.47236838, -0.115067  ],
       [-2.04041949,  0.31813611, -0.12170436]])

In [93]:
linear_regression.fit(X = pd.DataFrame(z.T[0]), y = df["y"])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [94]:
linear_regression.score(X = pd.DataFrame(z.T[0]), y = df["y"])

0.9592902066639692

## PCA를 이용한 보스턴 집값 분석

In [104]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [105]:
from sklearn.datasets import load_boston
boston_dataset = load_boston()

boston = pd.DataFrame(boston_dataset.data, columns = boston_dataset.feature_names)
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [106]:
y = boston_dataset.target
y

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [107]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit(boston)

X = scaler.transform(boston)
X

array([[-0.41978194,  0.28482986, -1.2879095 , ..., -1.45900038,
         0.44105193, -1.0755623 ],
       [-0.41733926, -0.48772236, -0.59338101, ..., -0.30309415,
         0.44105193, -0.49243937],
       [-0.41734159, -0.48772236, -0.59338101, ..., -0.30309415,
         0.39642699, -1.2087274 ],
       ...,
       [-0.41344658, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.98304761],
       [-0.40776407, -0.48772236,  0.11573841, ...,  1.17646583,
         0.4032249 , -0.86530163],
       [-0.41500016, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.66905833]])

In [108]:
X = pd.DataFrame(X, columns = boston_dataset.feature_names)
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,-0.303094,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,-0.303094,0.396427,-1.208727
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501


In [40]:
from sklearn.decomposition import PCA
pca = PCA(n_components=13)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=13, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [52]:
print(pca.singular_values_)

[55.6793095  26.93022859 25.07516773 20.83105866 20.55278239 18.23864114
 16.45874174 14.15716218 11.83779223 10.55653065  9.70171478  9.25566343
  5.66883461]


In [55]:
print(pca.components_.T)

[[ 0.2509514  -0.31525237  0.24656649  0.06177071  0.08215692 -0.21965961
   0.77760721  0.15335048  0.26039028  0.01936913 -0.10964435  0.08676107
   0.0459523 ]
 [-0.25631454 -0.3233129   0.29585782  0.12871159  0.32061699 -0.3233881
  -0.27499628 -0.40268031  0.35813749  0.26752723  0.26275629 -0.07142528
  -0.08091897]
 [ 0.34667207  0.11249291 -0.01594592  0.01714571 -0.00781119 -0.0761379
  -0.33957645  0.17393172  0.64441615 -0.36353226 -0.30316943 -0.11319963
  -0.25107654]
 [ 0.00504243  0.45482914  0.28978082  0.81594136  0.08653094  0.16749014
   0.07413621 -0.02466215 -0.01372777 -0.00618184  0.01392667 -0.00398268
   0.03592171]
 [ 0.34285231  0.21911553  0.12096411 -0.12822614  0.13685356 -0.15298267
  -0.19963484  0.08012056 -0.01852201  0.23105645  0.11131888  0.80432257
   0.04363045]
 [-0.18924257  0.14933154  0.59396117 -0.28059184 -0.4234472   0.05926707
   0.06393992 -0.32675226  0.04789804 -0.43142019  0.05316154  0.15287286
   0.0455671 ]
 [ 0.3136706   0.3119777

In [42]:
pca.singular_values_[0]/sum(pca.singular_values_)

0.2273147909110368

In [43]:
sum(pca.singular_values_[0:7])/sum(pca.singular_values_)

0.7502376425336473

In [44]:
sum(pca.singular_values_[0:8])/sum(pca.singular_values_)

0.8080352767158058

In [56]:
sum(pca.singular_values_)/sum(pca.singular_values_)

1.0

In [45]:
z = pca.transform(X)
z

array([[-2.09829747,  0.77311275,  0.34294273, ..., -0.03300036,
         0.01944023,  0.36597533],
       [-1.45725167,  0.59198521, -0.69519931, ..., -0.64080983,
        -0.12579741, -0.07071949],
       [-2.07459756,  0.5996394 ,  0.1671216 , ..., -0.48755672,
         0.13332653, -0.0140218 ],
       ...,
       [-0.31236047,  1.15524644, -0.40859759, ...,  0.11565634,
         0.28196407,  0.06247358],
       [-0.27051907,  1.04136158, -0.58545406, ...,  0.0870124 ,
         0.30170082,  0.05436991],
       [-0.12580322,  0.76197805, -1.294882  , ...,  0.18432101,
         0.23273318,  0.01970872]])

In [46]:
z.shape

(506, 13)

In [47]:
z=z.T
z=z[0:8]

In [48]:
z=z.T
z.shape

(506, 8)

In [49]:
zDF=pd.DataFrame(z)
zDF.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,-2.098297,0.773113,0.342943,-0.891774,0.42307,-0.315338,0.318641,0.295832
1,-1.457252,0.591985,-0.695199,-0.487459,-0.195876,0.264223,0.553861,-0.22367
2,-2.074598,0.599639,0.167122,-0.739204,-0.934534,0.448095,0.48456,0.105166
3,-2.611504,-0.006871,-0.100284,-0.343721,-1.104956,0.664649,0.622641,0.255941
4,-2.458185,0.097712,-0.075348,-0.427907,-1.065924,0.617047,0.705086,-0.134524


In [50]:
from sklearn.linear_model import LinearRegression
linear_regression = LinearRegression()
linear_regression.fit(zDF,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [51]:
linear_regression.score(X = zDF, y = y)

0.7055977712036576

In [109]:
# 비교
linear_regression2 = LinearRegression()
linear_regression2.fit(X,y)
prediction = linear_regression2.predict(X = pd.DataFrame(X))
residuals = y - prediction
SSE = (residuals**2).sum()
SST = ((y-y.mean())**2).sum()
R_squared = 1 - SSE/SST
R_squared

0.7406426641094095