## Import Package

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression

## Load Dataset

In [None]:
boston = pd.read_csv('/content/drive/MyDrive/Materi Day 24/dataset/housing.csv')
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


## Data Preprocessing

In [None]:
# basic checking (cek missing value dan duplikat data)

boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [None]:
boston.duplicated().sum()

0

Aman, tidak ada missing value dan duplikat data

In [None]:
# selanjutnya memisahkan target

x = boston.drop('MEDV', axis=1)
y = boston['MEDV']

In [None]:
# scaling

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x)
x_std = scaler.transform(x)

df = pd.DataFrame(data = x_std, columns = list(x))
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,-0.303094,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,-0.303094,0.396427,-1.208727
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501


## PCA Process

In [None]:
# menghitung covariance matrix

covariance_matrix = np.cov(x_std.T)
covariance_matrix

array([[ 1.0019802 , -0.20086619,  0.40738853, -0.05600226,  0.42180532,
        -0.21968085,  0.35343273, -0.38042191,  0.62674377,  0.5839183 ,
         0.29051973, -0.38582644,  0.4565237 ],
       [-0.20086619,  1.0019802 , -0.53488527, -0.04278127, -0.51762669,
         0.31260839, -0.57066514,  0.66572388, -0.31256554, -0.31518622,
        -0.39245415,  0.17586788, -0.41381239],
       [ 0.40738853, -0.53488527,  1.0019802 ,  0.06306266,  0.76516363,
        -0.39245145,  0.6460553 , -0.70942902,  0.59630775,  0.72218743,
         0.38400646, -0.35768342,  0.60499536],
       [-0.05600226, -0.04278127,  0.06306266,  1.0019802 ,  0.09138341,
         0.09143192,  0.0866891 , -0.09937217, -0.00738283, -0.03565699,
        -0.1217558 ,  0.0488851 , -0.05403609],
       [ 0.42180532, -0.51762669,  0.76516363,  0.09138341,  1.0019802 ,
        -0.30278658,  0.73291856, -0.77075334,  0.61265134,  0.66934602,
         0.1893068 , -0.38080321,  0.59204898],
       [-0.21968085,  0.312608

In [None]:
# mencari nilai Eigenvalue dan Eigenvector dari Covariance Matrix

eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)

print('Eigenvectors:\n', eigen_vectors, '\n')
print('Eigenvalues:\n', eigen_values, '\n')

Eigenvectors:
 [[-0.2509514   0.31525237 -0.24656649 -0.06177071  0.08215692  0.21965961
  -0.77760721 -0.15335048 -0.0459523  -0.26039028  0.08676107  0.10964435
  -0.01936913]
 [ 0.25631454  0.3233129  -0.29585782 -0.12871159  0.32061699  0.3233881
   0.27499628  0.40268031  0.08091897 -0.35813749 -0.07142528 -0.26275629
  -0.26752723]
 [-0.34667207 -0.11249291  0.01594592 -0.01714571 -0.00781119  0.0761379
   0.33957645 -0.17393172  0.25107654 -0.64441615 -0.11319963  0.30316943
   0.36353226]
 [-0.00504243 -0.45482914 -0.28978082 -0.81594136  0.08653094 -0.16749014
  -0.07413621  0.02466215 -0.03592171  0.01372777 -0.00398268 -0.01392667
   0.00618184]
 [-0.34285231 -0.21911553 -0.12096411  0.12822614  0.13685356  0.15298267
   0.19963484 -0.08012056 -0.04363045  0.01852201  0.80432257 -0.11131888
  -0.23105645]
 [ 0.18924257 -0.14933154 -0.59396117  0.28059184 -0.4234472  -0.05926707
  -0.06393992  0.32675226 -0.0455671  -0.04789804  0.15287286 -0.05316154
   0.43142019]
 [-0.3136

In [None]:
# menampilkan eigenvalues dalam bentuk rasio

variance_explained = []
for i in eigen_values:
    variance_explained.append((i/sum(eigen_values))*100)
print(variance_explained)

[47.12960635727468, 11.02519324758802, 9.558589792622927, 6.596731600836161, 6.4216610521373605, 5.056978272291275, 4.118123739654422, 3.0469024189856633, 0.488532772626724, 2.1303333054001587, 1.3023305792840114, 1.4308797464566017, 1.6941371148420055]


Ada 13 pc dan dari 13 pc ini akan direduksi menjadi beberapa pc saja yang akan digunakan untuk masuk ke proses modelling. Jika kita menggunakan pc1, maka data original yang tercover hanya sekitar 47% saja sedangkan berdasarkan teori minimal data original yang tercover setelah direduksi = 80%.
Agar lebih mudah, kita coba mengidentifikasi komponen-komponen yang mengcover data original hingga 80%

In [None]:
cumulative_variance_explained = np.cumsum(variance_explained)

print(cumulative_variance_explained)

[ 47.12960636  58.1547996   67.7133894   74.310121    80.73178205
  85.78876032  89.90688406  92.95378648  93.44231925  95.57265256
  96.87498314  98.30586289 100.        ]


Dari perhitungan di atas kita tahu bahwa 5 pc pertama sudah mengcover 80% data original, namun saat ini saya ingin menggunakan 10 pc sehingga data original yang tercover = 95.57265256%

In [None]:
projection_matrix = (eigen_vectors.T[:][:10]).T
print(projection_matrix)

[[-0.2509514   0.31525237 -0.24656649 -0.06177071  0.08215692  0.21965961
  -0.77760721 -0.15335048 -0.0459523  -0.26039028]
 [ 0.25631454  0.3233129  -0.29585782 -0.12871159  0.32061699  0.3233881
   0.27499628  0.40268031  0.08091897 -0.35813749]
 [-0.34667207 -0.11249291  0.01594592 -0.01714571 -0.00781119  0.0761379
   0.33957645 -0.17393172  0.25107654 -0.64441615]
 [-0.00504243 -0.45482914 -0.28978082 -0.81594136  0.08653094 -0.16749014
  -0.07413621  0.02466215 -0.03592171  0.01372777]
 [-0.34285231 -0.21911553 -0.12096411  0.12822614  0.13685356  0.15298267
   0.19963484 -0.08012056 -0.04363045  0.01852201]
 [ 0.18924257 -0.14933154 -0.59396117  0.28059184 -0.4234472  -0.05926707
  -0.06393992  0.32675226 -0.0455671  -0.04789804]
 [-0.3136706  -0.31197778  0.01767481  0.17520603  0.01669085  0.07170914
  -0.11601071  0.60082292  0.03855068  0.06756218]
 [ 0.32154387  0.34907     0.04973627 -0.21543585  0.09859225 -0.02343872
   0.10390044  0.12181198  0.01829854  0.15329124]
 [

In [None]:
# menampilkan pc hasil reduksi

boston_pca = x.dot(projection_matrix)
boston_pca.columns = ['pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7', 'pc8', 'pc9', 'pc10']
boston_pca['MEDV'] = boston['MEDV']
boston_pca.head()

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,MEDV
0,-38.890181,-32.935324,51.873961,-92.167019,-178.427648,382.499065,63.253078,32.971896,-206.962371,64.423458,24.0
1,-33.023432,-54.798669,71.207997,-83.321389,-176.841195,367.268772,41.167341,40.135316,-167.304566,59.975715,21.6
2,-26.538735,-48.768409,67.853639,-85.185186,-178.071394,362.405072,43.914223,27.50183,-167.918899,57.556203,34.7
3,-12.756987,-47.783518,72.338822,-86.619758,-177.275189,358.347339,37.954676,20.639702,-154.655918,56.427659,33.4
4,-15.652406,-50.778719,73.709208,-85.656877,-177.036592,360.89995,36.574611,26.755055,-154.388828,57.546991,36.2


Dari 13 pc sudah direduksi menjadi 10 pc, dimana 10 pc ini mengcover sekitar 95% data original

## Modelling

In [None]:
# kita coba menggunakan model Decision Tree

from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import train_test_split

In [None]:
x_new = boston_pca.drop('MEDV', axis = 1)
y_new = boston_pca['MEDV']

x_train_pca, x_test_pca, y_train_pca, y_test_pca = train_test_split(x_new, y_new, test_size = 0.25, random_state = 42)

model_pca = DecisionTreeRegressor()
model_pca.fit(x_train_pca, y_train_pca)

In [None]:
# kita coba bandingkan train dan test pca dengan data asli

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)

model = DecisionTreeRegressor()
model.fit(x_train, y_train)

In [None]:
# Predicting

y_pred_pca = model_pca.predict(x_test_pca)

y_pred = model.predict(x_test)

In [None]:
# Calculate the RMSE

from sklearn.metrics import mean_squared_error

# RMSE with PCA
mse_pca = mean_squared_error(y_test_pca, y_pred_pca)
rmse_pca = np.sqrt(mse_pca)
print('RMSE with PCA:', rmse_pca)

# RMSE with no PCA
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print('RMSE with no PCA:', rmse)

RMSE with PCA: 7.432509988180941
RMSE with no PCA: 4.673968790221914


Berdasarkan nilai evaluasi model di atas, kita mengetahui bahwa nilai RMSE with PCA lebih besar dibandingkan nilai RMSE with no PCA. Secara umum, semakin kecil nilai RMSE maka makin bagus performa suatu model. Dalam kasus ini, maka performa model Decision Tree Regressor tanpa PCA lebih baik dibandingkan menggunakan PCA.