In [82]:
import pandas as pd
import numpy as np
import sklearn.decomposition as dec
import warnings
# Install an 'ignore' filter with no category or message restriction, so
# warnings raised by the cells below are not displayed.
# NOTE(review): a blanket filter also hides warnings that point at real
# problems — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Principal Component Analysis

## Read the data

In [83]:
# Load the preprocessed train/test splits and report their shapes.
train_data = pd.read_csv('../datasets/preprocessed/train.csv', sep=',')
test_data = pd.read_csv('../datasets/preprocessed/test.csv', sep=',')
print(f'Train data size {train_data.shape}')
print(f'Test data size {test_data.shape}')

# Stack the train rows on top of the test rows into a single frame.
# ignore_index=True renumbers the rows 0..n-1; with ignore_index=False the
# test rows reused row labels already taken by train rows, leaving the
# combined frame with duplicate index labels.
# 'Id' is dropped with a chained (non in-place) call so `data` is always a
# freshly built frame.
data = pd.concat([train_data, test_data], ignore_index=True).drop(columns=['Id'])

Train data size (1006, 48)
Test data size (432, 48)


In [84]:
# Preview the first five rows of the combined frame.
data.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotArea,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,...,Fence,MiscVal,SaleType,SaleCondition,SalePrice,MasVnr,SecondFloor,Baths,Porch,Pool
0,G,RH,0.185945,1.0,Lvl,Inside,Edwards,Artery,1Fam,2Story,...,0.0,0.0,WD,Normal,Level2,0.0,1.0,0.4,True,0.0
1,A,RL,0.19889,1.0,Lvl,Inside,NAmes,Norm,1Fam,1Story,...,1.0,0.0,WD,Family,Level2,1.0,0.0,0.0,True,0.0
2,L,RL,0.260616,1.0,Lvl,Corner,NridgHt,Norm,Twnhs,1Story,...,0.0,0.0,New,Partial,Level4,1.0,0.0,0.4,True,0.0
3,A,RL,0.25123,1.0,Lvl,Inside,NAmes,Norm,1Fam,1Story,...,1.0,0.0,WD,Abnorml,Level1,1.0,0.0,0.0,False,0.0
4,E,RL,0.174186,1.0,Lvl,Inside,SWISU,Norm,1Fam,1.5Fin,...,0.0,0.0,WD,Normal,Level2,0.0,1.0,0.4,True,0.0


## Perform PCA

In [85]:
# Restrict the PCA input to the numeric columns of the combined frame.
X = data.select_dtypes(include=['number'])
# Estimator with default settings (no component count requested).
pca = dec.PCA()
# Preview the numeric feature matrix.
X.head(n=5)

Unnamed: 0,LotArea,LotShape,OverallQual,YearBuilt,ExterQual,TotalBsmtSF,HeatingQC,CentralAir,GrLivArea,BedroomAbvGr,...,KitchenQual,TotRmsAbvGrd,Fireplaces,GarageArea,Fence,MiscVal,MasVnr,SecondFloor,Baths,Pool
0,0.185945,1.0,0.555556,0.384058,0.333333,0.209607,0.5,0.0,0.249879,0.375,...,0.333333,0.333333,0.0,0.215827,0.0,0.0,0.0,1.0,0.4,0.0
1,0.19889,1.0,0.444444,0.637681,0.333333,0.315034,1.0,1.0,0.193143,0.25,...,0.333333,0.333333,0.0,0.182734,1.0,0.0,1.0,0.0,0.0,0.0
2,0.260616,1.0,0.888889,0.985507,1.0,0.53088,1.0,1.0,0.330275,0.125,...,1.0,0.416667,0.333333,0.756835,0.0,0.0,1.0,0.0,0.4,0.0
3,0.25123,1.0,0.444444,0.644928,0.333333,0.385215,0.5,1.0,0.217528,0.25,...,0.333333,0.333333,0.0,0.345324,1.0,0.0,1.0,0.0,0.0,0.0
4,0.174186,1.0,0.666667,0.405797,0.333333,0.268559,0.75,1.0,0.275954,0.375,...,0.333333,0.333333,0.333333,0.155396,0.0,0.0,0.0,1.0,0.4,0.0


In [86]:
# Fit the decomposition on every row of X. The call is the cell's last
# expression, so the fitted estimator's repr is displayed.
# NOTE(review): X holds the train rows followed by the test rows, so the test
# split contributes to the fitted components — confirm this is intended.
pca.fit(X=X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [87]:
# Fraction of the total variance explained by each component, in component order.
print(pca.explained_variance_ratio_)

[0.25564357 0.20193523 0.16161059 0.11836411 0.0770537  0.03889735
 0.03436969 0.02321467 0.01844822 0.01651332 0.01022641 0.00916847
 0.00774689 0.00735747 0.00548387 0.00372641 0.00318777 0.00296715
 0.00209089 0.00123523 0.00075898]


In [88]:
# Running total of the explained-variance ratios: entry k is the share of
# variance retained when keeping the first k+1 components.
print(np.cumsum(pca.explained_variance_ratio_))

[0.25564357 0.4575788  0.61918939 0.7375535  0.8146072  0.85350456
 0.88787425 0.91108892 0.92953714 0.94605046 0.95627687 0.96544534
 0.97319223 0.9805497  0.98603357 0.98975998 0.99294775 0.9959149
 0.99800579 0.99924102 1.        ]


In [89]:
# Singular values associated with each fitted component, in component order.
print(pca.singular_values_)

[22.17968559 19.71260381 17.63489423 15.09204065 12.17684259  8.65163089
  8.13253197  6.68373544  5.95820126  5.63709247  4.43608081  4.20035905
  3.86101551  3.76272271  3.24849001  2.67783282  2.47674573  2.38950511
  2.00587244  1.54174163  1.20851711]


In [90]:
# Project the rows of X onto the fitted components.
X_new = pca.transform(X)

In [91]:
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Loadings: each component's coefficients scaled by the square root of the
# variance it explains. After the transpose, rows follow the columns of X and
# columns follow the components.
np.sqrt(pca.explained_variance_) * pca.components_.T

array([[ 0.03580746,  0.0091717 ,  0.03155233,  0.00843272, -0.01240897,
         0.04197003,  0.04848173,  0.00645135,  0.06035143,  0.01038473,
        -0.02003434,  0.04514658,  0.05842062,  0.04427633, -0.00615815,
         0.00353218,  0.00753083, -0.00263482, -0.00325607, -0.00115262,
        -0.0005452 ],
       [-0.24598944, -0.01772718, -0.40216702, -0.05197136, -0.06665804,
         0.01386206,  0.01530959, -0.00479205,  0.00313804, -0.00532461,
        -0.001469  ,  0.00240624,  0.00473028,  0.00236944, -0.00037773,
         0.00007439, -0.00013956, -0.00027039,  0.00019108,  0.00009752,
         0.00009219],
       [ 0.10454963,  0.02794992, -0.01754658, -0.01313152, -0.06064987,
         0.03452143, -0.00042167, -0.01146064, -0.01035786,  0.0133865 ,
        -0.0218343 , -0.02703836, -0.01164998,  0.00262802, -0.04504197,
         0.04773733,  0.00825776, -0.00276393, -0.0086032 , -0.00180631,
        -0.00028881],
       [ 0.14981527, -0.02478137, -0.01104755, -0.03620914

In [92]:
# Number of projected rows (train and test combined).
X_new.shape[0]

1438

In [93]:
# Keep only the first six principal components; the cumulative
# explained-variance ratio reaches about 0.85 at the sixth component.
# Rows and columns both get plain 0-based integer labels.
data_new = pd.DataFrame(
    X_new[:, :6],
    index=list(range(X_new.shape[0])),
    columns=list(range(6)),
)

In [94]:
# Display the reduced frame: one row per input row, one column per kept component.
data_new

Unnamed: 0,0,1,2,3,4,5
0,-0.753748,0.636941,-0.164025,-0.204247,0.549835,0.539191
1,-0.117306,-0.688384,-0.493263,0.796882,-0.008604,-0.239237
2,0.787114,-0.469485,-0.626444,-0.272131,-0.467151,0.305142
3,-0.172825,-0.708999,-0.480148,0.864432,0.145394,-0.106748
4,-0.488369,0.642514,-0.179286,-0.202405,0.023691,-0.091447
...,...,...,...,...,...,...
1433,-0.171814,-0.338120,0.821891,-0.260273,0.125090,-0.189923
1434,-1.004679,0.596731,-0.125332,0.722194,0.295212,0.767793
1435,-0.256558,-0.557312,-0.499222,-0.003437,1.051756,0.523491
1436,-0.526012,-0.374721,-0.061153,-0.341447,-0.036702,0.006344


In [95]:
# Split the PCA scores back into the train and test partitions and save them.
#
# The combined frame was built with the train rows first, so the first
# len(train_data) rows of data_new are the train partition. Deriving the
# boundary from the frame replaces the hard-coded 1005/1006 labels, which
# would silently mis-split if the input sizes ever change. train_data has the
# same number of rows before and after this cell runs, so re-running the cell
# yields the same split.
n_train = len(train_data)

# .copy() makes each partition an independent frame; adding a column to a
# bare slice of data_new is a chained assignment that pandas flags with
# SettingWithCopyWarning.
train_data = data_new.iloc[:n_train].copy()
test_data = data_new.iloc[n_train:].copy()

# Each partition gets its own 0-based running Id.
train_data['Id'] = list(range(len(train_data)))
test_data['Id'] = list(range(len(test_data)))

print(f'Train data size {train_data.shape}')
print(f'Test data size {test_data.shape}')

# index=False keeps the row labels out of the files; the header row is written.
train_data.to_csv('../datasets/preprocessed/train_pca.csv', index=False, header=True)
test_data.to_csv('../datasets/preprocessed/test_pca.csv', index=False, header=True)

Train data size (1006, 7)
Test data size (432, 7)
