In [235]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

In [236]:
df = pd.read_csv("train_cleaned.csv")

# Observe numeric variables for PCA 

In [237]:
df.select_dtypes(include=[np.number]).head().columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape',
       'LandSlope', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC', 'CentralAir',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'PoolQC', 'Fence', 'MiscVal', 'MoSold', 'YrSold', 'TotalSF',
       'SalePrice'],
      dtype='object')

We will include the TotRmsAbbvGrd even though it may be an ordinal variable the range of the values is quite wide. 
Year built and remodel date can be changed to the age of the building and the number of years it has been remodeled 

In [238]:
import datetime
df["Age"] = datetime.datetime.now().year-df['YearBuilt'] 
df["RemodAge"] = datetime.datetime.now().year-df['YearRemodAdd']

In [239]:
numeric = df[['LotFrontage', 'LotArea','MasVnrArea', 
   'BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF',
   '1stFlrSF', 'LowQualFinSF','GrLivArea','TotRmsAbvGrd',
   'GarageYrBlt','GarageArea','WoodDeckSF','OpenPorchSF',
   'EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea',
   'MiscVal','TotalSF','Age','RemodAge']]

#23 numerical predictors

# Standardize Data for PCA 
* For PCA the variables should be centered to have mean zero.
* In addition, the results of PCA will also depend on whether the variables have been individually scaled

In [240]:
from sklearn.preprocessing import StandardScaler
numeric = StandardScaler().fit_transform(numeric)
numeric.shape #(n_samples, n_features/predictors)

(1458, 23)

In [241]:
from sklearn.decomposition import PCA

#Dont set number of components so all components are kept
pca = PCA()
#fit and trasnform pca/ Eigen Decomposition/ dimension reduction
pcahouse = pca.fit_transform(numeric)
pcaloadings = pd.DataFrame(pcahouse.transpose())
pcaloadings18 = pcaloadings.iloc[:, [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]]
pcaloadings18.columns = ["PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","PC11","PC12","PC13","PC14","PC15","PC16", "PC17", "PC18"]
#pcaloadings15
print (pca.explained_variance_)
print (pca.explained_variance_ratio_)
print (pca.explained_variance_ratio_.cumsum())

[  5.49218951e+00   1.96714091e+00   1.69598740e+00   1.19122664e+00
   1.13891032e+00   1.10729839e+00   1.07407664e+00   1.03147727e+00
   9.97882158e-01   9.77485122e-01   8.96993061e-01   8.79831290e-01
   8.45395020e-01   7.56112698e-01   6.90281544e-01   6.60184051e-01
   6.15394363e-01   3.19281604e-01   3.02379222e-01   2.01794546e-01
   1.58678237e-01   4.77336878e-31   2.94624172e-32]
[  2.38790848e-01   8.55278655e-02   7.37385828e-02   5.17924628e-02
   4.95178402e-02   4.81434080e-02   4.66989844e-02   4.48468376e-02
   4.33861808e-02   4.24993531e-02   3.89996983e-02   3.82535343e-02
   3.67563052e-02   3.28744651e-02   3.00122410e-02   2.87036544e-02
   2.67562767e-02   1.38818089e-02   1.31469227e-02   8.77367591e-03
   6.89905379e-03   2.07537773e-32   1.28097466e-33]
[ 0.23879085  0.32431871  0.3980573   0.44984976  0.4993676   0.54751101
  0.59420999  0.63905683  0.68244301  0.72494236  0.76394206  0.8021956
  0.8389519   0.87182637  0.90183861  0.93054226  0.9572985

# Choosing the number of components 
* based on the elbow of the scree plot
* Kaiser Criterion (Eigenvalues < 1 since we have already scaled the data)
* Or if we just to retain 90% of the variability of the data set

In [242]:
# By observing the variance explained cumsum we see that inorder to keep
# at least 90% of the variability we need to keep at least up to the 15th principal component
print (pca.explained_variance_ratio_.cumsum())

[ 0.23879085  0.32431871  0.3980573   0.44984976  0.4993676   0.54751101
  0.59420999  0.63905683  0.68244301  0.72494236  0.76394206  0.8021956
  0.8389519   0.87182637  0.90183861  0.93054226  0.95729854  0.97118035
  0.98432727  0.99310095  1.          1.          1.        ]


In [243]:
# Principal component loadings 
pcaloadings18['Predictors'] = ['LotFrontage', 'LotArea','MasVnrArea', 'BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF', 'LowQualFinSF','GrLivArea','TotRmsAbvGrd','GarageYrBlt','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','TotalSF','Age','RemodAge']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [244]:
# Now we can observe the loadings 
pcaloadings18.set_index("Predictors")


Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18
Predictors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
LotFrontage,0.1917055,0.3960037,-1.316549,3.027847,-0.612567,3.01626,1.320555,-1.084522,-2.414237,-1.563405,3.253526,-1.946871,2.67608,-0.3278346,-1.895488,-1.249326,-2.41924,0.07546319
LotArea,-1.058227,-1.033833,2.478953,-0.3551621,-1.75434,-1.615046,1.184817,2.743441,-0.3275327,-0.928584,-0.2557448,-0.9083198,-0.3823285,0.765595,-0.4903892,-0.8176136,-0.1919854,-1.491893
MasVnrArea,1.316307,-0.5688897,0.2735379,-0.2212002,0.5518048,1.079345,1.228839,-0.448763,1.340598,1.292265,0.3774302,1.78053,-2.032974,0.726488,-1.419072,0.528868,0.2658972,-0.3718549
BsmtFinSF1,-0.03507633,0.4876609,-0.9690546,1.196439,-1.613165,0.4558743,0.8167544,-1.04396,-0.4064397,-0.508153,1.679869,-0.4415994,-1.170054,-0.7061566,-0.1553638,-0.6777042,0.1887403,0.1394938
BsmtFinSF2,-0.7909598,0.4005323,0.9522797,0.3771106,-1.47969,-1.179706,0.1788657,0.4330019,-0.5996365,-0.3772399,0.120711,0.8561996,-0.0009755574,-0.2755081,0.737428,0.09177491,1.207747,0.1567309
BsmtUnfSF,-0.5577611,-0.3378907,-2.119828,-1.034432,-1.800876,-0.1309659,-1.361766,-1.596906,0.2201631,-0.05479843,-1.368461,1.593903,-0.2011428,0.09238275,0.2980682,-0.1948107,-0.9560401,0.3222712
TotalBsmtSF,0.4615677,-0.3903079,-0.8610253,-0.3744263,-2.747326,0.02051396,-0.4644854,-0.3727211,-0.8403717,-0.632261,-0.8039117,-1.006732,0.6219668,-0.9282107,0.3548512,-0.6778218,-0.3878083,0.08638569
1stFlrSF,-0.2160257,0.1009997,-1.250887,-0.252844,8.731598,-0.4191935,-0.8309883,-1.357792,-0.4486358,-0.0360227,-0.3318885,-0.009108828,0.009441956,-0.7490913,0.1608126,0.5565281,0.7215581,0.4390062
LowQualFinSF,0.1792506,-0.1570751,-0.04885722,-0.06801616,-3.507778,0.2420857,0.4337867,0.3105137,0.1056805,-0.04165203,0.00943548,0.07306887,0.2319143,-0.1278615,-0.2657411,1.163677,0.4016466,-0.2461682
GrLivArea,0.08260919,-0.1069869,1.048925,-0.7150675,-0.3225081,0.4973535,0.6690477,0.7694439,0.2814921,0.1674336,-0.691261,-0.1097981,0.1189143,1.060783,0.169127,0.02175466,-0.7488589,0.1661554


# Observing Principal Component Loadings 
## PC1
* PC1 has large magnitudes in LotArea and MasVnrArea probabily explaining the size of the house

## PC2
* LotArea has the largest magnitude and BsmtFinSF1,BsmtFinSF2,MasVnrArea,TotalBsmtSF etc 
* This most likely tells us PC2 is explaining the basement size

## Since we may need to look at 15 principal component to retain 90% of the variability we would skip the interpretation for each PCs

# PCA Regression 
* We only want 15 PC's so we will select the specific columns

In [269]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

x = pcaloadings18.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]]
x = x.transpose()

In [270]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score