# PCA: "Principal Component Analysis"
## A technique to re-express complex data in terms of a few well-chosen vectors (principal components) that most efficiently capture the variation in that data. In general, PCA yields as many principal components as there are dimensions in the original data.



# Implementing PCA using matplotlib

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data


In [2]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.mlab import PCA


In [3]:
# Load the stock price table from disk and preview the first five rows.
prices = pd.read_csv('data/stocks.csv')
prices.head(n=5)

Unnamed: 0,Date,AAPL,ADBE,CVX,GOOG,IBM,MDLZ,MSFT,NFLX,ORCL,SBUX
0,3-Jan-17,121.104935,113.82,110.759811,823.830017,174.420883,44.43,64.737526,141.220001,40.23,55.649239
1,1-Dec-16,115.320023,102.949997,116.584061,792.450012,164.687836,44.330002,61.765545,123.800003,38.299999,55.270943
2,1-Nov-16,110.0429,102.809998,110.502274,775.880005,160.947403,41.066032,59.896873,117.0,40.033211,57.709953
3,3-Oct-16,112.472404,107.510002,102.728424,809.900024,151.113403,44.75042,59.159267,124.870003,38.270115,52.588333
4,1-Sep-16,111.987015,108.540001,100.933739,804.059998,156.186905,43.714809,56.868721,98.550003,38.975769,53.648621


In [5]:
# Returns are computed row over row, so the rows must end up in chronological
# order. Parse the 'Date' strings (e.g. '3-Jan-17') into real datetimes first
# so that a later sort orders them by time rather than alphabetically.
# `infer_datetime_format=True` is intentionally not passed: it is deprecated
# since pandas 2.0 (format inference is the default there) and only affected
# parsing speed on older versions.
prices['Date'] = pd.to_datetime(prices['Date'])
prices.head()

Unnamed: 0,Date,AAPL,ADBE,CVX,GOOG,IBM,MDLZ,MSFT,NFLX,ORCL,SBUX
0,2017-01-03,121.104935,113.82,110.759811,823.830017,174.420883,44.43,64.737526,141.220001,40.23,55.649239
1,2016-12-01,115.320023,102.949997,116.584061,792.450012,164.687836,44.330002,61.765545,123.800003,38.299999,55.270943
2,2016-11-01,110.0429,102.809998,110.502274,775.880005,160.947403,41.066032,59.896873,117.0,40.033211,57.709953
3,2016-10-03,112.472404,107.510002,102.728424,809.900024,151.113403,44.75042,59.159267,124.870003,38.270115,52.588333
4,2016-09-01,111.987015,108.540001,100.933739,804.059998,156.186905,43.714809,56.868721,98.550003,38.975769,53.648621


In [6]:
# Put the rows in chronological order (oldest first) so that each row's
# predecessor is the previous observation.
prices = prices.sort_values(by='Date', ascending=True)

In [7]:
# Reduce the dimensionality up front: keep only three tickers.
# Note that this also drops the 'Date' column; the row order set by the
# earlier sort is what carries the time information from here on.
prices = prices.loc[:, ['AAPL', 'GOOG', 'NFLX']]
prices.head()

Unnamed: 0,AAPL,GOOG,NFLX
120,11.107141,251.001007,3.258571
119,10.962033,224.949951,3.218571
118,12.037377,229.309311,3.312857
117,12.930043,235.925919,3.167143
116,15.701322,249.204208,3.128572


In [9]:
# Convert prices to returns: the fractional change of each row relative to
# the previous row. Only float64/int64 columns are used; select_dtypes picks
# them directly instead of rebuilding dict(prices.dtypes) for every column.
# The first row of the result is NaN because it has no predecessor.
returns = prices.select_dtypes(include=['float64', 'int64']).pct_change()

In [10]:
# Discard the first row, which pct_change leaves as NaN (no previous price).
# Caution: this cell reassigns `returns` from itself, so re-running it
# without re-running the previous cell drops one more row each time.
returns = returns.iloc[1:]
returns.head()

Unnamed: 0,AAPL,GOOG,NFLX
119,-0.013064,-0.103789,-0.012275
118,0.098097,0.019379,0.029294
117,0.074158,0.028855,-0.043984
116,0.214329,0.056282,-0.012178
115,0.007014,0.049788,-0.114612


In [11]:
# Take the first 20 rows of returns as a plain NumPy array.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; `.values` works on old and new
# versions alike and yields the same ndarray.
returns_arr = returns.values[:20]
returns_arr.shape

(20, 3)

In [12]:
# PCA is sensitive to scale, so standardize every column (zero mean, unit
# variance) before decomposing.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Learn the per-column mean/std, then apply them to the same data.
scaler.fit(returns_arr)
returns_arr_scaled = scaler.transform(returns_arr)

print(returns_arr_scaled)

[[-0.26570419 -0.83085735 -0.27988663]
 [ 0.48183445  0.19850401  0.05469668]
 [ 0.32084764  0.27769246 -0.53510448]
 [ 1.26346797  0.50691119 -0.27910722]
 [-0.13068243  0.45264219 -1.10356751]
 [ 0.35775408 -0.16651438 -1.07769436]
 [ 0.16512878  0.12257604 -0.04561865]
 [ 0.55008778  0.88031173  1.30278588]
 [ 1.4206431   2.09513584  2.03764936]
 [-0.45151408 -0.1289488  -1.20580256]
 [ 0.40746153  0.01821392  1.04538738]
 [-2.30718683 -1.50058206 -0.62555009]
 [-0.69154794 -1.34257977  1.87670064]
 [ 0.8161854  -0.50816289  0.60136016]
 [ 1.24912054  2.57561617 -0.80129032]
 [ 0.39430968  0.20404444 -0.58880849]
 [-0.93708285 -0.8106076  -1.31840651]
 [-0.51882766 -0.79963904  1.30701792]
 [ 0.26976548 -0.14797953 -0.19411235]
 [-2.39406044 -1.0957766  -0.17064887]]


In [13]:
# Feed the already-standardized returns into matplotlib's PCA.
# standardize=False because the data was scaled in the previous step.
# NOTE(review): matplotlib.mlab.PCA was deprecated in Matplotlib 2.2 and
# removed in 3.1, so this cell needs an older Matplotlib - confirm the
# installed version before running.
results=PCA(returns_arr_scaled,standardize=False)


In [14]:
# fracs: the proportion of the total variance explained by each principal
# component (the printed values sum to 1).
print(results.fracs)

[ 0.60781955  0.32166829  0.07051216]


In [15]:
# Wt: the weight matrix used to reconstruct the original (scaled) data from
# the projected data.
print(results.Wt)
# Each row of the matrix printed above is one eigenvector (principal axis).


[[ 0.69753442  0.67658261  0.23596969]
 [-0.07444624 -0.25910276  0.96297638]
 [-0.71267348  0.68927623  0.13036406]]


In [16]:
# Sanity check: projected data (Y) times the weight matrix (Wt) gives back
# the scaled input that was passed to PCA.
print(results.Y.dot(results.Wt))

[[-0.26570419 -0.83085735 -0.27988663]
 [ 0.48183445  0.19850401  0.05469668]
 [ 0.32084764  0.27769246 -0.53510448]
 [ 1.26346797  0.50691119 -0.27910722]
 [-0.13068243  0.45264219 -1.10356751]
 [ 0.35775408 -0.16651438 -1.07769436]
 [ 0.16512878  0.12257604 -0.04561865]
 [ 0.55008778  0.88031173  1.30278588]
 [ 1.4206431   2.09513584  2.03764936]
 [-0.45151408 -0.1289488  -1.20580256]
 [ 0.40746153  0.01821392  1.04538738]
 [-2.30718683 -1.50058206 -0.62555009]
 [-0.69154794 -1.34257977  1.87670064]
 [ 0.8161854  -0.50816289  0.60136016]
 [ 1.24912054  2.57561617 -0.80129032]
 [ 0.39430968  0.20404444 -0.58880849]
 [-0.93708285 -0.8106076  -1.31840651]
 [-0.51882766 -0.79963904  1.30701792]
 [ 0.26976548 -0.14797953 -0.19411235]
 [-2.39406044 -1.0957766  -0.17064887]]


# Now build a PCA ourselves and compare it with the built-in PCA above

In [9]:
# Network dimensions: 3 input features are squeezed through a 2-unit coding
# layer and expanded back to the input width.
n_inputs, n_hidden = 3, 2
n_outputs = n_inputs  # the output has the same width as the input

# Step size intended for the optimizer.
learning_rate = 1e-2

In [10]:
# Start from an empty default graph so that re-running the cells below does
# not pile duplicate ops onto a previous graph (TensorFlow 1.x API).
tf.reset_default_graph()

In [11]:
# Input placeholder: float32 rows of n_inputs features; the leading None
# leaves the number of rows (batch size) unspecified.
X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs))

In [13]:
# Encoder: project the inputs down to the n_hidden-unit coding layer.
# No activation is passed, so the layer stays linear.
hidden = tf.layers.dense(inputs=X, units=n_hidden)
# Decoder: map the codings back to the original input width, also linear.
output = tf.layers.dense(inputs=hidden, units=n_outputs)