In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler # Normalizing data set
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from sklearn.decomposition import PCA
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

<h2>Correlated Data with PCA</h2>
<h4>Demonstrate how PCA works with dataset containing related features</h4>
<br>
Input Features: 10 columns derived from a single random variable x, so they are strongly related to each other: x, x^2, x^3, ..., x^10<br>
Target Feature: y is the sum of all (standardized) input features for that sample<br>
Objective: <br><blockquote>1. Demonstrate how PCA is able to reduce the input dimensions<br>2. Train two Linear Regression models, one with the original features and another with the PCA components.<br>Compare the outcomes of the two models</blockquote>

In [None]:
# Base variable x: 1000 random samples in [0, 1). The 10 feature columns are
# derived from it in a later cell, giving a 1000 rows x 10 columns dataset.
SEED = 5
N_SAMPLES = 1000

# Fixed seed so that every run produces the same samples
np.random.seed(SEED)
random_data = np.random.rand(N_SAMPLES)

In [None]:
# Sanity check: expect a 1-D array holding 1000 values
random_data.shape

In [None]:
# Peek at the first five samples
random_data[:5]

In [None]:
# Build 10 polynomial features from the base variable x = random_data.
# Column k (0-based) holds x ** (k + 1), i.e. x, x^2, x^3, ..., x^10.
col_data = {power - 1: random_data ** power for power in range(1, 11)}

In [None]:
# One DataFrame column per power of x; column labels are the integers 0..9
df = pd.DataFrame(col_data)

In [None]:
# Expect 1000 rows x 10 columns
df.shape

In [None]:
# Preview the raw (not yet standardized) power features
df.head()

In [None]:
# Pairwise correlation between the columns; since every column is a power of
# the same x, the correlations are expected to be high
df.corr()

In [None]:
# Standardize the dataset before running PCA so that every column
# contributes on the same scale:
# x = (x-mean)/stddev
scaler = StandardScaler()

In [None]:
def transform_data(scaler, df, columns):
    """Replace the given columns of ``df`` with their scaled values, in place.

    Parameters
    ----------
    scaler : object
        An already fitted transformer exposing ``transform(X)`` (for example
        the ``StandardScaler`` used in this notebook).
    df : pandas.DataFrame
        Frame to update. It is modified in place.
    columns : iterable
        Labels of the columns to scale.

    Returns
    -------
    None
    """
    # Snapshot the labels so a live ``df.columns`` Index can be passed safely.
    columns = list(columns)
    transformed_data = scaler.transform(df[columns])

    # Build the intermediate frame on df's own index. Column assignment aligns
    # on index labels, so a default RangeIndex here would fill df with NaN
    # whenever df is not indexed 0..n-1 (e.g. after filtering or sampling).
    df_transformed = pd.DataFrame(transformed_data, columns=columns, index=df.index)

    for col in columns:
        df[col] = df_transformed[col]

In [None]:
# Fit the scaler on all feature columns of df
scaler.fit(df)

In [None]:
# Overwrite every column of df with its scaled values (df is modified in place)
transform_data(scaler, df, df.columns)

In [None]:
# Preview the features after scaling
df.head()

In [None]:
# After standardization each column should show a mean close to 0 and a
# standard deviation close to 1
df.describe()

In [None]:
# Target: row-wise sum of the feature columns. df was scaled in place by
# transform_data, so this is the sum of the scaled values, not of the raw powers.
y = df.sum(axis=1)

In [None]:
# Preview the first target values
y.head()

In [None]:
# Work on a copy so df keeps the scaled features while df_pca is converted
# to PCA components
df_pca = df.copy()

In [None]:
# Test PCA
# PCA's n_components can be given in two ways:
#   - a float between 0 and 1: keep as many components as needed to capture
#     that fraction of the variance
#   - an integer: keep exactly that many components

pca = PCA(n_components=0.9) # percentage of variance to capture
#pca = PCA(n_components=2) # number of components

In [None]:
# Fit PCA on the copy of the scaled features
pca.fit(df_pca)

# number of components PCA came up with
pca.n_components_

In [None]:
def transform_with_pca(pca, df, columns):
    """Replace the given columns of ``df`` with PCA components, in place.

    The selected columns are projected with ``pca.transform``. The resulting
    components are added to ``df`` as ``component_0``, ``component_1``, ...
    and the source columns are then dropped.

    Parameters
    ----------
    pca : object
        An already fitted transformer exposing ``transform(X)`` and
        ``n_components_`` (for example ``sklearn.decomposition.PCA``).
    df : pandas.DataFrame
        Frame to update. It is modified in place.
    columns : iterable
        Labels of the columns to project and then remove.

    Returns
    -------
    list of str
        Names of the component columns that were added.
    """
    # Snapshot the labels up front: callers pass ``df.columns`` and df gains
    # new columns below, so work from a plain list of the original labels.
    columns = list(columns)
    transformed_data = pca.transform(df[columns])

    tcols = ['component_' + str(i) for i in range(pca.n_components_)]

    print ('components:',tcols)
    # Build the intermediate frame on df's own index. Column assignment aligns
    # on index labels, so a default RangeIndex here would produce NaN
    # components whenever df is not indexed 0..n-1.
    df_transformed = pd.DataFrame(transformed_data, columns=tcols, index=df.index)

    for col in tcols:
        df[col] = df_transformed[col]

    # Remove the source features so only the components remain
    df.drop(columns, inplace=True, axis=1)

    return tcols

In [None]:
# Replace the feature columns of df_pca with the PCA components (in place);
# the returned list of component names is shown as the cell output
transform_with_pca(pca,df_pca,df_pca.columns)

In [None]:
# df_pca now holds only the component_* columns
df_pca.head()

In [None]:
# Compare X and y
# Compare First Component and y

In [None]:
# Scatter of the first feature column (column 0, the scaled x) against the target
plt.scatter(x=df[0],y=y,label='ideal fit')
plt.grid(True)
plt.xlabel('X')
plt.ylabel('Target')
plt.title('X vs y')
plt.legend()

In [None]:
# Scatter of the first PCA component against the same target
plt.scatter(x=df_pca['component_0'],y=y,label='component 0')
plt.grid(True)
plt.xlabel('First Component')
plt.ylabel('Target')
plt.title('First Component vs y')
plt.legend()

In [None]:
# Train with actual data using Linear Regression Model
# Baseline: the model is fitted on all feature columns of df (scaled earlier)
import sklearn.linear_model as lm

regressor = lm.LinearRegression()
regressor.fit(df,y)

In [None]:
# In-sample predictions: the model is evaluated on the same rows it was fitted on
y_predicted = regressor.predict(df)

In [None]:
# Overlay actual and predicted target values against the first feature column
plt.scatter(x=df[0],y=y,label='Actual')
plt.scatter(x=df[0],y=y_predicted,label='Predicted')
plt.grid(True)
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.legend()

In [None]:
import sklearn.metrics as metrics
# RMSE = square root of the mean squared error between actual and predicted target
print("RMSE: {0}".format(metrics.mean_squared_error(y,y_predicted)**.5))

In [None]:
# Compare the distributions of actual and predicted target values side by side
# NOTE(review): Matplotlib 3.9+ renames `labels` to `tick_labels` - confirm
# against the installed version before changing the keyword.
plt.boxplot([y,y_predicted], labels=['actual','predicted'])
plt.title('Box Plot - Actual, Predicted')
plt.ylabel('Target')
plt.grid(True)

In [None]:
# Train with PCA Components using Linear Regression Model
# This cell only plots the first component against the target; the model
# itself is fitted in a later cell.
plt.scatter(x=df_pca['component_0'],y=y,label='component 0')
plt.grid(True)
plt.xlabel('Input Feature')
plt.ylabel('Target')
plt.legend()

In [None]:
regressor.fit(df_pca,y)

In [None]:
y_predicted_pca = regressor.predict(df_pca)

In [None]:
# Overlay actual target values and the PCA-based predictions against the
# first component
plt.scatter(x=df_pca['component_0'],y=y,label='Actual')
plt.scatter(x=df_pca['component_0'],y=y_predicted_pca,label='Predicted')
plt.grid(True)
plt.xlabel('Component')
plt.ylabel('Target')
plt.legend()

In [None]:
# Compare the two models directly: both series are predictions plotted against
# the first feature column. The first series is the output of the model
# trained on the original features, so its legend entry says "Predicted"
# rather than "y" (the actual target is not drawn here).
plt.scatter(x=df[0],y=y_predicted,label='X vs Predicted')
plt.scatter(x=df[0],y=y_predicted_pca,label='X vs PCA Predicted')
plt.grid(True)
plt.xlabel('X')
plt.ylabel('Target')
plt.legend()

<h2>Summary</h2>
<br>
1. PCA works great when the columns are related to each other
2. Substantial reduction in dimension possible - real world datasets often have correlated columns
3. PCA loses the original features - each component is a combination of the inputs, so you cannot directly map a component back to a single real-world feature
4. PCA works on numeric data, and the data needs to be standardized before PCA is applied