## PCA & IPCA test
### @author: Florent Pajot
### @version: 1.0
### @date: 09/11/2016

In [323]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.preprocessing import LabelEncoder
import os

def getFullSet():
    # Load data from the titanic Kaggle competition
    file_path_train = "C:\\Users\\fpajot\\Documents\\Code\\public-github\\data\\titanic_train.csv"
    file_path_test = "C:\\Users\\fpajot\\Documents\\Code\\public-github\\data\\titanic_train.csv"
    train_set = pd.read_csv(file_path_train, sep=",")
    test_set = pd.read_csv(file_path_train, sep=",")
    y_train = train_set.Survived
    train_set = train_set.assign(cat = lambda x:1)
    test_set = test_set.assign(cat = lambda x:0)
    full_set = pd.concat([train_set, test_set], axis=0)
    # Fill NAs or drop columns for the purpose of this test
    full_set.Embarked.fillna(value="S", inplace=True)
    age_mean_per_sex = full_set.groupby('Sex').Age.apply(np.mean)
    full_set.loc[full_set.Age.isnull() & (full_set.Sex == 'female'),'Age'] = age_mean_per_sex[0]
    full_set.loc[full_set.Age.isnull() & (full_set.Sex == 'male'),'Age'] = age_mean_per_sex[1]
    full_set.drop(['Cabin', 'Name', 'Ticket','PassengerId','Survived'], axis=1, inplace=True)
    # Encode labels
    encoder = LabelEncoder()
    full_set.Sex = encoder.fit_transform(full_set.Sex)
    full_set.Embarked = encoder.fit_transform(full_set.Embarked)
    return full_set

In [327]:
# WARNING: we shoud take care of normalizing and transforming skewed features before applying PCA
# however, for the purpose of this example we won't do it
# Normalizaing data
#from sklearn.preprocessing import normalize
full_set = getFullSet()
#full_set_norm = pd.DataFrame(normalize(full_set, axis=0), columns=full_set.columns.values)

In [335]:
# INFO: IPCA is an optimized version of PCA useful for large datasets (more memory efficient)

'''
@author: Florent Pajot
@version: 1.0
@params:
    - dataset (DataFrame): the input DataFrame to reduce using IPCA method
    - variance_threshold (float): the cumulated ratio of explained variance to keep when selecting principal components
    - verbose (int): 1 (default) for displaying principal components selection details, 0 otherwise (nothing displayed)
@result: returns a DataFrame of the resulting projection of the dataset in the new space
'''

# WARNING: it has to be carefully used to avoid information loss or distortion
# i.e. take care of normalizing features before applying PCA

def applyIPCA(dataset, variance_threshold, verbose=1):
    
    # parameters n_components set to default value min(n_features, n_samples)
    # and batch_size set to default value to 5 * n_features
    ipca = IncrementalPCA()
    X_ipca = ipca.fit_transform(dataset)
    
    if verbose == 1:
        print "--> Initial dimensions:", dataset.shape
        print "--> With columns:", dataset.columns.values
        print "\n-------------------------------------\n"
        print "--> Features projection in the new space:\n"
        print pd.DataFrame(ipca.components_, columns=["PC"+str(i) for i in range(ipca.components_.shape[1])], index=dataset.columns)
        print "\n--> Variance explained by each new component space:\n"
    
    cumulated_variance = [sum(ipca.explained_variance_ratio_[:i]) for i in range(ipca.components_.shape[1]+1)][1:ipca.components_.shape[1]+1]
    
    if verbose == 1:
        print pd.DataFrame([ipca.explained_variance_,ipca.explained_variance_ratio_,cumulated_variance], columns=["PC"+str(i) for i in range(ipca.components_.shape[1])], index=["explained_variace", "explained_variance_ratio", "cumulated_variance_explained"])

    # Feature selection using variance cutoff
    index_cutoff = ipca.components_.shape[1]
    stop = False
    while stop == False:
        if cumulated_variance[index_cutoff - 2] < variance_threshold:
            stop = True
        else:
            index_cutoff = index_cutoff - 1
    
    if verbose == 1:            
        print "\nNumber of components needed to reach the variance threshold:", index_cutoff
    
    ### 
    X_ipca_reduced = X_ipca[:,range(index_cutoff)]
    if verbose == 1:
        print "\n--> Final dimensions:", X_ipca_reduced.shape
    return pd.DataFrame(X_ipca_reduced, columns=dataset.columns.values[0:index_cutoff])

In [338]:
full_set_with_pca = applyIPCA(dataset=full_set, variance_threshold=0.999, verbose=1)
#full_set_with_pca = applyIPCA(dataset=full_set_norm, variance_threshold=0.98)

--> Initial dimensions: (1782, 8)
--> With columns: ['Pclass' 'Sex' 'Age' 'SibSp' 'Parch' 'Fare' 'Embarked' 'cat']

-------------------------------------

--> Features projection in the new space:

               PC0       PC1       PC2       PC3       PC4       PC5  \
Pclass   -0.009274 -0.001746  0.025031  0.003506  0.003487  0.999624   
Sex      -0.018047  0.004476  0.999196 -0.021709 -0.012779 -0.025059   
Age       0.125933 -0.044705  0.026961  0.888685  0.409523 -0.003577   
SibSp     0.069184  0.086390 -0.002613 -0.178919  0.007350  0.004961   
Parch     0.069028 -0.196558  0.004358 -0.414559  0.883073 -0.001689   
Fare      0.976795  0.153271  0.014445 -0.075180 -0.085180  0.009186   
Embarked  0.000360 -0.000292 -0.000007  0.000253 -0.000440  0.000006   
cat      -0.141593  0.963529 -0.004520 -0.015228  0.212102 -0.000489   

               PC6       PC7  
Pclass   -0.003580 -0.000002  
Sex      -0.000198  0.000015  
Age       0.154699 -0.000123  
SibSp     0.977575 -0.000081 

In [332]:
full_set_with_pca.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp
count,1782.0,1782.0,1782.0,1782.0
mean,-1.046675e-14,-1.789565e-15,-1.317689e-16,-3.918801e-17
std,49.69712,12.96205,1.127459,0.7632012
min,-32.47266,-31.78953,-2.614651,-1.769483
25%,-24.28631,-7.120523,-0.6082505,-0.4821712
50%,-17.81299,-0.1400369,-0.3620849,0.3906089
75%,-1.396369,5.212809,0.285913,0.4976388
max,480.1184,50.32011,7.343684,1.421686
