In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 

# Relative paths to the two CSV datasets analysed in this notebook.
votes_file = '../assets/datasets/votes.csv'  # input for Case #1
airport_file = '../assets/datasets/Airport_operations.csv'  # input for Case #2

# Case #1: Congressional Voting Data

After you've downloaded the data from the repository, go ahead and load it with pandas.

In [2]:
# Load the voting records; index_col=0 uses the CSV's first column as the row index.
votes = pd.read_csv(votes_file, index_col=0)
# Preview the first five rows.
votes.head()

Unnamed: 0,Class,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
1,republican,n,y,n,y,y,y,n,n,n,y,,y,y,y,n,y
2,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,
3,democrat,,y,y,,y,y,n,n,n,n,y,n,y,y,n,n
4,democrat,n,y,y,n,,y,n,n,n,n,y,n,y,n,n,y
5,democrat,y,y,y,n,y,y,n,n,n,n,y,,y,y,y,y


Next, let's define the x and y variables: 

In [3]:
# Feature matrix: every column of the votes table except the 'Class' label.
X = votes.drop('Class', axis=1)

In [4]:
# Keep only the rows that have no missing value in any column.
# Rebinding (instead of inplace=True) leaves X pointing at the filtered frame either way.
X = X.dropna()

In [5]:
def _vote_to_binary(vote):
    """Encode a single cell: 1 when the value is 'y', 0 for anything else."""
    if vote == 'y':
        return 1
    return 0


# Apply the encoding element-wise to every cell of the feature table.
# NOTE: re-running this cell on the already-encoded X yields all zeros.
X = X.applymap(_vote_to_binary)
X.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
6,0,1,1,0,1,1,0,0,0,0,0,0,1,1,1,1
9,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,1
20,1,1,1,0,0,0,1,1,1,0,1,0,0,0,1,1
24,1,1,1,0,0,0,1,1,1,0,0,0,0,0,1,1
26,1,0,1,0,0,0,1,1,1,1,0,0,0,0,1,1


In [6]:
# Target labels, restricted to the rows still present in X.
# X lost its incomplete rows via dropna(), while `votes` still holds all of them;
# selecting by X.index keeps X and y aligned row-for-row.
y = votes.loc[X.index, 'Class']

Next, create the covariance matrix from the standardized x-values and decompose it to find the eigenvalues and eigenvectors.

In [7]:
# Standardize the features: learn the scaling from X, then apply it to the same X.
X_standard = StandardScaler().fit(X).transform(X)

In [8]:
# np.cov treats each ROW as a variable, so transpose to put one feature per row.
cov1_mat = np.cov(np.transpose(X_standard))
# Eigen-decomposition: column i of eigenVectors1 belongs to eigenValues1[i].
# np.linalg.eig does not return the eigenvalues in sorted order.
# NOTE(review): a covariance matrix is symmetric, so np.linalg.eigh may be the
# better fit here — confirm before switching, since it changes the output order.
eigenValues1, eigenVectors1 = np.linalg.eig(cov1_mat)

Now, let's check the eigenvalues: 

In [9]:
# Show the eigenvalues in the order np.linalg.eig returned them (not sorted).
eigenValues1

array([ 7.72441821,  1.38414615,  1.0518882 ,  0.9697194 ,  0.80765992,
        0.72917396,  0.10499427,  0.17814579,  0.20187395,  0.27597898,
        0.32292795,  0.35727403,  0.37876783,  0.49242232,  0.54322922,
        0.54664388])

And the eigenvectors: 

In [10]:
# Use the print() call form: it runs on both Python 2 and Python 3, whereas the
# statement form `print x` is a SyntaxError on Python 3.
print(eigenVectors1)

[[  1.84608588e-01   1.79544628e-01   3.96877507e-02  -5.18236597e-01
   -4.58868451e-01   4.65381884e-01   1.79916607e-02  -8.13810996e-02
    5.44744281e-03   1.59364099e-01  -1.69450750e-01  -8.32207705e-02
    6.61375184e-02  -2.92779341e-01   2.76730743e-01   4.13800644e-02]
 [ -4.74282209e-02   6.48573848e-01  -1.18761132e-01  -3.73350711e-01
    9.30132071e-02  -4.61402638e-01  -1.75377517e-02  -6.80041937e-02
   -1.26056140e-01   4.14475796e-02   5.17595157e-04   1.22653581e-01
   -2.05367759e-01   6.86284905e-02   2.86201962e-02  -3.41722769e-01]
 [  2.86321599e-01   4.73091554e-02  -1.88931793e-01  -8.32651252e-02
    1.59714378e-01  -5.04457911e-02   1.35255869e-01  -1.33880242e-01
   -9.43254701e-02   2.12382774e-01   7.55148372e-01  -9.54559997e-02
   -6.08936441e-02  -1.21667214e-01   9.24259191e-02   3.85310015e-01]
 [ -3.04936211e-01  -1.42532176e-01   4.73997626e-02  -1.28224416e-01
   -1.50912081e-02   2.83604055e-02   3.15904889e-01   5.49808623e-02
    6.28745988e-0

To find the principal components, build the eigenpairs (eigenvalue, eigenvector) and sort them from highest to lowest eigenvalue.

In [11]:
# Pair each eigenvalue's magnitude with its eigenvector (column i of eigenVectors1).
eigenPairs1 = [(np.abs(eigenValues1[i]), eigenVectors1[:, i]) for i in range(len(eigenValues1))]
# Sort on the eigenvalue only, largest first. A plain tuple sort falls through to
# comparing the NumPy eigenvectors whenever two eigenvalues are equal, and that
# comparison raises ValueError (ambiguous truth value of an array).
eigenPairs1.sort(key=lambda pair: pair[0], reverse=True)
for pair in eigenPairs1:
    print(pair[0])

7.72441821141
1.38414615236
1.05188819734
0.969719399932
0.807659919608
0.729173960116
0.546643881232
0.543229215866
0.492422322898
0.378767831509
0.357274030595
0.322927948142
0.275978979404
0.201873953601
0.178145794697
0.104994270551


Now, calculate the explained variance. Recall the methods we learned in lesson 2.2!

In [12]:
# Share of the total variance carried by each eigenvalue, as a percentage,
# listed from the largest eigenvalue to the smallest.
total1 = sum(eigenValues1)
var_exp1 = []
for eigenvalue in sorted(eigenValues1, reverse=True):
    var_exp1.append((eigenvalue / total1) * 100)
# Running total of those percentages.
cum_var_exp1 = np.cumsum(var_exp1)

In [13]:
# Show the per-component explained variance (percent), largest first.
var_exp1

[48.069520658289782,
 8.6136250322063699,
 6.545963728073076,
 6.0346223433274657,
 5.0261164178178506,
 4.53769355568084,
 3.4017978600385206,
 3.3805481914049786,
 3.064373830537118,
 2.3570950721602193,
 2.2233378520306037,
 2.0096001083200732,
 1.7174338427323699,
 1.2562737953055541,
 1.1086120305738458,
 0.65338568150133569]

Now, display the cumulative explained variance (the running total of the explained variance calculated above):

In [14]:
# Use the print() call form: it runs on both Python 2 and Python 3, whereas the
# statement form `print x` is a SyntaxError on Python 3.
print(cum_var_exp1)

[  48.06952066   56.68314569   63.22910942   69.26373176   74.28984818
   78.82754174   82.2293396    85.60988779   88.67426162   91.03135669
   93.25469454   95.26429465   96.98172849   98.23800229   99.34661432
  100.        ]


Now, conduct a PCA using scikit-learn:

http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

In [15]:
# Fit a 5-component PCA on the standardized votes; fit() returns the estimator
# itself, so construction and fitting can be chained.
pca1 = PCA(n_components=5).fit(X_standard)
# One row per fitted component, one column per input feature.
pca1.components_

array([[ 0.18460859, -0.04742822,  0.2863216 , -0.30493621, -0.32991001,
        -0.25972821,  0.27218234,  0.31864456,  0.30810139,  0.02190867,
         0.08078537, -0.30765177, -0.27068272, -0.27841627,  0.22803121,
         0.19344721],
       [ 0.17954463,  0.64857385,  0.04730916, -0.14253218,  0.01713514,
         0.11230851, -0.16949899, -0.03227914, -0.09331359, -0.41964081,
         0.49298729, -0.08521189,  0.1059062 , -0.15069798,  0.04605754,
        -0.10359913],
       [ 0.03968775, -0.11876113, -0.18893179,  0.04739976, -0.06747829,
        -0.29796101, -0.00475402, -0.03066313, -0.02291682, -0.71348674,
        -0.43404088,  0.06914848, -0.18091094, -0.14099933,  0.07168589,
        -0.30260842],
       [-0.5182366 , -0.37335071, -0.08326513, -0.12822442, -0.08360756,
         0.14797105, -0.11289754,  0.03011002,  0.042839  , -0.08024276,
         0.44673485,  0.04517012, -0.08438817,  0.02116993,  0.45898962,
        -0.3130494 ],
       [ 0.45886845, -0.09301321, -0

# Case #2: Airport Delays

In [2]:
# Load the airport operations table (no index_col, so pandas assigns the row index).
airport = pd.read_csv(airport_file)
# Preview the first five rows.
airport.head()

Unnamed: 0,airport,year,departures for metric computation,arrivals for metric computation,percent on-time gate departures,percent on-time airport departures,percent on-time gate arrivals,average_gate_departure_delay,average_taxi_out_time,average taxi out delay,average airport departure delay,average airborne delay,average taxi in delay,average block delay,average gate arrival delay
0,ABQ,2004,53971,53818,0.803,0.7809,0.7921,10.38,9.89,2.43,12.1,2.46,0.83,2.55,10.87
1,ABQ,2005,51829,51877,0.814,0.7922,0.8001,9.6,9.79,2.29,11.2,2.26,0.89,2.34,10.24
2,ABQ,2006,49682,51199,0.7983,0.7756,0.7746,10.84,9.89,2.16,12.33,2.12,0.84,2.66,11.82
3,ABQ,2007,53255,53611,0.8005,0.7704,0.7647,11.29,10.34,2.4,12.95,2.19,1.29,3.06,12.71
4,ABQ,2008,49589,49512,0.8103,0.7844,0.7875,10.79,10.41,2.41,12.32,1.82,1.03,2.79,11.48


First, let's define the x and y variables. Airport is going to be our target variable.

In [3]:
# Features: every column except the 'airport' identifier.
# NOTE: this rebinds X, which Case #1 used for the voting features.
X = airport.drop('airport', axis=1)
X.head()

Unnamed: 0,year,departures for metric computation,arrivals for metric computation,percent on-time gate departures,percent on-time airport departures,percent on-time gate arrivals,average_gate_departure_delay,average_taxi_out_time,average taxi out delay,average airport departure delay,average airborne delay,average taxi in delay,average block delay,average gate arrival delay
0,2004,53971,53818,0.803,0.7809,0.7921,10.38,9.89,2.43,12.1,2.46,0.83,2.55,10.87
1,2005,51829,51877,0.814,0.7922,0.8001,9.6,9.79,2.29,11.2,2.26,0.89,2.34,10.24
2,2006,49682,51199,0.7983,0.7756,0.7746,10.84,9.89,2.16,12.33,2.12,0.84,2.66,11.82
3,2007,53255,53611,0.8005,0.7704,0.7647,11.29,10.34,2.4,12.95,2.19,1.29,3.06,12.71
4,2008,49589,49512,0.8103,0.7844,0.7875,10.79,10.41,2.41,12.32,1.82,1.03,2.79,11.48


In [4]:
# Target: the 'airport' column (this rebinds y, which Case #1 used for the party labels).
y = airport['airport']
# Preview the first five labels.
y.head()

0    ABQ
1    ABQ
2    ABQ
3    ABQ
4    ABQ
Name: airport, dtype: object

Then, standardize the x variable for analysis

In [5]:
# Standardize the features: learn the scaling from X, then apply it to the same X.
x_standard = StandardScaler().fit(X).transform(X)

Next, create the covariance matrix from the standardized x-values and decompose it to find the eigenvalues and eigenvectors.

In [6]:
# np.cov treats each ROW as a variable, so transpose to put one feature per row.
cov_mat = np.cov(np.transpose(x_standard))
# Eigen-decomposition: column i of eigenVectors belongs to eigenValues[i].
# np.linalg.eig does not return the eigenvalues in sorted order.
eigenValues, eigenVectors = np.linalg.eig(cov_mat)

Then, check your eigenvalues and eigenvectors:

In [7]:
# Show the eigenvalues in the order np.linalg.eig returned them (not sorted).
eigenValues

array([  6.45349601e+00,   4.30952372e+00,   1.21824879e+00,
         7.37875394e-01,   5.10954558e-01,   3.49353222e-01,
         1.66799275e-01,   1.20220515e-01,   7.32036815e-02,
         6.41924376e-02,   9.59074587e-03,   2.34264664e-03,
         1.64364385e-04,   7.01306545e-04])

In [8]:
# Show the eigenvector matrix; column i pairs with eigenValues[i].
eigenVectors

array([[ -5.74761448e-02,   5.25255722e-02,   8.08909757e-01,
         -3.89143377e-01,  -1.51724877e-02,  -4.04558235e-01,
         -3.91819608e-02,  -1.31976220e-02,   1.43760537e-01,
         -7.77297605e-03,  -3.10742763e-02,  -2.93369275e-02,
          2.29607095e-03,   1.99299912e-03],
       [  1.16832935e-01,  -4.11909290e-01,   1.69808554e-01,
          4.02576136e-01,  -1.22740151e-01,  -1.12320737e-01,
          2.90266529e-01,   6.08203611e-02,   7.66594323e-02,
          3.32546414e-02,  -2.17851714e-02,   2.05754595e-02,
          7.07995242e-01,   3.17127569e-02],
       [  1.16407898e-01,  -4.11581912e-01,   1.71084289e-01,
          4.03871066e-01,  -1.25409356e-01,  -1.09894006e-01,
          2.91346645e-01,   6.55288643e-02,   9.06577905e-02,
          3.22166405e-02,  -3.61597627e-02,  -2.65374530e-02,
         -7.03727342e-01,  -3.44394613e-02],
       [ -3.14555883e-01,  -2.53530409e-01,  -1.18877111e-01,
         -2.24560820e-01,   1.23690522e-01,  -6.82727838e-0

To find the principal components, find the eigenpairs, and sort them from highest to lowest. 

In [11]:
# Pair each eigenvalue's magnitude with its eigenvector (column i of eigenVectors).
eigenpairs = [(np.abs(eigenValues[i]), eigenVectors[:, i]) for i in range(len(eigenValues))]
# Sort on the eigenvalue only, largest first. A plain tuple sort falls through to
# comparing the NumPy eigenvectors whenever two eigenvalues are equal, and that
# comparison raises ValueError (ambiguous truth value of an array).
eigenpairs.sort(key=lambda pair: pair[0], reverse=True)
for pair in eigenpairs:
    print(pair[0])

6.45349600791
4.30952372449
1.21824878845
0.737875393589
0.510954557711
0.349353221606
0.166799275189
0.120220515247
0.0732036814776
0.0641924375551
0.0095907458676
0.00234264664452
0.000701306545385
0.00016436438493


Next, calculate the explained variance and the cumulative explained variance:

In [10]:
# Share of the total variance carried by each eigenvalue, as a percentage,
# listed from the largest eigenvalue to the smallest.
total = sum(eigenValues)
var_exp = []
for eigenvalue in sorted(eigenValues, reverse=True):
    var_exp.append((eigenvalue / total) * 100)
# Running total of those percentages; displayed as the cell's output.
cum_var_exp = np.cumsum(var_exp)
cum_var_exp

array([  46.04158864,   76.78729892,   85.47872904,   90.74300058,
         94.38833631,   96.8807493 ,   98.07075602,   98.92845292,
         99.45071462,   99.90868682,   99.9771107 ,   99.99382399,
         99.99882736,  100.        ])

Finally, conduct the PCA - use the results above to guide your selection of n components

In [12]:
# Five components: the cumulative explained variance computed earlier reaches
# roughly 94% at the fifth component.
pca = PCA(n_components=5)
pca.fit(x_standard)
# Use the print() call form: it runs on both Python 2 and Python 3, whereas the
# statement form `print x` is a SyntaxError on Python 3.
print(pca.components_)

[[-0.05747614  0.11683294  0.1164079  -0.31455588 -0.37512831 -0.31944346
   0.3394426   0.20370273  0.17921314  0.38208105  0.26636432  0.11754539
   0.292669    0.3500623 ]
 [ 0.05252557 -0.41190929 -0.41158191 -0.25353041 -0.07870829 -0.25961871
   0.21205574 -0.35157249 -0.36563423  0.04418206 -0.1225854  -0.4016139
  -0.0920439   0.17689915]
 [ 0.80890976  0.16980855  0.17108429 -0.11887711 -0.10089244  0.01437277
   0.14698731  0.01467519 -0.07924788  0.11203106 -0.33705618  0.16814201
  -0.2829519   0.02122267]
 [ 0.38914338 -0.40257614 -0.40387107  0.22456082  0.0109185   0.0559417
  -0.12087295  0.43649882  0.42686047  0.07037202  0.19371794 -0.18814637
   0.05923421  0.0312063 ]
 [-0.01517249 -0.12274015 -0.12540936  0.12369052  0.13462459 -0.1428822
  -0.0589235   0.03961322  0.01006394 -0.06312055 -0.64493923  0.29971749
   0.60402007  0.18813453]]


**Bonus**: build and contrast classification models to predict the target classes for each problem, using both the original (or n-best) features and the principal components. If you arrive at <= 3-dimensional feature space, try plotting your data!