# Análisis de Componentes Principales - Paso a Paso

 * Estandarizar los datos (para cada una de las **m** variables).
 * Obtener los vectores y valores propios a partir de la matriz de covarianzas o de correlaciones, o incluso mediante la técnica de *singular value decomposition*.
 * Ordenar los valores propios en orden descendente y quedarnos con los **p** vectores propios que se correspondan con los **p** mayores, y así disminuir el número de variables del dataset **(p<m)**.
 * Construir la matriz de proyección **W** a partir de los **p** vectores propios.
 * Transformar el dataset original **X** a través de **W** para así obtener los datos en el subespacio de dimensión **p**, que serán **Y**.

In [50]:
import pandas as pd

In [51]:
# Load the Iris dataset from a CSV file referenced by a relative path.
df = pd.read_csv("../datasets/iris/iris.csv")
# Preview the first rows to check how the file was parsed.
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [52]:
# Split the dataset: the first four columns are the numeric features and the
# fifth column holds the species label.
x = df.iloc[:,0:4].values
y = df.iloc[:,4].values

In [53]:
# Inspect the feature vector of the first sample.
x[0]

array([5.1, 3.5, 1.4, 0.2])

In [54]:
# Inspect the species label of the first sample.
y[0]

'setosa'

In [55]:
import chart_studio.plotly as py
from plotly.graph_objects import *
import chart_studio

import warnings
warnings.filterwarnings('ignore')

In [56]:
import os

# Read the Chart Studio credentials from the environment instead of embedding
# the API key in the notebook. When the variables are not set nothing is
# written, and chart_studio keeps using its existing credentials file.
# NOTE: a key that was previously committed in this cell should be rotated.
_plotly_user = os.environ.get("PLOTLY_USERNAME")
_plotly_key = os.environ.get("PLOTLY_API_KEY")
if _plotly_user and _plotly_key:
    chart_studio.tools.set_credentials_file(username=_plotly_user,
                                            api_key=_plotly_key)

# One colour per species, shared by the four histograms.
colors = {'setosa': 'rgb(255,127,20)',
          'versicolor': 'rgb(31, 220, 120)',
          'virginica': 'rgb(44, 50, 180)'}

# One overlaid histogram per (feature, species) pair. Each feature is drawn on
# its own x axis ("x1".."x4"); legend entries are emitted only for the first
# feature so that every species appears exactly once in the legend.
traces = []
for col in range(4):
    for key in colors:
        traces.append(Histogram(x=x[y == key, col], opacity=0.7,
                                xaxis="x%s" % (col + 1),
                                marker=dict(color=colors[key]),
                                name=key, showlegend=(col == 0)))

# Plain lists/dicts replace the deprecated Data/Marker/XAxis/YAxis wrappers.
data = list(traces)
layout = Layout(barmode="overlay",
                xaxis=dict(domain=[0, 0.25], title="Long. Sépalos (cm)"),
                xaxis2=dict(domain=[0.3, 0.5], title="Anch. Sépalos (cm)"),
                xaxis3=dict(domain=[0.55, 0.75], title="Long. Pétalos (cm)"),
                xaxis4=dict(domain=[0.8, 1.0], title="Anch. Pétalos (cm)"),
                yaxis=dict(title="Número de ejemplares"),
                title="Distribución de los rasgos de las diferentes flores Iris")
fig = Figure(data=data, layout=layout)
py.iplot(fig)

In [57]:
from sklearn.preprocessing import StandardScaler

In [58]:
# Standardize every feature column: the scaler is fitted on x and applied to
# it in a single step.
X_std = StandardScaler().fit_transform(x)

In [59]:
# The Chart Studio credentials are persisted by the set_credentials_file call
# made earlier in this notebook, so the API key is not repeated in this cell.

# One colour per species, shared by the four histograms.
colors = {'setosa': 'rgb(255,127,20)',
          'versicolor': 'rgb(31, 220, 120)',
          'virginica': 'rgb(44, 50, 180)'}

# Same layout as the raw-data histograms, but drawn from the standardized
# matrix X_std. Legend entries are emitted only for the first feature so that
# every species appears exactly once in the legend.
traces = []
for col in range(4):
    for key in colors:
        traces.append(Histogram(x=X_std[y == key, col], opacity=0.7,
                                xaxis="x%s" % (col + 1),
                                marker=dict(color=colors[key]),
                                name=key, showlegend=(col == 0)))

# Standardized values are unitless, so the axis titles no longer claim "cm".
# Plain lists/dicts replace the deprecated Data/Marker/XAxis/YAxis wrappers.
data = list(traces)
layout = Layout(barmode="overlay",
                xaxis=dict(domain=[0, 0.25], title="Long. Sépalos (estandarizada)"),
                xaxis2=dict(domain=[0.3, 0.5], title="Anch. Sépalos (estandarizada)"),
                xaxis3=dict(domain=[0.55, 0.75], title="Long. Pétalos (estandarizada)"),
                xaxis4=dict(domain=[0.8, 1.0], title="Anch. Pétalos (estandarizada)"),
                yaxis=dict(title="Número de ejemplares"),
                title="Distribución de los rasgos estandarizados de las diferentes flores Iris")
fig = Figure(data=data, layout=layout)
py.iplot(fig)

## 1.- Calculamos la descomposición de valores y vectores propios
##### a) Usando la Matriz de Covarianzas

In [60]:
from IPython.display import display, Math, Latex

In [61]:
# Sample covariance between features j and k. The sum runs over the n
# observations, matching the 1/(n-1) normalisation.
display(Math(r'\sigma_{jk} = \frac{1}{n-1}\sum_{i=1}^n (x_{ij} - \overline{x_j})(x_{ik} - \overline{x_k})'))

<IPython.core.display.Math object>

In [62]:
# Render the covariance matrix in matrix form: the centred data matrix
# multiplied by its transpose, scaled by 1/(n-1).
display(Math(r'\Sigma = \frac{1}{n-1}((X-\overline{x})^T(X-\overline{x}))'))

<IPython.core.display.Math object>

In [63]:
# Mean vector: the average (not the plain sum) of the n observations, with one
# entry per feature.
display(Math(r'\overline{x} = \frac{1}{n}\sum_{i=1}^n x_i\in \mathbb R^m'))

<IPython.core.display.Math object>

In [64]:
import numpy as np

In [65]:
# Column-wise mean of the standardized data (one value per feature); the
# values are numerically zero because X_std is already centred.
mean_vect = np.mean(X_std, axis=0)
mean_vect

array([-4.73695157e-16, -7.81597009e-16, -4.26325641e-16, -4.73695157e-16])

In [66]:
# Sample covariance matrix computed by hand: centre the data, multiply the
# centred matrix by its transpose and divide by (number of samples - 1).
centered = X_std - mean_vect
n_samples = X_std.shape[0]
cov_matrix = centered.T.dot(centered) / (n_samples - 1)
print(f"La matriz de covarianzas es: \n{cov_matrix}")

La matriz de covarianzas es: 
[[ 1.00671141 -0.11835884  0.87760447  0.82343066]
 [-0.11835884  1.00671141 -0.43131554 -0.36858315]
 [ 0.87760447 -0.43131554  1.00671141  0.96932762]
 [ 0.82343066 -0.36858315  0.96932762  1.00671141]]


In [67]:
# Cross-check with NumPy's estimator; rowvar=False treats each column of
# X_std as a variable and each row as an observation.
np.cov(X_std, rowvar=False)

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

In [68]:
# Eigen-decomposition of the covariance matrix: eigenvalues go to eig_vals and
# the matching eigenvectors are the columns of eig_vectors.
eig_vals, eig_vectors = np.linalg.eig(cov_matrix)
print(f"Valores propios: \n{eig_vals}")
print(f"Vectores propios: \n{eig_vectors}")

Valores propios: 
[2.93808505 0.9201649  0.14774182 0.02085386]
Vectores propios: 
[[ 0.52106591 -0.37741762 -0.71956635  0.26128628]
 [-0.26934744 -0.92329566  0.24438178 -0.12350962]
 [ 0.5804131  -0.02449161  0.14212637 -0.80144925]
 [ 0.56485654 -0.06694199  0.63427274  0.52359713]]


##### b) Usando la Matriz  de Correlaciones

In [69]:
# Correlation matrix of the features; rowvar=False treats each column of
# X_std as a variable and each row as an observation.
corr_matrix = np.corrcoef(X_std, rowvar=False)
corr_matrix

array([[ 1.        , -0.11756978,  0.87175378,  0.81794113],
       [-0.11756978,  1.        , -0.4284401 , -0.36612593],
       [ 0.87175378, -0.4284401 ,  1.        ,  0.96286543],
       [ 0.81794113, -0.36612593,  0.96286543,  1.        ]])

In [70]:
# Eigen-decomposition of the correlation matrix. This rebinds eig_vals and
# eig_vectors, so later cells work with the correlation-based decomposition.
eig_vals, eig_vectors = np.linalg.eig(corr_matrix)
print(f"Valores propios: \n{eig_vals}")
print(f"Vectores propios: \n{eig_vectors}")

Valores propios: 
[2.91849782 0.91403047 0.14675688 0.02071484]
Vectores propios: 
[[ 0.52106591 -0.37741762 -0.71956635  0.26128628]
 [-0.26934744 -0.92329566  0.24438178 -0.12350962]
 [ 0.5804131  -0.02449161  0.14212637 -0.80144925]
 [ 0.56485654 -0.06694199  0.63427274  0.52359713]]


##### c) Singular Value Decomposition

In [71]:
# Singular value decomposition of the transposed data matrix
# (features x samples).
# NOTE(review): np.linalg.svd returns (U, S, Vh). The principal directions are
# the columns of `u`; `v` holds the right singular vectors, which live in
# sample space. Displaying `u` may have been the intent here; confirm.
u, s, v = np.linalg.svd(X_std.T)
v

array([[ 1.08239531e-01,  9.94577561e-02,  1.12996303e-01, ...,
        -7.27030413e-02, -6.56112167e-02, -4.59137323e-02],
       [-4.09957970e-02,  5.75731483e-02,  2.92000319e-02, ...,
        -2.29793601e-02, -8.63643414e-02,  2.07800179e-03],
       [ 2.72186462e-02,  5.00034005e-02, -9.42089147e-03, ...,
        -3.84023516e-02, -1.98939364e-01, -1.12588405e-01],
       ...,
       [ 5.43380310e-02,  5.12936114e-03,  2.75184277e-02, ...,
         9.89532683e-01, -1.41206665e-02, -8.30595907e-04],
       [ 1.96438400e-03,  8.48544595e-02,  1.78604309e-01, ...,
        -1.25488246e-02,  9.52049996e-01, -2.19201906e-02],
       [ 2.46978090e-03,  5.83496936e-03,  1.49419118e-01, ...,
        -7.17729676e-04, -2.32048811e-02,  9.77300244e-01]])

## 2.- Las componentes principales

In [72]:
# np.linalg.eig returns the eigenvectors as the COLUMNS of eig_vectors, so
# iterate over the transpose to measure each eigenvector. Iterating the matrix
# directly would walk its rows instead.
for ev in eig_vectors.T:
    print("La longitud del VP es: %s" %np.linalg.norm(ev))

La longitud del VP es: 0.9999999999999994
La longitud del VP es: 1.0
La longitud del VP es: 1.0000000000000002
La longitud del VP es: 1.0000000000000002


In [73]:
# Pair each eigenvalue magnitude with its eigenvector. The eigenvectors are
# the columns of eig_vectors, i.e. the rows of its transpose.
eig_pairs = list(zip(np.abs(eig_vals), eig_vectors.T))
eig_pairs

[(2.918497816531994,
  array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654])),
 (0.9140304714680692,
  array([-0.37741762, -0.92329566, -0.02449161, -0.06694199])),
 (0.14675687557131487,
  array([-0.71956635,  0.24438178,  0.14212637,  0.63427274])),
 (0.02071483642861998,
  array([ 0.26128628, -0.12350962, -0.80144925,  0.52359713]))]

### Ordenamos los vectores propios con valor propio de mayor a menor

In [74]:
# Sort by eigenvalue only, largest first. Comparing whole (value, vector)
# tuples falls back to comparing the NumPy arrays whenever two eigenvalues
# tie, which raises "truth value of an array ... is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
eig_pairs

[(2.918497816531994,
  array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654])),
 (0.9140304714680692,
  array([-0.37741762, -0.92329566, -0.02449161, -0.06694199])),
 (0.14675687557131487,
  array([-0.71956635,  0.24438178,  0.14212637,  0.63427274])),
 (0.02071483642861998,
  array([ 0.26128628, -0.12350962, -0.80144925,  0.52359713]))]

In [75]:
# Print the eigenvalues following the current order of eig_pairs.
print("Valores propios en orden descendente:")
for eigenvalue, _eigenvector in eig_pairs:
    print(eigenvalue)

Valores propios en orden descendente:
2.918497816531994
0.9140304714680692
0.14675687557131487
0.02071483642861998


In [76]:
# Percentage of the total variance carried by each component (largest first)
# and the running total of those percentages.
total_sum = sum(eig_vals)
eig_vals_desc = sorted(eig_vals, reverse=True)
var_exp = [100 * (val / total_sum) for val in eig_vals_desc]
cum_var_exp = np.cumsum(var_exp)

In [77]:
# Shared x labels "CP 1", "CP 2", ... with one label per principal component,
# built once and sized from the data instead of a hard-coded count.
component_labels = ["CP %s" % i for i in range(1, len(var_exp) + 1)]

# Bars show the variance explained by each component; the line shows the
# cumulative percentage.
plot1 = Bar(x=component_labels, y=var_exp, showlegend=False)
plot2 = Scatter(x=component_labels, y=cum_var_exp, showlegend=True,
                name="% de Varianza Explicada Acumulada")

# A plain list and dicts replace the deprecated Data/XAxis/YAxis wrappers.
data = [plot1, plot2]

layout = Layout(xaxis=dict(title="Componentes principales"),
                yaxis=dict(title="Porcentaje de varianza explicada"),
                title="Porcentaje de variabilidad explicada por cada componente principal")

fig = Figure(data, layout)
py.iplot(fig)

In [78]:
# Projection matrix W: the eigenvectors of the two largest eigenvalues stacked
# as columns. column_stack takes the row count from the vectors themselves,
# so the number of features is no longer hard-coded.
W = np.column_stack((eig_pairs[0][1], eig_pairs[1][1]))
W

array([[ 0.52106591, -0.37741762],
       [-0.26934744, -0.92329566],
       [ 0.5804131 , -0.02449161],
       [ 0.56485654, -0.06694199]])

In [79]:
# Show the first sample of the raw (non-standardized) feature matrix.
x[0]

array([5.1, 3.5, 1.4, 0.2])

## 3.- Proyectando las variables en el nuevo subespacio vectorial

In [80]:
# Render the projection formula together with the matrix dimensions used in
# this notebook (150 samples, 4 features, 2 components).
display(Math(r'Y = X \cdot W, X \in M(\mathbb R)_{150, 4}, W \in M(\mathbb R)_{4,2}, Y \in M(\mathbb R)_{150, 2}'))

<IPython.core.display.Math object>

In [81]:
# Project the standardized samples onto the columns of W.
Y = X_std.dot(W)
Y

array([[-2.26470281, -0.4800266 ],
       [-2.08096115,  0.67413356],
       [-2.36422905,  0.34190802],
       [-2.29938422,  0.59739451],
       [-2.38984217, -0.64683538],
       [-2.07563095, -1.48917752],
       [-2.44402884, -0.0476442 ],
       [-2.23284716, -0.22314807],
       [-2.33464048,  1.11532768],
       [-2.18432817,  0.46901356],
       [-2.1663101 , -1.04369065],
       [-2.32613087, -0.13307834],
       [-2.2184509 ,  0.72867617],
       [-2.6331007 ,  0.96150673],
       [-2.1987406 , -1.86005711],
       [-2.26221453, -2.68628449],
       [-2.2075877 , -1.48360936],
       [-2.19034951, -0.48883832],
       [-1.898572  , -1.40501879],
       [-2.34336905, -1.12784938],
       [-1.914323  , -0.40885571],
       [-2.20701284, -0.92412143],
       [-2.7743447 , -0.45834367],
       [-1.81866953, -0.08555853],
       [-2.22716331, -0.13725446],
       [-1.95184633,  0.62561859],
       [-2.05115137, -0.24216355],
       [-2.16857717, -0.52714953],
       [-2.13956345,

In [82]:
# One scatter trace per species, drawn in the plane of the first two
# principal components.
results = []

for name in ('setosa', 'versicolor', 'virginica'):
    species_mask = y == name
    result = Scatter(x=Y[species_mask, 0], y=Y[species_mask, 1],
                     mode="markers", name=name,
                     marker=dict(size=12,
                                 line=dict(color='rgba(220,220,220,0.15)', width=0.5),
                                 opacity=0.8))
    results.append(result)

# For a 2-D scatter the axis titles belong directly on the layout. Nesting
# them under `scene` only applies to 3-D traces, so they were never displayed.
# Plain lists/dicts replace the deprecated Data/Marker/Line/XAxis/YAxis/Scene
# wrappers.
data = list(results)
layout = Layout(showlegend=True,
                xaxis=dict(title="Componente Principal 1"),
                yaxis=dict(title="Componente Principal 2"))

fig = Figure(data=data, layout=layout)
py.iplot(fig)