# Análisis de Componentes Principales - Paso a Paso

* Estandarizar los datos (para cada una de las m observaciones).
* Obtener los vectores y valores propios a partir de la matriz de covarianzas o de correlaciones, o ncluso la técnica de *singular vector descomposition*.
* Ordenar los valores propios en orden descendente y quedarnos con los *p* que se correspondan a los *p* mayores, y así disminuir el número de variables del dataset (p < m).
* Construir la matriz de proyección W a partir de los p vectores propios.
* Transformar el dataset original X a través de W para así obtener datos en el subespacio dimensional de dimensión *p*, que será Y.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../datasets/iris/iris.csv")
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [60]:
colnames = df.columns.values.tolist()
X = df[colnames[:-1]].values
y = df[colnames[-1]].values

In [4]:
X[0]

array([5.1, 3.5, 1.4, 0.2])

In [5]:
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls

In [6]:
tls.set_credentials_file(username="DiegoGV95", api_key="1TOYCdq5tcWXmxS3lxRt")

In [8]:
traces = [] #Dibujo
legend = {0:True, 1:True, 2:True, 3:True}
colors = {
    "setosa": "rgb(255,127,30)",
    "versicolor": "rgb(31,220,120)",
    "virginica": "rgb(44,50,180)"
}

for col in range(4):
    for key in colors:
        traces.append(Histogram(x=X[Y==key, col], opacity=0.7, xaxis="x%s"%(col+1),
                               marker=Marker(color=colors[key]), name=key, showlegend=legend[col]))
    legend = {0:False, 1:False, 2:False, 3:False}
        
data = Data(traces) #Conjunto de datos
layout = Layout(barmode="overlay",
               xaxis = XAxis(domain=[0,0.25], title="Long. Sépalos (cm)"),
               xaxis2 = XAxis(domain=[0.3,0.5], title="Anch. Sépalos (cm)"),
               xaxis3 = XAxis(domain=[0.55,0.75], title="Long. Pétalos (cm)"),
               xaxis4 = XAxis(domain=[0.8,1], title="Anch . Pétalos (cm)"),
               yaxis = YAxis(title="Número de ejemplares"),
               title="Distribución de los rasgos de las diferentes flores Iris") #Distribución

fig = Figure(data=data, layout=layout)
py.iplot(fig)


plotly.graph_objs.Marker is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Marker
  - plotly.graph_objs.histogram.selected.Marker
  - etc.



plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.



plotly.graph_objs.XAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.XAxis
  - plotly.graph_objs.layout.scene.XAxis



plotly.graph_objs.YAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.YAxis
  - plotly.graph_objs.layout.scene.YAxis




In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
X_std = StandardScaler().fit_transform(X)

In [13]:
traces = [] #Dibujo
legend = {0:True, 1:True, 2:True, 3:True}
colors = {
    "setosa": "rgb(255,127,30)",
    "versicolor": "rgb(31,220,120)",
    "virginica": "rgb(44,50,180)"
}

for col in range(4):
    for key in colors:
        traces.append(Histogram(x=X_std[Y==key, col], opacity=0.7, xaxis="x%s"%(col+1),
                               marker=Marker(color=colors[key]), name=key, showlegend=legend[col]))
    legend = {0:False, 1:False, 2:False, 3:False}
        
data = Data(traces) #Conjunto de datos
layout = Layout(barmode="overlay",
               xaxis = XAxis(domain=[0,0.25], title="Long. Sépalos (cm)"),
               xaxis2 = XAxis(domain=[0.3,0.5], title="Anch. Sépalos (cm)"),
               xaxis3 = XAxis(domain=[0.55,0.75], title="Long. Pétalos (cm)"),
               xaxis4 = XAxis(domain=[0.8,1], title="Anch . Pétalos (cm)"),
               yaxis = YAxis(title="Número de ejemplares"),
               title="Distribución de los rasgos de las diferentes flores Iris") #Distribución

fig = Figure(data=data, layout=layout)
py.iplot(fig)


plotly.graph_objs.Marker is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Marker
  - plotly.graph_objs.histogram.selected.Marker
  - etc.



plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.



plotly.graph_objs.XAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.XAxis
  - plotly.graph_objs.layout.scene.XAxis



plotly.graph_objs.YAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.YAxis
  - plotly.graph_objs.layout.scene.YAxis




### 1 - Calculamos la descomposición de valores y vectores propios
##### a) Usando la Matriz de Covarianzas

In [None]:
from IPython.display import display, Math, Latex

In [16]:
display(Math(r"\sigma_{jk} = \frac{1}{n-1}\sum{i=1}^m (x_{ij} - \overline{x_j})(x_{ik} - \overline{x_k})"))

<IPython.core.display.Math object>

In [17]:
display(Math(r"\Sigma = \frac{1}{n-1}((X-\overline{x})^T(X-\overline{x}))"))

<IPython.core.display.Math object>

In [18]:
display(Math(r"\overline{x} = \sum{i=1}^n x_i \in \mathbb R^m"))

<IPython.core.display.Math object>

In [19]:
import numpy as np

In [22]:
mean_vect = np.mean(X_std, axis=0)
mean_vect

array([-4.73695157e-16, -7.81597009e-16, -4.26325641e-16, -4.73695157e-16])

In [26]:
#Matriz de covarianzas
cov_matrix = (X_std - mean_vect).T.dot((X_std - mean_vect)/X_std.shape[0]-1)
print("La matriz de covarianzas es:\n\n{}".format(cov_matrix))

La matriz de covarianzas es:

[[ 1.         -0.11756978  0.87175378  0.81794113]
 [-0.11756978  1.         -0.4284401  -0.36612593]
 [ 0.87175378 -0.4284401   1.          0.96286543]
 [ 0.81794113 -0.36612593  0.96286543  1.        ]]


In [28]:
#Matriz de covarianzas directamente con numpy
np.cov(X_std.T)

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

In [29]:
#Valores propios y vectores propios
eig_values, eig_vectors = np.linalg.eig(cov_matrix)
print("Valores propios:\n{}".format(eig_values))
print("Vectores propios:\n{}".format(eig_vectors))

Valores propios:
[2.91849782 0.91403047 0.14675688 0.02071484]
Vectores propios:
[[ 0.52106591 -0.37741762 -0.71956635  0.26128628]
 [-0.26934744 -0.92329566  0.24438178 -0.12350962]
 [ 0.5804131  -0.02449161  0.14212637 -0.80144925]
 [ 0.56485654 -0.06694199  0.63427274  0.52359713]]


##### b) Usando la Matriz de Correlaciones

In [30]:
corr_matrix = np.corrcoef(X_std.T)
corr_matrix

array([[ 1.        , -0.11756978,  0.87175378,  0.81794113],
       [-0.11756978,  1.        , -0.4284401 , -0.36612593],
       [ 0.87175378, -0.4284401 ,  1.        ,  0.96286543],
       [ 0.81794113, -0.36612593,  0.96286543,  1.        ]])

In [31]:
eig_values_corr, eig_vectors_corr = np.linalg.eig(corr_matrix)
print("Valores propios:\n{}".format(eig_values_corr))
print("Vectores propios:\n{}".format(eig_vectors_corr))

Valores propios:
[2.91849782 0.91403047 0.14675688 0.02071484]
Vectores propios:
[[ 0.52106591 -0.37741762 -0.71956635  0.26128628]
 [-0.26934744 -0.92329566  0.24438178 -0.12350962]
 [ 0.5804131  -0.02449161  0.14212637 -0.80144925]
 [ 0.56485654 -0.06694199  0.63427274  0.52359713]]


In [32]:
corr_matrix = np.corrcoef(X.T)
corr_matrix

array([[ 1.        , -0.11756978,  0.87175378,  0.81794113],
       [-0.11756978,  1.        , -0.4284401 , -0.36612593],
       [ 0.87175378, -0.4284401 ,  1.        ,  0.96286543],
       [ 0.81794113, -0.36612593,  0.96286543,  1.        ]])

##### c) Singular Value Decomposition

In [34]:
u, s, v = np.linalg.svd(X_std.T)
u #Matriz de vectores propios

array([[-0.52106591, -0.37741762,  0.71956635,  0.26128628],
       [ 0.26934744, -0.92329566, -0.24438178, -0.12350962],
       [-0.5804131 , -0.02449161, -0.14212637, -0.80144925],
       [-0.56485654, -0.06694199, -0.63427274,  0.52359713]])

In [35]:
s

array([20.92306556, 11.7091661 ,  4.69185798,  1.76273239])

### 2 - Las componentes principales

In [37]:
for ev in eig_vectors:
    print("La longitud del VP es: {}".format(np.linalg.norm(ev)))

La longitud del VP es: 1.0000000000000846
La longitud del VP es: 1.0000000000000049
La longitud del VP es: 1.0000000000000422
La longitud del VP es: 0.9999999999998683


In [40]:
eigen_pairs = [(np.abs(eig_values[i]), eig_vectors[:,i]) for i in range(len(eig_values))]
eigen_pairs

[(2.9184978165319424,
  array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654])),
 (0.9140304714680618,
  array([-0.37741762, -0.92329566, -0.02449161, -0.06694199])),
 (0.14675687557129835,
  array([-0.71956635,  0.24438178,  0.14212637,  0.63427274])),
 (0.02071483642864015,
  array([ 0.26128628, -0.12350962, -0.80144925,  0.52359713]))]

Ordenamos los vectores propios con valor propio de mayor a menor

In [42]:
eigen_pairs.sort()
eigen_pairs.reverse()
eigen_pairs

[(2.9184978165319424,
  array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654])),
 (0.9140304714680618,
  array([-0.37741762, -0.92329566, -0.02449161, -0.06694199])),
 (0.14675687557129835,
  array([-0.71956635,  0.24438178,  0.14212637,  0.63427274])),
 (0.02071483642864015,
  array([ 0.26128628, -0.12350962, -0.80144925,  0.52359713]))]

In [43]:
print("Valores propios en orden descendente:")
for ep in eigen_pairs:
    print(ep[0])

Valores propios en orden descendente:
2.9184978165319424
0.9140304714680618
0.14675687557129835
0.02071483642864015


In [48]:
total_sum = sum(eig_values)
var_exp = [(i/total_sum)*100 for i in sorted(eig_values, reverse=True)]
cum_var_exp = np.cumsum(var_exp) #Suma acumulada de la varianza

In [47]:
var_exp

[72.9624454132996, 22.850761786701874, 3.6689218892825117, 0.5178709107160112]

In [49]:
cum_var_exp

array([ 72.96244541,  95.8132072 ,  99.48212909, 100.        ])

In [52]:
plot1 = Bar(x=["CP {}".format(i) for i in range(1,5)], y=var_exp, showlegend=False)
plot2 = Scatter(x=["CP {}".format(i) for i in range(1,5)], y=cum_var_exp, showlegend=True, name="% de Varianza Explicada Acumulada")

data = Data([plot1, plot2])

layout = Layout(xaxis=XAxis(title="Componentes principales"),
               yaxis = YAxis(title="Porcentaje de varianza explicada"),
               title = "Porcentaje de variabilidad explicada por cada componentes principal")

fig = Figure(data=data, layout=layout)
py.iplot(fig)


plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.



plotly.graph_objs.XAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.XAxis
  - plotly.graph_objs.layout.scene.XAxis



plotly.graph_objs.YAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.YAxis
  - plotly.graph_objs.layout.scene.YAxis




In [53]:
W = np.hstack((eigen_pairs[0][1].reshape(4,1),
              eigen_pairs[1][1].reshape(4,1))) #Coloca uno debajo de otro
W

array([[ 0.52106591, -0.37741762],
       [-0.26934744, -0.92329566],
       [ 0.5804131 , -0.02449161],
       [ 0.56485654, -0.06694199]])

### 3 - Proyectando las variables en el nuevo subespacio vectorial

In [57]:
display(Math(r"Y = X \cdot W, X \in M(\mathbb R)_{150,4}, W \in M(\mathbb R)_{4,2}, Y \in M(\mathbb R)_{150,2}"))

<IPython.core.display.Math object>

In [58]:
Y = X_std.dot(W)

In [63]:
results = []

for name in ("setosa", "versicolor", "virginica"):
    result = Scatter(x=Y[y==name, 0], y=Y[y==name,1], mode="markers", name=name, 
                     marker=Marker(size=12, line=Line(color="rgba(220,220,220,0.15)", width=0.5),
                                  opacity=0.8))
    results.append(result)
    
data = Data(results)
layout = Layout(showlegend=True, scene=Scene(xaxis=XAxis(title="Componente Principal 1"),
                                             yaxis=YAxis(title="Componente Principal 2")))
fig = Figure(data=data, layout=layout)
py.iplot(fig)


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.



plotly.graph_objs.Marker is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Marker
  - plotly.graph_objs.histogram.selected.Marker
  - etc.



plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.



plotly.graph_objs.XAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.XAxis
  - plotly.graph_objs.layout.scene.XAxis



plotly.graph_objs.YAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.YAxis
  - plotly.graph_objs.layout.scene.YAxis



plotl