<a href="https://colab.research.google.com/github/AnabelBerumen/DataScientist/blob/main/modelos/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Análisis de componentes principales - Paso a Paso

* Estandarizar los datos para cada una de las n observaciones
* Obtener los vectores y valores propios a partir de la matriz de covarianzas o de correlaciones, o incluso mediante la técnica de descomposición en valores singulares (singular value decomposition).
* Ordenar los valores propios en orden descendente y quedarnos con los *p* que se correspondan a los *p* mayores y así disminuir el número de variables del dataset (p<m)

In [None]:
import pandas as pd

In [None]:
# Read the Iris CSV from the Colab working directory and preview its first three rows.
df = pd.read_csv('/content/iris.csv')
df.iloc[:3]

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


In [None]:
# First four columns are the numeric features; the fifth column is the species label.
# NOTE: `Y` is rebound to the PCA projection in a later cell.
X, Y = df.iloc[:, :4].values, df.iloc[:, 4].values

In [None]:
!pip install chart-studio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import chart_studio.plotly as py
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

# Needed to export charts to Chart Studio, which requires registering;
# fill in real credentials before exporting.
import chart_studio
chart_studio.tools.set_credentials_file(username="", api_key="")

# Standardize the features so they are centered on 0.
X_std = StandardScaler().fit_transform(X)

# One colour per species, shared by the four histogram panels.
colors = {'setosa': 'rgb(255,127,20)',
          'versicolor': 'rgb(31, 220, 120)',
          'virginica': 'rgb(44, 50, 180)'}

# One histogram per (feature, species) pair. Feature `col` is drawn on x-axis
# "x<col+1>"; only the first panel contributes legend entries, so each species
# appears once in the legend.
traces = [
    go.Histogram(x=X_std[Y == key, col], opacity=0.7,
                 xaxis="x%s" % (col + 1),
                 marker=go.histogram.Marker(color=color),
                 name=key, showlegend=(col == 0))
    for col in range(4)
    for key, color in colors.items()
]

# The plotted values are the standardized features, so the axis titles say so
# instead of claiming centimetres.
layout = go.Layout(barmode="overlay",
                   xaxis=go.layout.XAxis(domain=[0, 0.25], title="Long. Sépalos (estandarizada)"),
                   xaxis2=go.layout.XAxis(domain=[0.3, 0.5], title="Anch. Sépalos (estandarizada)"),
                   xaxis3=go.layout.XAxis(domain=[0.55, 0.75], title="Long. Pétalos (estandarizada)"),
                   xaxis4=go.layout.XAxis(domain=[0.8, 1.0], title="Anch. Pétalos (estandarizada)"),
                   yaxis=go.layout.YAxis(title="Número de ejemplares"),
                   title="Distribución de los rasgos de las diferentes flores Iris")

fig = go.Figure(data=traces, layout=layout)
fig.show()

## 1- Calculamos la descomposición de valores y vectores propios

In [None]:
from IPython.display import display, Math, Latex

In [None]:
# Sample covariance between features j and k; the sum runs over the n
# observations, matching the 1/(n-1) normalisation.
display(Math(r'\sigma_{jk} = \frac{1}{n-1}\sum_{i=1}^n (x_{ij} - \overline{x_j})(x_{ik} - \overline{x_k})'))

<IPython.core.display.Math object>

In [None]:
# Covariance matrix (named cov_matrix in the code) written in matrix form.
cov_matrix_latex = r'\Sigma =\frac{1}{n-1}((X-\overline{x})^T(X-\overline{x}))'
display(Math(cov_matrix_latex))

<IPython.core.display.Math object>

In [None]:
# Mean vector: the average of the n observations, with one entry per feature (R^m).
display(Math(r'\overline{x} = \frac{1}{n}\sum_{i=1}^n x_i\in \mathbb R^m'))

<IPython.core.display.Math object>

In [None]:
import numpy as np

In [None]:
# Column-wise mean of the standardized data: one value per each of the 4 features.
mean_vect = X_std.mean(axis=0)
mean_vect

array([-4.73695157e-16, -7.81597009e-16, -4.26325641e-16, -4.73695157e-16])

In [None]:
# Sample covariance matrix: cross-product of the centered data divided by (n - 1).
# The centered matrix is computed once instead of twice.
centered = X_std - mean_vect
cov_matrix = centered.T.dot(centered) / (X_std.shape[0] - 1)
print('La matrix de covarianzas es \n%s'%cov_matrix)

La matrix de covarianzas es 
[[ 1.00671141 -0.11835884  0.87760447  0.82343066]
 [-0.11835884  1.00671141 -0.43131554 -0.36858315]
 [ 0.87760447 -0.43131554  1.00671141  0.96932762]
 [ 0.82343066 -0.36858315  0.96932762  1.00671141]]


In [None]:
# Cross-check of the manual result with NumPy (columns are the variables).
np.cov(X_std, rowvar=False)

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

In [None]:
# Eigen-decomposition of the covariance matrix. Later cells read each
# eigenvector as a column, eig_vectors[:, i].
eig_vals, eig_vectors = np.linalg.eig(cov_matrix)
print(f'Valores propios\n{eig_vals}')
print(f'Vectores propios\n{eig_vectors}')

Valores propios
[2.93808505 0.9201649  0.14774182 0.02085386]
Vectores propios
[[ 0.52106591 -0.37741762 -0.71956635  0.26128628]
 [-0.26934744 -0.92329566  0.24438178 -0.12350962]
 [ 0.5804131  -0.02449161  0.14212637 -0.80144925]
 [ 0.56485654 -0.06694199  0.63427274  0.52359713]]


b) Usando la matriz de correlaciones

In [None]:
# Correlation matrix of the standardized features (columns are the variables).
corr_matrix = np.corrcoef(X_std, rowvar=False)
corr_matrix

array([[ 1.        , -0.11756978,  0.87175378,  0.81794113],
       [-0.11756978,  1.        , -0.4284401 , -0.36612593],
       [ 0.87175378, -0.4284401 ,  1.        ,  0.96286543],
       [ 0.81794113, -0.36612593,  0.96286543,  1.        ]])

In [None]:
# Eigen-decomposition of the correlation matrix.
eig_vals_corr, eig_vectors_corr = np.linalg.eig(corr_matrix)
print(f'Valores propios\n{eig_vals_corr}')
print(f'Vectores propios\n{eig_vectors_corr}')

Valores propios
[2.91849782 0.91403047 0.14675688 0.02071484]
Vectores propios
[[ 0.52106591 -0.37741762 -0.71956635  0.26128628]
 [-0.26934744 -0.92329566  0.24438178 -0.12350962]
 [ 0.5804131  -0.02449161  0.14212637 -0.80144925]
 [ 0.56485654 -0.06694199  0.63427274  0.52359713]]


In [None]:
# Same correlation matrix computed from the raw (non-standardized) data.
# NOTE: this rebinds corr_matrix.
corr_matrix = np.corrcoef(X, rowvar=False)
corr_matrix

array([[ 1.        , -0.11756978,  0.87175378,  0.81794113],
       [-0.11756978,  1.        , -0.4284401 , -0.36612593],
       [ 0.87175378, -0.4284401 ,  1.        ,  0.96286543],
       [ 0.81794113, -0.36612593,  0.96286543,  1.        ]])

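La descomposición de la matriz de covarianzas de los datos estandarizados produce los mismos vectores propios que la de la matriz de correlaciones (calculada con o sin estandarización); los valores propios solo difieren en el factor constante n/(n-1).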


c) singular value decomposition

In [None]:
# SVD of the transposed standardized data (features x observations); only u is shown.
u, s, v = np.linalg.svd(np.transpose(X_std))
u

array([[-0.52106591, -0.37741762,  0.71956635,  0.26128628],
       [ 0.26934744, -0.92329566, -0.24438178, -0.12350962],
       [-0.5804131 , -0.02449161, -0.14212637, -0.80144925],
       [-0.56485654, -0.06694199, -0.63427274,  0.52359713]])

2.- Las componentes principales

In [None]:
# np.linalg.eig stores each eigenvector as a COLUMN of eig_vectors, so iterate
# over the transpose; iterating the matrix directly would walk its rows.
for ev in eig_vectors.T:
  print('La longitud del vector propio(VP) es: %s'%np.linalg.norm(ev))

La longitud del vector propio(VP) es: 0.9999999999999999
La longitud del vector propio(VP) es: 1.0000000000000002
La longitud del vector propio(VP) es: 0.9999999999999999
La longitud del vector propio(VP) es: 1.0


In [None]:
# Pair each eigenvalue magnitude with its eigenvector (a column of eig_vectors).
eigen_pairs = list(zip(np.abs(eig_vals), eig_vectors.T))
eigen_pairs

[(2.9380850501999953,
  array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654])),
 (0.9201649041624852,
  array([-0.37741762, -0.92329566, -0.02449161, -0.06694199])),
 (0.14774182104494768,
  array([-0.71956635,  0.24438178,  0.14212637,  0.63427274])),
 (0.020853862176463143,
  array([ 0.26128628, -0.12350962, -0.80144925,  0.52359713]))]

Ordenamos los vectores propios según su valor propio, de mayor a menor.

In [None]:
# Sort by eigenvalue only, largest first. A bare .sort() compares whole tuples,
# so two equal eigenvalues would fall through to comparing the ndarrays and
# raise "The truth value of an array ... is ambiguous".
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)
eigen_pairs

[(2.9380850501999953,
  array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654])),
 (0.9201649041624852,
  array([-0.37741762, -0.92329566, -0.02449161, -0.06694199])),
 (0.14774182104494768,
  array([-0.71956635,  0.24438178,  0.14212637,  0.63427274])),
 (0.020853862176463143,
  array([ 0.26128628, -0.12350962, -0.80144925,  0.52359713]))]

In [None]:
# List the eigenvalues in the order eigen_pairs currently holds them.
print('Valores propios en orden descendente')
for eigenvalue, _eigenvector in eigen_pairs:
  print(eigenvalue)

Valores propios en orden descendente
2.9380850501999953
0.9201649041624852
0.14774182104494768
0.020853862176463143


In [None]:
# Percentage of the total variance explained by each component (largest
# first) and its running total.
total_sum = sum(eig_vals)
eig_vals_desc = sorted(eig_vals, reverse=True)
var_exp = [(val / total_sum) * 100 for val in eig_vals_desc]
cum_var_exp = np.cumsum(var_exp)

In [None]:
# Bars: variance explained by each component; line: cumulative percentage.
component_labels = [f'CP {i}' for i in range(1, 5)]

plot1 = go.Bar(x=component_labels, y=var_exp, showlegend=False)
plot2 = go.Scatter(x=component_labels, y=cum_var_exp, showlegend=True,
                   name='% Varianza Explicada acumulada')
data = [plot1, plot2]

layout = go.Layout(
    xaxis=go.layout.XAxis(title='Componentes principales'),
    yaxis=go.layout.YAxis(title='Porcentaje de varianza explicada'),
    title='Porcentaje de variabilidad explicada por cada componente principal',
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Projection matrix: the first two eigenvectors in eigen_pairs, placed side by side as columns.
W = np.hstack([vector.reshape(-1, 1) for _, vector in eigen_pairs[:2]])
W # 4 x 2 matrix

array([[ 0.52106591, -0.37741762],
       [-0.26934744, -0.92329566],
       [ 0.5804131 , -0.02449161],
       [ 0.56485654, -0.06694199]])

In [None]:
# First observation of the raw (non-standardized) feature matrix.
X[0, :]

array([5.1, 3.5, 1.4, 0.2])

3- Proyectando las variables en el nuevo subespacio vectorial

In [None]:
 # Projection onto the first two components, with the shape of each matrix.
 display(Math(r'Y = X \cdot W, X \in M(\mathbb R)_{150, 4}, W \in M(\mathbb R)_{4,2}, Y \in M(\mathbb R)_{150,2}'))

<IPython.core.display.Math object>

In [None]:
# Project the standardized data onto the two selected components.
# NOTE: this rebinds Y, which held the species labels until now.
Y = np.dot(X_std, W)
Y

array([[-2.26470281, -0.4800266 ],
       [-2.08096115,  0.67413356],
       [-2.36422905,  0.34190802],
       [-2.29938422,  0.59739451],
       [-2.38984217, -0.64683538],
       [-2.07563095, -1.48917752],
       [-2.44402884, -0.0476442 ],
       [-2.23284716, -0.22314807],
       [-2.33464048,  1.11532768],
       [-2.18432817,  0.46901356],
       [-2.1663101 , -1.04369065],
       [-2.32613087, -0.13307834],
       [-2.2184509 ,  0.72867617],
       [-2.6331007 ,  0.96150673],
       [-2.1987406 , -1.86005711],
       [-2.26221453, -2.68628449],
       [-2.2075877 , -1.48360936],
       [-2.19034951, -0.48883832],
       [-1.898572  , -1.40501879],
       [-2.34336905, -1.12784938],
       [-1.914323  , -0.40885571],
       [-2.20701284, -0.92412143],
       [-2.7743447 , -0.45834367],
       [-1.81866953, -0.08555853],
       [-2.22716331, -0.13725446],
       [-1.95184633,  0.62561859],
       [-2.05115137, -0.24216355],
       [-2.16857717, -0.52714953],
       [-2.13956345,

In [None]:
import plotly.graph_objects as go  # plotly figures (redundant re-import, harmless)

# Species labels are taken straight from the DataFrame: `Y` now holds the
# projected coordinates and `y` is not assigned until the scikit-learn section
# further down, so using `y` here raises NameError on Restart & Run All.
species = df.iloc[:, 4].values

# One scatter trace per species on the first two principal components.
results = []
for name in ('setosa', 'versicolor', 'virginica'):
    mask = species == name
    result = go.Scatter(
        x=Y[mask, 0],
        y=Y[mask, 1],
        mode="markers",
        name=name,
        marker=go.scatter.Marker(
            size=12,
            line=go.scatter.marker.Line(color="rgba(225,225,225,0.2)",
                                        width=0.5),
            opacity=0.50,
        ),
    )
    results.append(result)

layout = go.Layout(xaxis=go.layout.XAxis(title="CP1", showline=False),
                   yaxis=go.layout.YAxis(title="CP2", showline=False))

fig = go.Figure(data=results, layout=layout)
fig.show()

## Análisis de componentes principales - Sklearn

In [None]:
import pandas as pd

import chart_studio.plotly as py
import plotly.graph_objects as go


from plotly.graph_objs import *
import plotly.tools as tls

from sklearn.preprocessing import StandardScaler

# Disabled alternative through plotly.tools (imported above as `tls`); presumably
# the legacy form of the call below — TODO confirm and delete if unused.
# tls.set_credentials_file(username='', api_key='')

# `import chart_studio.plotly as py` in this cell only binds `py`, so the name
# `chart_studio` used here relies on the `import chart_studio` executed in the
# first section of the notebook. Credentials are blank placeholders.
chart_studio.tools.set_credentials_file(username="", api_key="")

In [None]:
# Re-read the Iris CSV for the scikit-learn section (same path as the first section).
df = pd.read_csv('/content/iris.csv')

In [None]:
# Features (first four columns) and species labels (fifth column), then
# standardize the features.
X, y = df.iloc[:, :4].values, df.iloc[:, 4].values
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [None]:
from sklearn.decomposition import PCA as sk_pca

In [None]:
# Fit a 2-component PCA on the standardized data and project it in one step.
# NOTE: rebinds Y to the scikit-learn projection.
acp = sk_pca(n_components=2)
Y = acp.fit_transform(X_std)

In [None]:
import plotly.graph_objects as go  # plotly figures

# One scatter trace per species on the two scikit-learn principal components.
results = [
    go.Scatter(
        x=Y[y == name, 0],
        y=Y[y == name, 1],
        mode="markers",
        name=name,
        marker=go.scatter.Marker(
            size=12,
            line=go.scatter.marker.Line(color="rgba(225,225,225,0.2)",
                                        width=0.5),
            opacity=0.50,
        ),
    )
    for name in ('setosa', 'versicolor', 'virginica')
]

layout = go.Layout(
    xaxis=go.layout.XAxis(title="CP1", showline=False),
    yaxis=go.layout.YAxis(title="CP2", showline=False),
)

fig = go.Figure(data=results, layout=layout)
fig.show()