# Análisis de Componentes Principales - Paso a Paso

* Estandarizar los datos (para cada una de las m variables)
* Obtener los vectores y valores propios a partir de la matriz de covarianzas o de correlaciones, o incluso mediante la técnica de descomposición en valores singulares (Singular Value Decomposition).
* Ordenar los valores propios en orden descendente y quedarnos con los *p* vectores propios que se correspondan con los *p* valores mayores, y así disminuir el número de variables del dataset (p<m)
* Construir la matriz de proyección W a partir de los p vectores propios
* Transformar el dataset original X a través de W para así obtener datos en el subespacio de dimensión *p*, que será Y

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [2]:
# Load the iris dataset into a DataFrame. The path is relative, so it only
# resolves when the notebook runs from a directory where it exists.
df = pd.read_csv("../datasets/iris/iris.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/iris/iris.csv'

# Exploratory Data Analysis

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# (rows, columns) of the DataFrame.
df.shape

In [None]:
# Summary statistics for every column; include='all' also covers
# non-numeric columns.
df.describe(include='all')

In [None]:
# Summary statistics computed separately for each value of 'Species'.
df.groupby(by='Species').describe()

In [None]:
# Sepal length against sepal width, colored by species.
sns.scatterplot(data=df, x='Sepal.Length', y='Sepal.Width', hue='Species')

In [None]:
# Pairwise plot grid of the DataFrame's variables.
g = sns.pairplot(df)
# Resize the whole figure to 10x10 inches.
g.fig.set_size_inches(10, 10)

In [None]:
# Same pairwise plot grid, with points colored by species.
g = sns.pairplot(df, hue='Species')
# Resize the whole figure to 10x10 inches.
g.fig.set_size_inches(10, 10)

In [None]:
# Split the DataFrame into a feature matrix and a label vector.
# to_numpy() is used instead of .values because it always yields a NumPy
# ndarray, whereas .values can return a pandas extension array for some
# dtypes, which the boolean-mask indexing used later does not expect.
X = df.iloc[:, 0:4].to_numpy()  # first four columns: the measurements
y = df.iloc[:, 4].to_numpy()    # fifth column: presumably 'Species' - verify against the CSV

In [None]:
# Shape of the feature matrix.
X.shape

In [None]:
# Shape of the label vector.
y.shape

In [None]:
# First row of the feature matrix.
X[0]

In [None]:
# Annotated correlation heatmap of the numeric columns. numeric_only=True is
# needed because df also holds the non-numeric 'Species' column, which makes
# DataFrame.corr() raise on pandas >= 2.0.
sns.heatmap(df.corr(numeric_only=True), annot=True)

In [None]:
# Standardize each column: subtract its mean, then divide by its standard
# deviation (both taken along axis 0).
X_std = (X - X.mean(axis=0)) / X.std(axis=0)

In [None]:
# First three rows of the standardized data.
X_std[:3, :]

In [None]:
# Column-wise summary statistics of the standardized data.
pd.DataFrame(X_std).describe()

In [None]:
# Work on a copy so df itself is not modified.
df_dummy = df.copy()
# Overwrite the first four columns with the standardized values.
df_dummy.iloc[:, :4] = X_std
df_dummy

In [None]:
# Pairwise plot grid of the standardized copy, colored by species.
g = sns.pairplot(df_dummy, hue='Species')
# Resize the whole figure to 10x10 inches.
g.fig.set_size_inches(10, 10)

### 1- Calculamos la descomposición de valores y vectores propios
##### a) Usando la Matriz de Covarianzas

In [None]:
from IPython.display import display, Math, Latex

In [None]:
# Sample covariance between variables j and k. The sum runs over the n
# observations, matching the 1/(n-1) normalization used in the same formula.
display(Math(r'\sigma_{jk} = \frac{1}{n-1}\sum_{i=1}^n (x_{ij} - \overline{x_j})(x_{ik} - \overline{x_k})'))

In [None]:
# Render the matrix form of the covariance formula.
display(Math(r'\Sigma = \frac{1}{n-1}((X-\overline{x})^T(X-\overline{x}))'))

In [None]:
# Render the definition of the mean vector (average over the n observations).
display(Math(r'\overline{x} = \frac{1}{n}\sum_{i=1}^n x_i\in \mathbb R^m'))

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Column-wise mean of the standardized data.
mean_vect = np.mean(X_std, axis=0)
mean_vect

In [None]:
# First three rows of the standardized data.
X_std[:3, :]

In [None]:
# Shape of the transposed, mean-centered data.
(X_std - mean_vect).T.shape

In [None]:
# Covariance matrix computed by hand: center the data once, multiply the
# centered matrix by itself (transposed first) and divide by rows - 1.
centered = X_std - mean_vect
n_minus_one = X_std.shape[0] - 1
cov_matrix = centered.T.dot(centered) / n_minus_one
print("La matriz de covarianzas es \n%s"%cov_matrix)

In [None]:
# Cross-check with NumPy's covariance estimator, called on the transposed data.
np.cov(X_std.T)

In [None]:
# Eigen-decomposition of the covariance matrix.
# NOTE(review): cov_matrix is built as A.T @ A / k and so should be symmetric;
# np.linalg.eigh may be applicable here - confirm before switching.
eig_vals, eig_vectors = np.linalg.eig(cov_matrix)
print("Valores propios \n%s"%eig_vals)
print("Vectores propios \n%s"%eig_vectors)

##### b) Usando la Matriz de Correlaciones

In [None]:
# Correlation matrix of the standardized data (transposed before the call).
corr_matrix = np.corrcoef(X_std.T)
corr_matrix

In [None]:
# Eigen-decomposition of the correlation matrix, kept under separate names
# from the covariance-based result.
eig_vals_corr, eig_vectors_corr = np.linalg.eig(corr_matrix)
print("Valores propios \n%s"%eig_vals_corr)
print("Vectores propios \n%s"%eig_vectors_corr)

In [None]:
# Recompute the correlation matrix from the raw (non-standardized) data.
# This rebinds corr_matrix.
corr_matrix = np.corrcoef(X.T)
corr_matrix

##### c) Singular Value Decomposition

In [None]:
# Singular value decomposition of the transposed standardized data.
# NOTE(review): NumPy documents the third return value as the conjugate
# transpose (Vh) - confirm before treating `v` as V.
u,s,v = np.linalg.svd(X_std.T)
u

In [None]:
# Second value returned by the SVD call.
s

In [None]:
# Third value returned by the SVD call.
v

### 2 - Las componentes principales

In [None]:
# From here on, work with the decomposition of the correlation matrix.
eig_vals, eig_vectors = eig_vals_corr, eig_vectors_corr

In [None]:
# np.linalg.eig returns the eigenvectors as the COLUMNS of the matrix, so
# iterate over the transpose; iterating the matrix directly walks its rows.
for ev in eig_vectors.T:
    print("La longitud del VP es: %s"%np.linalg.norm(ev))

In [None]:
# Eigenvalues currently in use.
eig_vals

In [None]:
# Pair the magnitude of each eigenvalue with its eigenvector. The eigenvectors
# are the columns of eig_vectors, hence the transpose when iterating.
eigen_pairs = [
    (np.abs(value), vector)
    for value, vector in zip(eig_vals, eig_vectors.T)
]
eigen_pairs

Ordenamos los vectores propios con valor propio de mayor a menor

In [None]:
# Sort by eigenvalue only, largest first. A plain tuple sort falls back to
# comparing the eigenvector arrays when two eigenvalues are equal, which raises
# ValueError because the truth value of an array comparison is ambiguous.
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)
eigen_pairs

In [None]:
# Print only the eigenvalue of each (eigenvalue, eigenvector) pair.
print("Valores propios en orden descendente:")
for value, _vector in eigen_pairs:
    print(value)

In [None]:
# Share of the total that each eigenvalue represents, as a percentage and in
# descending order, plus its running total.
total_sum = sum(eig_vals)
descending_vals = sorted(eig_vals, reverse=True)
var_exp = [(val / total_sum) * 100 for val in descending_vals]
cum_var_exp = np.cumsum(var_exp)

In [None]:
# Percentage contributed by each eigenvalue.
print('El porcentaje de información que cada valor propio aporta es:')
var_exp

In [None]:
# Cumulative percentage, in the same order as var_exp.
print('El porcentaje de información acumulado en orden es:')
cum_var_exp

In [None]:
plt.figure(figsize=(5, 5))

# One label per component, derived from var_exp so the chart keeps working
# when the number of variables is not four.
x = ["CP %s" % i for i in range(1, len(var_exp) + 1)]

# Bars show the individual percentages; the line shows the running total.
plt.bar(x, var_exp)
plt.plot(x, cum_var_exp, '.-', label="% de Varianza Explicada Acumulada", c='orange', linewidth=5, markersize=20)

plt.xlabel("Componentes principales", fontsize=11)
plt.ylabel("Porcentaje de varianza explicada", fontsize=11)
plt.title("Porcentaje de variabilidad explicada por cada componente principal", fontsize=15)
plt.xticks(fontsize=11)
plt.yticks(fontsize=11)

plt.legend(loc='best', fontsize=11)

In [None]:
# Current (eigenvalue, eigenvector) pairs.
eigen_pairs

In [None]:
# Stack the eigenvectors of the first two pairs as the columns of a matrix.
np.stack((eigen_pairs[0][1], 
          eigen_pairs[1][1]), 
         axis=1)

In [None]:
# Projection matrix W: the eigenvectors of the first two pairs as columns.
# reshape(-1, 1) infers the number of rows, so the cell is not tied to
# exactly four features.
W = np.hstack((eigen_pairs[0][1].reshape(-1, 1),
               eigen_pairs[1][1].reshape(-1, 1)))
W

In [None]:
# Shape of the standardized data after multiplying by W.
X_std.dot(W).shape

### 3- Proyectando las variables en el nuevo subespacio vectorial

In [None]:
# Render the projection formula; the matrix dimensions are written into the
# string literally.
display(Math(r'Y = X \cdot W, X \in M(\mathbb R)_{150, 4}, W \in M(\mathbb R)_{4,2}, Y \in M(\mathbb R)_{150, 2}'))

In [None]:
# Project the standardized data through W.
Y = X_std.dot(W)
Y

In [None]:
plt.figure(figsize=(5, 5))

# Draw one scatter series per class label found in y (sorted for a stable
# legend order) instead of a hard-coded list of species names, so every class
# is plotted even if the labels are spelled differently in the data.
for name in sorted(set(y)):
    mask = y == name
    plt.scatter(Y[mask, 0], Y[mask, 1], label=name)

plt.xlabel("Componente Principal 1")
plt.ylabel("Componente Principal 2")

plt.legend()
plt.show()

# Sklearn

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Same reduction using scikit-learn: keep two components and transform the
# standardized data in one call.
acp = PCA(n_components=2)
skY = acp.fit_transform(X_std)

In [None]:
# Fitted components and their explained variance, shown as a tuple.
acp.components_, acp.explained_variance_

In [None]:
plt.figure(figsize=(5, 5))

# Draw one scatter series per class label found in y (sorted for a stable
# legend order) instead of a hard-coded list of species names, so every class
# is plotted even if the labels are spelled differently in the data.
for name in sorted(set(y)):
    mask = y == name
    plt.scatter(skY[mask, 0], skY[mask, 1], label=name)

plt.xlabel("Componente Principal 1")
plt.ylabel("Componente Principal 2")

plt.legend()
plt.show()

In [None]:
import numpy as np
# Start from an empty array.
vec = np.array([])
vec

In [None]:
# np.append takes the array and the values as two separate arguments; the
# original passed a single list and so raised TypeError. No axis is given
# because vec is 1-D (axis=1 does not exist for it), so the values are simply
# appended to the flattened array.
np.append(vec, [1, 2])

In [None]:
# Append one row to a two-row nested list; axis=0 joins along the rows.
base_rows = [[1, 2, 3], [4, 5, 6]]
new_row = [[7, 8, 9]]
vec = np.append(base_rows, new_row, axis=0)

In [None]:
# Append another row along axis 0 and rebind vec to the result.
vec = np.append(vec, [[1, 2, 3]], axis=0)

In [None]:
# Current contents of vec.
vec

In [None]:
# Shape of an array built from two rows of three values.
np.array([[1, 2, 3], [4, 5, 6]]).shape