# **PCA**

## Import data

In [24]:
import pandas as pd                  
import numpy as np  
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.preprocessing import scale
from sklearn import decomposition

In [25]:
# Load the merged training set and discard the two identifier columns,
# then keep a handle on the target column.
df = (
    pd.read_csv('/content/total_merged_train.csv')
    .drop(columns=['customerid', 'systemloanid'])
)
y = df['good_bad_flag']

**Dummies**

In [26]:
# One-hot encode the categorical columns.
# interest_interval and age_interval keep every level; the remaining columns
# drop their first level (drop_first=True).
# NOTE(review): keeping all levels of a categorical makes its indicator columns
# sum to 1 (perfectly collinear); presumably this is why the covariance matrix
# later shows two ~0 eigenvalues. Confirm whether that is intended.
dummy_int = pd.get_dummies(df[['interest_interval']])
dummy_bank_name = pd.get_dummies(df[['bank_name_clients']], drop_first=True)
dummy_td = pd.get_dummies(df[['termdays']], drop_first=True)
dummy_age = pd.get_dummies(df[['age_interval']])
dummy_state = pd.get_dummies(df[['state']], drop_first=True)
dummy_loan = pd.get_dummies(df[['loan_interval']], drop_first=True)
dummy_loannum = pd.get_dummies(df[['loannumber']], drop_first=True)

# Append the indicator columns, then remove the original categorical columns.
df = pd.concat([df, dummy_int, dummy_bank_name, dummy_td, dummy_age, dummy_state, dummy_loan, dummy_loannum], axis=1)
# Use `columns=` rather than a positional axis: pandas 2.0 made every drop()
# argument except `labels` keyword-only, so `df.drop(labels, 1)` raises TypeError
# there (and emitted a FutureWarning on earlier versions).
df = df.drop(columns=['interest_interval', 'bank_name_clients', 'termdays', 'age_interval', 'state', 'loan_interval', 'loannumber'])

df.head()

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,loanamount,interest,good_bad_flag,avg_loanamount,avg_paym_span_hrs,avg_active_span_days,avg_interest,max_loannumber,age,interest_interval_3000,interest_interval_mas de 3000,interest_interval_menos de 3000,bank_name_clients_Multinational Banks,termdays_Other,age_interval_26-35,age_interval_36-40,age_interval_41-61,state_Other,loan_interval_mas de 10000,loannumber_0-4
0,20000.0,4500.0,1,14000.0,12.8,19.0,2700.0,5.0,39.0,0,1,0,1,0,0,1,0,1,1,0
1,10000.0,3000.0,1,10000.0,13.0,29.0,3000.0,2.0,42.0,1,0,0,1,0,0,0,1,1,0,1
2,10000.0,3000.0,0,10000.0,7.0,57.0,3000.0,1.0,45.0,1,0,0,0,0,0,0,1,1,0,1
3,10000.0,3000.0,1,10000.0,18.0,14.0,1500.0,1.0,40.0,1,0,0,1,0,0,1,0,0,0,1
4,20000.0,4500.0,1,14000.0,9.0,19.6,2700.0,5.0,37.0,0,1,0,1,0,0,1,0,0,1,0


## Aplicamos el análisis de componentes principales 

**Estandarizamos**

In [27]:
from sklearn.preprocessing import StandardScaler

# Standardize every predictor (all columns except the target), then print the
# overall mean and variance of the scaled matrix as a sanity check.
sc = StandardScaler()
predictors = df.drop(columns='good_bad_flag')
X = sc.fit_transform(predictors)
print(X.mean())
print(X.var())

-3.2165860061821575e-17
0.9999999999999999


In [28]:
# Covariance matrix of the standardized features. np.cov treats each row as a
# variable by default, hence the transpose.
# The diagonal elements are all equal and the matrix is symmetric.
features = np.transpose(X)
cov_matrix = np.cov(features)
cov_matrix[:3, :]

array([[ 1.0011919 ,  0.66232352,  0.89297155, -0.04292662, -0.00216285,
         0.51145942,  0.86192984,  0.01130977, -0.52803206,  0.75242152,
        -0.24558433,  0.00565238, -0.03159391,  0.07903358, -0.02530674,
        -0.05713395, -0.03431531,  0.82125899, -0.72919884],
       [ 0.66232352,  1.0011919 ,  0.50125701, -0.02622892,  0.14682119,
         0.52412119,  0.47737314,  0.07783364, -0.17678618,  0.76540745,
        -0.64059525,  0.05186283, -0.18042357,  0.01434692, -0.01811273,
         0.00321714,  0.00690335,  0.62960596, -0.51918554],
       [ 0.89297155,  0.50125701,  1.0011919 , -0.03070116,  0.06893252,
         0.59620147,  0.86403831, -0.00638712, -0.40871052,  0.56235192,
        -0.16829517, -0.00997013, -0.0244937 ,  0.07690971, -0.03106334,
        -0.04912867, -0.04191722,  0.64245864, -0.6857781 ]])

**Varianza**

In [29]:
# Share of the total variance explained by each principal component.
#
# np.linalg.eigh is the routine for symmetric matrices such as a covariance
# matrix: it always returns real eigenvalues, whereas np.linalg.eig gives no
# ordering guarantee and can return complex output from round-off.
# eigh returns eigenvalues in ascending order, so re-order to put the
# largest-variance component first; column i of `vectors` pairs with values[i].
values, vectors = np.linalg.eigh(cov_matrix)
descending = np.argsort(values)[::-1]
values, vectors = values[descending], vectors[:, descending]

# Kept as a plain list of floats so the printed output stays a simple list.
explained_variances = (values / values.sum()).tolist()

print(np.sum(explained_variances), '\n', explained_variances)

# The first three components explain roughly 54% of the variance;
# the first six explain roughly 74%.

0.9999999999999998 
 [0.3154431165810269, 0.1408874075141413, 0.0820256344388089, 0.07736130029575279, 0.06666100816035446, 0.060506890850043134, 0.05246644334831758, 0.049058821342015155, 0.04311688804816982, 0.038610282575631844, 0.027440766246218985, 0.017983117345699955, 0.011976505301191686, 0.007230701905388631, 0.004386304045569257, 0.002225147567352932, 0.0026196644343163436, -5.783985094830804e-17, 1.1793848678530997e-16]


**Modelo**

In [30]:
# Fit a PCA that keeps six components and store the transformed data in `x`.
N_COMPONENTS = 6
pca = decomposition.PCA(n_components=N_COMPONENTS)
x = pca.fit_transform(X)


**Saturación de variables**

In [31]:
# Weight of each input variable on each of the six components.
# pca.components_ is transposed so that rows are variables and columns are
# components.
pc_labels = ['PC{}'.format(k) for k in range(1, 7)]
variable_names = df.drop(columns='good_bad_flag').columns
loadings = pd.DataFrame(pca.components_.T, index=variable_names, columns=pc_labels)
loadings

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6
loanamount,0.383484,0.072383,-0.000416,0.002325,0.06461,-0.017629
interest,0.311376,-0.187052,-0.043391,-0.056342,-0.078571,0.018897
avg_loanamount,0.347385,0.055424,0.058631,0.017768,0.226077,-0.03892
avg_paym_span_hrs,-0.025437,-0.015041,0.044805,-0.113818,0.216576,0.297775
avg_active_span_days,0.034922,-0.328916,0.226249,0.055112,0.505425,0.057381
avg_interest,0.238668,-0.263433,0.160229,-0.006796,0.428157,0.029337
max_loannumber,0.353554,0.093011,0.027542,0.040781,0.117375,-0.020102
age,0.008751,-0.041384,-0.018218,0.12417,0.067889,0.489523
interest_interval_3000,-0.213503,-0.454138,0.054893,-0.090541,0.017781,-0.099513
interest_interval_mas de 3000,0.354867,-0.01941,-0.101832,-0.024844,-0.231496,0.015996


In [32]:
# For each variable (row), highlight the component on which its absolute
# weight is largest, to see more clearly which component it belongs to.
abs_loadings = loadings.abs()
abs_loadings.style.highlight_max(axis=1, color='lightgreen')

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6
loanamount,0.383484,0.072383,0.000416,0.002325,0.06461,0.017629
interest,0.311376,0.187052,0.043391,0.056342,0.078571,0.018897
avg_loanamount,0.347385,0.055424,0.058631,0.017768,0.226077,0.03892
avg_paym_span_hrs,0.025437,0.015041,0.044805,0.113818,0.216576,0.297775
avg_active_span_days,0.034922,0.328916,0.226249,0.055112,0.505425,0.057381
avg_interest,0.238668,0.263433,0.160229,0.006796,0.428157,0.029337
max_loannumber,0.353554,0.093011,0.027542,0.040781,0.117375,0.020102
age,0.008751,0.041384,0.018218,0.12417,0.067889,0.489523
interest_interval_3000,0.213503,0.454138,0.054893,0.090541,0.017781,0.099513
interest_interval_mas de 3000,0.354867,0.01941,0.101832,0.024844,0.231496,0.015996


**Nuevo dataframe de los componentes**

In [33]:
# Build a dataframe with the six component scores and the target variable.

N_COMPONENTS = 6

# Pick the eigenvectors of the N_COMPONENTS largest eigenvalues explicitly
# instead of relying on the order in which the decomposition returned them.
# np.real is a no-op for real input and guards against a complex-typed result.
top = np.argsort(np.real(values))[::-1][:N_COMPONENTS]
top_vectors = np.real(vectors[:, top])

# One matrix product projects the standardized data onto all components at
# once; column k of the result is the score on component k + 1.
scores = X.dot(top_vectors)

res = pd.DataFrame(scores, columns=['PC{}'.format(k) for k in range(1, N_COMPONENTS + 1)])
# Assign the target by position: the rows of X follow the row order of df, so
# the labels must not be re-aligned on y's index.
res['Y'] = y.values
res.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,Y
0,-2.033963,0.112226,1.800107,1.359615,0.576436,-0.67036,1
1,1.691348,2.347953,0.030756,-1.133771,-0.149269,-0.901944,1
2,1.692131,3.287265,-0.897788,-1.633058,-2.054236,0.453115,0
3,2.295393,0.749977,1.688313,1.321681,0.591209,0.387908,1
4,-2.083365,-0.037694,1.908369,0.809039,0.753411,0.740301,1


## Extraer CSV

In [23]:
# Write the component scores and the target column to CSV, without the index.
res.to_csv('PCA6_train.csv', index=False)

In [34]:
# Repeat the whole process with the test CSV (change the CSV loaded at the top of the notebook).
# NOTE(review): as written, this line exports the same `res` object that the
# train export writes; unless the notebook is re-run with the test file loaded,
# PCA6_test.csv will contain the training data.
res.to_csv('PCA6_test.csv', index=False)