In [1]:
# importamos las librerías necesarias para trabajar.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# importamos librerias de scikit learn para clasificacion y modelos de SVM.
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [3]:
# importamos librerías de Logistic Regression
from sklearn.linear_model import LogisticRegression

In [4]:
#Importamos librerias de Feature Selection
from sklearn.feature_selection import RFECV
from sklearn import linear_model
from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC
from sklearn.linear_model import Lasso
from sklearn.linear_model import lasso_path
from sklearn.feature_selection import VarianceThreshold

In [5]:
# Importamos librerias de PCA
from sklearn.decomposition import PCA

In [6]:
# Show the current working directory. The standard library call replaces the
# bare `pwd` automagic, which only works while IPython's automagic is on and
# no variable named `pwd` exists, and is a NameError in a plain Python script.
import os
os.getcwd()

'D:\\Google Drive\\Quinto\\Ciencia de datos\\Clases\\clases\\clase_06'

# Importamos dataset de Wisconsin Breast Cancer

In [7]:
# Load the breast cancer dataset bundled with scikit-learn and split the
# returned object into the feature matrix `x` and the target vector `y`.
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
x, y = data.data, data.target

# Display (n_samples, n_features) as a quick sanity check.
np.shape(x)

(569, 30)

# Separamos nuestros datos en Train y Test

### Para el desafío usamos `test_size=0.9` (el 90 % de los datos queda para test)

In [8]:
# Hold out 90% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.9,
    random_state=4,
)

### Realizamos Auto Scaling

In [9]:
# Auto-scaling: learn the per-feature mean and standard deviation from the
# training split only, so the test split is later scaled with train statistics.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(xtrain)
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [10]:
# Apply the scaling learned on the training split to both splits.
xtrain_scal, xtest_scal = (scaler.transform(split) for split in (xtrain, xtest))

In [11]:
# Shape of the scaled training matrix: (n_train_samples, n_features).
np.shape(xtrain_scal)

(56, 30)

In [12]:
# Shape of the scaled test matrix: (n_test_samples, n_features).
np.shape(xtest_scal)

(513, 30)

### PCA 
Vamos a obtener los primeros autovalores y autovectores (eigenvalues y eigenvectors) de la matriz de train `xtrain_scal`.

In [13]:
from sklearn.decomposition import PCA

# Number of principal components to keep.
n_comps = 10

# Fit PCA on the scaled training split only, so the projection is learned
# without looking at the test split.
pca = PCA(n_components=n_comps)
pca.fit(xtrain_scal)

# Project both splits onto the components learned from the training data.
zpca_train = pca.transform(xtrain_scal)
zpca_test = pca.transform(xtest_scal)

# Eigenvalues and eigenvectors of the training covariance matrix.
# `explained_variance_` holds the eigenvalues themselves, whereas
# `explained_variance_ratio_` (previously stored under `eigenvalues`) is each
# eigenvalue divided by the total variance, so it is kept under its own name.
eigenvalues = pca.explained_variance_
eigenvectors = pca.components_
explained_variance_ratio = pca.explained_variance_ratio_

In [14]:
# Shape of the PCA projection of the training split: the feature dimension
# drops from 30 to the 10 retained components.
np.shape(zpca_train)

(56, 10)

In [15]:
# Shape of the PCA projection of the test split.
np.shape(zpca_test)

(513, 10)

### Definimos el logistic regression

In [16]:
# Define a logistic regression classifier and train it on the PCA-projected
# training data.
lr_pca = LogisticRegression(solver='lbfgs')
lr_pca.fit(X=zpca_train, y=ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# Predict the dependent variable for every test sample from its PCA projection,
# then show how many predictions were produced.
ypred_pca = lr_pca.predict(X=zpca_test)
np.shape(ypred_pca)

(513,)

In [18]:
# Fraction of test samples classified correctly by the PCA model.
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric so
# the value does not change, but the documented order avoids mistakes if this
# call is later swapped for an asymmetric metric.
acc_lr_pca = accuracy_score(ytest, ypred_pca)
print("Score:", acc_lr_pca)

Score: 0.9473684210526315


In [19]:
# Empty comparison table: one row per model with its accuracy score.
result_columns = ['Model', 'SCORE']
results = pd.DataFrame(columns=result_columns)

In [20]:
# Record the PCA accuracy in the comparison table. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, so the new row is built as a
# one-row frame and concatenated instead.
# NOTE: re-running this cell adds a duplicate row; re-create `results` first.
pca_row = pd.DataFrame([{'Model': 'PCA', 'SCORE': acc_lr_pca}])
results = pd.concat([results, pca_row], ignore_index=True)

## KPCA

In [21]:
from sklearn.decomposition import PCA, KernelPCA

In [22]:
# Number of kernel principal components to keep.
n_comps = 10

# Kernel PCA with an RBF kernel, fitted on the scaled training split only.
kpca = KernelPCA(n_components=n_comps, kernel='rbf', gamma=0.1)
kpca.fit(xtrain_scal)

# Project both splits with the mapping learned from the training data.
zkpca_train, zkpca_test = (
    kpca.transform(split) for split in (xtrain_scal, xtest_scal)
)


In [23]:
# Shape of the kernel PCA projection of the training split: the feature
# dimension drops from 30 to the 10 retained components.
np.shape(zkpca_train)

(56, 10)

In [24]:
# Shape of the kernel PCA projection of the test split.
np.shape(zkpca_test)

(513, 10)

### Definimos el logistic regression

In [25]:
# Define a logistic regression classifier and train it on the kernel-PCA
# projection of the training data.
lr_kpca = LogisticRegression(solver='lbfgs')
lr_kpca.fit(X=zkpca_train, y=ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# Predict the dependent variable for every test sample from its kernel-PCA
# projection, then show how many predictions were produced.
ypred_kpca = lr_kpca.predict(X=zkpca_test)
np.shape(ypred_kpca)

(513,)

In [27]:
# Fraction of test samples classified correctly by the kernel-PCA model.
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric so
# the value does not change, but the documented order avoids mistakes if this
# call is later swapped for an asymmetric metric.
acc_lr_kpca = accuracy_score(ytest, ypred_kpca)
print("Score:", acc_lr_kpca)

Score: 0.9239766081871345


In [28]:
# Record the kernel-PCA accuracy in the comparison table. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, so the new row is built as a
# one-row frame and concatenated instead.
# NOTE: re-running this cell adds a duplicate row; re-create `results` first.
kpca_row = pd.DataFrame([{'Model': 'KPCA', 'SCORE': acc_lr_kpca}])
results = pd.concat([results, kpca_row], ignore_index=True)

In [29]:
# Display the accuracy comparison table (one row per model).
results

Unnamed: 0,Model,SCORE
0,PCA,0.947368
1,KPCA,0.923977


In [38]:
# ROC curve and area under it for the PCA model, scored with the second
# predict_proba column (the probability assigned to class 1).
# The area is stored as `auc_pca` rather than `auc` so the `auc` function
# imported from sklearn.metrics is not overwritten, and the curve arrays carry
# a model-specific suffix so another model's curve cannot clobber them.
yproba_pca = lr_pca.predict_proba(zpca_test)
fpr_pca, tpr_pca, thresholds_pca = roc_curve(
    ytest.astype('int'),
    yproba_pca[:, 1],
    drop_intermediate=False,
)
auc_pca = metrics.auc(fpr_pca, tpr_pca)
print('El área bajo la curva del PCA es: ' + str(auc_pca))

El área bajo la curva del PCA es: 0.9861519367692207


In [39]:
# ROC curve and area under it for the kernel-PCA model, scored with the second
# predict_proba column (the probability assigned to class 1).
# The area is stored as `auc_kpca` rather than `auc` so the `auc` function
# imported from sklearn.metrics is not overwritten, and the curve arrays carry
# a model-specific suffix so they do not replace another model's curve.
yproba_kpca = lr_kpca.predict_proba(zkpca_test)
fpr_kpca, tpr_kpca, thresholds_kpca = roc_curve(
    ytest.astype('int'),
    yproba_kpca[:, 1],
    drop_intermediate=False,
)
auc_kpca = metrics.auc(fpr_kpca, tpr_kpca)
print('El área bajo la curva del KPCA es: ' + str(auc_kpca))

El área bajo la curva del KPCA es: 0.9772356130380823
