# Temas: Clasificación
<br/><br/>

<center>
    
## Taller de Ciencia de Datos
### Omar Piña Ramírez
### Instituto Nacional de Perinatología
### Departamento de Bioinformática y Análisis Estadísticos
### Investigador en Ciencias Médicas
### delozath@gmail.com
</center>

In [None]:
%%html
<style>
/* Let output areas size to their content, capped at a fixed maximum height. */
.output_wrapper, .output {
    height:auto !important;
    max-height:1500px;  /* your desired max-height here */
}
/* Remove the shadow on scrolling output areas. The vendor-prefixed property
   needs its leading hyphen; without it the declaration is not a valid property. */
.output_scroll {
    box-shadow:none !important;
    -webkit-box-shadow:none !important;
}
/* Font size of the code editor. */
.CodeMirror{
    font-size: 15px;
}

/* Enlarge the text of rendered tables. */
.rendered_html table, .rendered_html td, .rendered_html th {
    font-size: 120%;
}
</style>

In [None]:
import numpy   as np
import pandas  as pd
import seaborn as sns

from   matplotlib import pyplot as plt

import ipywidgets as widgets
from   ipywidgets import interact, interact_manual, FloatSlider, Layout

import chart_studio.plotly as py
import plotly.graph_objs   as go
import plotly.express      as px
from   plotly.offline      import iplot, init_notebook_mode

import cufflinks
# Put cufflinks in offline mode; the DataFrame/Series `.iplot` calls used later
# in this notebook come from cufflinks.
# NOTE(review): connected=True presumably loads plotly.js from the CDN rather
# than embedding it in the notebook — confirm against the installed versions.
cufflinks.go_offline(connected=True)
# Initialise plotly's offline notebook mode with the same `connected` setting.
init_notebook_mode (connected=True)

## Aprendizaje supervisado -> Clasificación

### Datos sintéticos de archivo

In [None]:
# Directory holding the data files, and the name of the synthetic data set.
PATH = './data/'
file = 'blobs.csv'

# Read the CSV into a DataFrame; the bare name on the last line makes the
# notebook display it.
df_blobs = pd.read_csv(f'{PATH}{file}')
df_blobs

In [None]:
# Store the class column as integers (the metric calls further down compare
# against pos_label=0), then display the frame.
df_blobs = df_blobs.astype({'Label': 'int'})
df_blobs

In [None]:
# Shared plotly-express styling: square canvas and one fixed colour per class.
params = {'width':700, 'height':700, 'color_discrete_sequence':['black','orange']}
@interact
def scatter_plot(n_samples=(50,5000,100)):
    """Scatter the first ``n_samples`` rows of ``df_blobs``, coloured by class.

    Parameters
    ----------
    n_samples : int
        Number of leading rows to draw. The ``(min, max, step)`` tuple default
        is the ipywidgets abbreviation that ``@interact`` turns into a slider.
    """
    # head() is positional and returns exactly n_samples rows; the label-based
    # slice .loc[:n_samples] is end-inclusive and returned one row too many.
    subset = df_blobs.head(n_samples)
    # plotly express applies color_discrete_sequence only to non-numeric
    # columns; with integer labels it would draw a continuous colour bar and
    # ignore the black/orange palette, so the labels are drawn as strings.
    subset = subset.assign(Label=subset['Label'].astype(str))
    fig = px.scatter(subset, x='X0', y='X1', color='Label', **params)
    fig.update_traces(marker={'size': 9})
    fig.show()

### 1. Scrubbing

### 2. Prevalencia de clases

In [None]:
@interact
def hist_plot(x=df_blobs.columns):
    """Draw an interactive histogram of the ``df_blobs`` column picked in the widget.

    Parameters
    ----------
    x : str
        Column name; ``@interact`` builds the selector from ``df_blobs.columns``.
    """
    column = df_blobs[x]
    column.iplot(
        kind='hist',
        x=x,
        xTitle=x.title(),
        title='Histograma',
    )

### 3. Split: Train-Test-Validation

In [None]:
# Fraction of rows used for model development (tune); the rest is held out.
NT    = 0.8
NV    = 1 - NT
N     = df_blobs.shape[0]
# A shuffled vector of row positions gives a random partition of the rows.
index = np.arange(N)
np.random.shuffle(index)

# Cut the permutation at a single point so the two subsets are disjoint and
# together cover every row. Slicing the tail with -int(NV*N) is unsafe:
# 1 - 0.8 evaluates to 0.19999999999999996, so int(NV*N) can come out one row
# short (N=10 gives 1), and when it is 0 (N=5) index[-0:] returns every row,
# putting the tune rows into the validation set.
n_tune   = int(NT*N)
tune     = index[:n_tune]
validate = index[n_tune:]

In [None]:
# Model-development (tune) rows: the two feature columns and the class column.
tune_rows = df_blobs.iloc[tune]
X = tune_rows[['X0','X1']]
y = tune_rows[['Label']]

# Held-out validation rows, with the same column layout.
validation_rows = df_blobs.iloc[validate]
X_Val = validation_rows[['X0','X1']]
y_Val = validation_rows[['Label']]

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.svm             import SVC
from sklearn.metrics         import recall_score as sensitivity

# k random train/test partitions of the tune set, each holding out 25 %.
k     = 10
sf    = ShuffleSplit(n_splits=k,test_size=.25)
model = SVC(kernel='linear')

# Per-split scores. Sensitivity is recall of class 1 (recall_score's default
# pos_label); specificity is recall of class 0.
sen_perform = []
spe_perform = []
for train,test in sf.split(X,y):
    # ravel() flattens the label values into a 1-D array for fit().
    model.fit(X.iloc[train],y.iloc[train].values.ravel())
    L   = model.predict(X.iloc[test])
    sen = sensitivity(y.iloc[test],L)
    spe = sensitivity(y.iloc[test],L,pos_label=0)
    # Both values use three decimals; the specificity field was written
    # "{:3f}" (width 3, default six decimals) instead of "{:.3f}".
    print("Sensitivity: {:.3f}, Specificity: {:.3f}".format(sen,spe))
    sen_perform.append(sen)
    spe_perform.append(spe)

# Summarise the k splits as mean ± standard deviation.
sen_perform = np.array(sen_perform)
spe_perform = np.array(spe_perform)
print("\nPerformance")
print("Sensitivity: {:.3f} ± {:.3f}".format(sen_perform.mean(),sen_perform.std()))
print("Specificity: {:.3f} ± {:.3f}".format(spe_perform.mean(),spe_perform.std()))

In [None]:
# Fresh linear SVM trained on the complete tune partition; this is the model
# scored on the held-out validation rows afterwards.
model = SVC(kernel='linear')

model.fit(X, y.to_numpy().ravel())

In [None]:
# Score the fitted model on the held-out validation rows.
L   = model.predict(X_Val)
sen = sensitivity(y_Val,L)              # recall of class 1
spe = sensitivity(y_Val,L,pos_label=0)  # recall of class 0

# "{:.3f}" for both values; the original "{:3f}" set a width of 3 and printed
# six decimals for the specificity.
print("Sensitivity: {:.3f}, Specificity: {:.3f}".format(sen,spe))

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# ConfusionMatrixDisplay.from_estimator takes the same (estimator, X, y, cmap=...)
# arguments, so it is bound to the same alias; older releases that lack it fall
# back to the original function.
try:
    from sklearn.metrics import ConfusionMatrixDisplay
    plt_cm = ConfusionMatrixDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_confusion_matrix as plt_cm

# Confusion matrix of the fitted model on the held-out validation rows.
plt_cm(model,X_Val,y_Val,cmap='inferno')
plt.show()

### Datos Infarto

In [None]:
# Second data set, read from the same PATH directory as the blobs file.
file = 'cardiovascular.csv'

data  = pd.read_csv(f'{PATH}{file}')

### Scrubbing

In [None]:
# Encode the textual family-history flag as 1/0 and discard the 'ind' column
# (presumably a row identifier — confirm against the CSV).
replace = {'famhist':{'Present':1,'Absent':0}}
data    = data.drop(columns='ind').replace(replace)
data

In [None]:
# Fraction of rows used for model development (tune); the rest is held out.
NT    = 0.8
NV    = 1 - NT
N     = data.shape[0]
# A shuffled vector of row positions gives a random partition of the rows.
index = np.arange(N)
np.random.shuffle(index)

# Cut the permutation at a single point so the two subsets are disjoint and
# together cover every row. Slicing the tail with -int(NV*N) is unsafe:
# 1 - 0.8 evaluates to 0.19999999999999996, so int(NV*N) can come out one row
# short, and when it is 0 index[-0:] returns every row.
n_tune   = int(NT*N)
tune     = index[:n_tune]
validate = index[n_tune:]

In [None]:
# Predictor columns and the name of the outcome column.
cols = ['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'alcohol', 'age']
lab  = 'chd'

# Model-development (tune) rows.
tune_rows = data.iloc[tune]
X    = tune_rows[cols]
y    = tune_rows[lab]

# Held-out validation rows.
validation_rows = data.iloc[validate]
X_Val = validation_rows[cols]
y_Val = validation_rows[lab]

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.svm             import SVC
from sklearn.metrics         import recall_score as sensitivity

# k random train/test partitions of the tune set, each holding out 25 %.
k     = 10
sf    = ShuffleSplit(n_splits=k,test_size=.25)
model = SVC(kernel='linear')

# Per-split scores. Sensitivity is recall of class 1 (recall_score's default
# pos_label); specificity is recall of class 0.
sen_perform = []
spe_perform = []
for train,test in sf.split(X,y):
    model.fit(X.iloc[train],y.iloc[train].values.ravel())
    L   = model.predict(X.iloc[test])
    sen = sensitivity(y.iloc[test],L)
    spe = sensitivity(y.iloc[test],L,pos_label=0)
    # Both values use three decimals; the specificity field was written
    # "{:3f}" (width 3, default six decimals) instead of "{:.3f}".
    print("Sensitivity: {:.3f}, Specificity: {:.3f}".format(sen,spe))
    sen_perform.append(sen)
    spe_perform.append(spe)

# Summarise the k splits as mean ± standard deviation.
sen_perform = np.array(sen_perform)
spe_perform = np.array(spe_perform)
print("\nPerformance")
print("Sensitivity: {:.3f} ± {:.3f}".format(sen_perform.mean(),sen_perform.std()))
print("Specificity: {:.3f} ± {:.3f}".format(spe_perform.mean(),spe_perform.std()))

In [None]:
# Fresh linear SVM trained on the complete tune partition; this is the model
# scored on the held-out validation rows afterwards.
model = SVC(kernel='linear')

model.fit(X, y.to_numpy().ravel())

In [None]:
# Score the fitted model on the held-out validation rows.
L   = model.predict(X_Val)
sen = sensitivity(y_Val,L)              # recall of class 1
spe = sensitivity(y_Val,L,pos_label=0)  # recall of class 0

# "{:.3f}" for both values; the original "{:3f}" set a width of 3 and printed
# six decimals for the specificity.
print("Sensitivity: {:.3f}, Specificity: {:.3f}".format(sen,spe))

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# ConfusionMatrixDisplay.from_estimator takes the same (estimator, X, y, cmap=...)
# arguments, so it is bound to the same alias; older releases that lack it fall
# back to the original function.
try:
    from sklearn.metrics import ConfusionMatrixDisplay
    plt_cm = ConfusionMatrixDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_confusion_matrix as plt_cm

# Confusion matrix of the fitted model on the held-out validation rows.
plt_cm(model,X_Val,y_Val,cmap='inferno')
plt.show()

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.svm             import SVC
from sklearn.ensemble        import RandomForestClassifier
from sklearn.neural_network  import MLPClassifier
from sklearn.metrics         import recall_score as sensitivity
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# ConfusionMatrixDisplay.from_estimator takes the same (estimator, X, y, cmap=...)
# arguments, so it is bound to the same alias; older releases fall back to the
# original function.
try:
    from sklearn.metrics import ConfusionMatrixDisplay
    plt_cm = ConfusionMatrixDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_confusion_matrix as plt_cm

# k random train/test partitions of the tune set, each holding out 25 %.
k       = 10
sf      = ShuffleSplit(n_splits=k,test_size=.25)
# Candidate hyper-parameter values offered by the widget.
n_estim = [200, 500,1000,1500,2000,2500]
gamma   = [0.0001,0.0005,0.001,0.01,0.1]

@interact
def classifiers(classifier=['SVM','RSVM','RF','MLP'],n_estim=n_estim,gamma=gamma):
    """Cross-validate the selected classifier, then score it on the validation set.

    The list defaults are ipywidgets abbreviations: ``@interact`` turns each
    one into a drop-down and calls the function with the selected element.

    Parameters
    ----------
    classifier : str
        'SVM' (linear SVM), 'RSVM' (RBF-kernel SVM; 'RBF' is accepted as an
        alias), 'RF' (random forest); any other value selects the multilayer
        perceptron.
    n_estim : int
        Number of trees; used only by the random forest.
    gamma : float
        RBF kernel coefficient; used only by the RBF SVM.

    Notes
    -----
    Reads the notebook globals ``X``, ``y``, ``X_Val``, ``y_Val`` and ``sf``.
    Prints the per-split and summary sensitivity/specificity, then draws the
    validation confusion matrix titled with the classifier's name.
    """
    if classifier=='SVM':
        model = SVC(kernel='linear')
        title = 'SVM Linear'
    elif classifier=='RF':
        model = RandomForestClassifier(n_estimators=n_estim)
        title = 'Random Forest'
    elif classifier in ('RSVM','RBF'):
        # The drop-down offers 'RSVM' while this branch tested only 'RBF', so
        # it was unreachable and 'RSVM' silently trained the MLP instead.
        model = SVC(kernel='rbf',gamma=gamma)
        title = 'SVM RBF'
    else:
        model = MLPClassifier(max_iter=1000)
        title = 'Perceptron multicapa'

    # Per-split scores. Sensitivity is recall of class 1; specificity is
    # recall of class 0.
    sen_perform = []
    spe_perform = []
    for train,test in sf.split(X,y):
        model.fit(X.iloc[train],y.iloc[train].values.ravel())
        L   = model.predict(X.iloc[test])
        sen = sensitivity(y.iloc[test],L)
        spe = sensitivity(y.iloc[test],L,pos_label=0)
        # "{:.3f}" for both values (the original "{:3f}" printed six decimals).
        print("Sensitivity: {:.3f}, Specificity: {:.3f}".format(sen,spe))
        sen_perform.append(sen)
        spe_perform.append(spe)

    # Summarise the k splits as mean ± standard deviation.
    sen_perform = np.array(sen_perform)
    spe_perform = np.array(spe_perform)
    print("\nPerformance")
    print("Sensitivity: {:.3f} ± {:.3f}".format(sen_perform.mean(),sen_perform.std()))
    print("Specificity: {:.3f} ± {:.3f}".format(spe_perform.mean(),spe_perform.std()))

    # Refit on the whole tune partition before scoring the held-out rows.
    model.fit(X,y.values.ravel())

    L   = model.predict(X_Val)
    sen = sensitivity(y_Val,L)
    spe = sensitivity(y_Val,L,pos_label=0)

    print("Sensitivity: {:.3f}, Specificity: {:.3f}".format(sen,spe))

    # Both plotting entry points return a ConfusionMatrixDisplay; its axes
    # carry the classifier name so the figure is self-describing.
    display = plt_cm(model,X_Val,y_Val,cmap='inferno')
    display.ax_.set_title(title)
    plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Fit a 1000-tree random forest on the tune partition and tabulate its
# per-feature importances next to the column names, in ascending order.
model = RandomForestClassifier(n_estimators=1000).fit(X, y.to_numpy().ravel())
df    = pd.DataFrame(
    {'Importancia': model.feature_importances_, 'Variable': cols}
).sort_values(by='Importancia')

# Horizontal bar chart, one bar per variable.
plt.figure(figsize=(12, 7))
sns.barplot(data=df, x='Importancia', y='Variable', color='orange')
plt.grid(True)
plt.show()