# validation_train
Este notebook é dedicado à parte de validação (cross-validation / hold-out validation), calculo do threshold ótimo e aplicação nos modelos

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
# load the final dataframe from data_prep 
df_data = pd.read_csv('diabetic_data_df.csv')

In [3]:
# Load columns to use from data_prep
col2use = pd.read_csv('col2use.csv')
col2use = col2use['col2use'].tolist()

In [22]:
df_data['OUTPUT_LABEL']

0        0
1        0
2        0
3        0
4        0
5        1
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       1
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       0
29       0
        ..
99313    0
99314    0
99315    0
99316    0
99317    0
99318    0
99319    1
99320    0
99321    0
99322    0
99323    0
99324    0
99325    0
99326    0
99327    0
99328    0
99329    0
99330    0
99331    0
99332    0
99333    0
99334    0
99335    0
99336    0
99337    0
99338    0
99339    0
99340    0
99341    0
99342    0
Name: OUTPUT_LABEL, Length: 99343, dtype: int64

In [5]:
col2use

['time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'race_Asian',
 'race_Caucasian',
 'race_Hispanic',
 'race_Other',
 'race_UNK',
 'gender_Male',
 'gender_Unknown/Invalid',
 'max_glu_serum_>300',
 'max_glu_serum_None',
 'max_glu_serum_Norm',
 'A1Cresult_>8',
 'A1Cresult_None',
 'A1Cresult_Norm',
 'metformin_No',
 'metformin_Steady',
 'metformin_Up',
 'repaglinide_No',
 'repaglinide_Steady',
 'repaglinide_Up',
 'nateglinide_No',
 'nateglinide_Steady',
 'nateglinide_Up',
 'chlorpropamide_No',
 'chlorpropamide_Steady',
 'chlorpropamide_Up',
 'glimepiride_No',
 'glimepiride_Steady',
 'glimepiride_Up',
 'acetohexamide_Steady',
 'glipizide_No',
 'glipizide_Steady',
 'glipizide_Up',
 'glyburide_No',
 'glyburide_Steady',
 'glyburide_Up',
 'tolbutamide_Steady',
 'pioglitazone_No',
 'pioglitazone_Steady',
 'pioglitazone_Up',
 'rosiglitazone_No',
 'rosiglitazone_Steady',
 'rosigli

In [6]:
# shuffle the samples
df_data = df_data.sample(n = len(df_data), random_state = 42)
df_data = df_data.reset_index(drop = True)

## Funções 

Geral para analises

In [7]:
def calc_prevalence(y_actual):
    return (sum(y_actual)/len(y_actual))

In [8]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

def calc_specificity(y_actual, y_pred, thresh):
    # calculates specificity
    return sum((y_pred < thresh) & (y_actual == 0)) /sum(y_actual ==0)

def print_report(y_actual, y_pred, thresh):
    
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    print('AUC:%.3f'%auc)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('specificity:%.3f'%specificity)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    print(' ')
    return auc, accuracy, recall, precision, specificity 

In [9]:
from sklearn.metrics import classification_report, accuracy_score, \
                            average_precision_score, f1_score, precision_score,\
                            recall_score, roc_auc_score, log_loss, confusion_matrix

def report(y_true, y_pred, prefix):
    accuracy = accuracy_score(y_true, y_pred)
    ap      = average_precision_score(y_true, y_pred)
    f1      = f1_score(y_true, y_pred)
    lloss   = log_loss(y_true, y_pred)
    prec    = precision_score(y_true, y_pred)
    recall  = recall_score(y_true, y_pred)
    auc     = roc_auc_score(y_true, y_pred)
    
    print ('{} Accuracy :{:3}'.format(prefix, accuracy))
    print ('{} AP:       {:3}'.format(prefix, ap))
    print ('{} F1-score :{:3}'.format(prefix, f1))
    print ('{} Log-Loss :{:3}'.format(prefix, lloss))
    print ('{} Precision:{:3}'.format(prefix, prec))
    print ('{} Recall   :{:3}'.format(prefix, recall))
    print ('{} AUC      :{:3}'.format(prefix, auc))
    
    return accuracy

def make_confusion_matrix(y_true, y_pred, names, normalized, prefix='', path=''):
    
    title = 'Confusion Matrix'
    cm = confusion_matrix(y_true, y_pred)
     
    if normalized:
        cm = cm.astype('float')/cm.sum(axis=1)[:, np.newaxis]
        
    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=names, 
           yticklabels=names,
           ylabel='True Label',
           xlabel='Predicted label')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right',
                                   rotation_mode='anchor')
    
    fmt = '.2f' if normalized else 'd'
    thresh = cm.max()/2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha='center', va='center',
                    color='green')#'white' if cm[i,j] > thresh else 'black')
    
    fig.tight_layout()
    
    if path != '':
        plt.savefig(path)
        
    plt.show()   

In [10]:
def get_recall_specificity(y_actual, y_pred, thresh):
    recall = recall_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    return [recall, (1-specificity)]

def get_thresh_opt(y_actual, y_pred):
    dist = []
    for th in np.linspace(0, 1, 100):
        a = get_recall_specificity(y_actual, y_pred, th)
        dist.append([np.sqrt(np.power(a - np.array([0,1]), 2).sum()), th, a[0],a[1]])
    dist = np.array(dist)
    i_th_max = np.argmin(dist[:, 0])
    th_max = dist[i_th_max, 1]
    plt.plot(dist[:, 3], dist[:, 2])
    plt.title('recall x (1-specificity)')
    plt.xlabel('1 - specificity')
    plt.ylabel('recall')
    plt.show()
    print('threshold ótimo: %f' %th_max)
    return th_max

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

#print da matriz de confusao e classification report do sklearn
def get_analysis(y_valid, y_valid_preds, thresh):
    y_valid_preds = y_valid_preds > thresh
    print('SKLearn Classification Report')
    print(classification_report(y_valid, y_valid_preds))
    print('Matriz de Confusão')
    print(confusion_matrix(y_valid, y_valid_preds))

## Validation methods

Fazer aqui o estudo do cross-validation e hold-out validation

In [12]:
# pega 30% dos dados totais e usa como dados para teste e validação 
df_valid_test=df_data.sample(frac=0.30,random_state=42)
print('Tamanho (relativo) dos dados utilizados para teste e treino: %.3f'%(len(df_valid_test)/len(df_data)))

# divide os dados de df_valid_test em 50% teste e 50% validação
df_test = df_valid_test.sample(frac = 0.5, random_state = 42)
df_valid = df_valid_test.drop(df_test.index)

print (len(df_test), len(df_valid))
# usa o restante dos dados como dados de treino
df_train_all=df_data.drop(df_valid_test.index)
print (len(df_train_all))

# verifica prevalência de cada dataframe
#print('Test prevalence(n = %d):%.3f'%(len(df_test),calc_prevalence(df_test.OUTPUT_LABEL.values)))
#print('Valid prevalence(n = %d):%.3f'%(len(df_valid),calc_prevalence(df_valid.OUTPUT_LABEL.values)))
#print('Train all prevalence(n = %d):%.3f'%(len(df_train_all), calc_prevalence(df_train_all.OUTPUT_LABEL.values)))

# verifica se todos os dados foram utilizados
print('all samples (n = %d)'%len(df_data))
assert len(df_data) == (len(df_test)+len(df_valid)+len(df_train_all)),'math didnt work'

Tamanho (relativo) dos dados utilizados para teste e treino: 0.300
14902 14901
69540
all samples (n = 99343)


At this point, you might say, drop the training data into a predictive model and see the outcome. However, if we do this, it is possible that we will get back a model that is 89% accurate. Great! Good job! But wait, we never catch any of the readmissions (recall= 0%). How can this happen?

What is happening is that we have an imbalanced dataset where there are much more negatives than positives, so the model might just assigns all samples as negative.

Typically, it is better to balance the data in some way to give the positives more weight. There are 3 strategies that are typically utilized:

- sub-sample the more dominant class: use a random subset of the negatives
- over-sample the imbalanced class: use the same positive samples multiple times
- create synthetic positive data
Usually, you will want to use the latter two methods if you only have a handful of positive cases. Since we have a few thousand positive cases, let's use the sub-sample approach. Here, we will create a balanced training data set that has 50% positive and 50% negative. You can also play with this ratio to see if you can get an improvement

In [23]:
# criação do dataframe de treino 50% positivo e negativo
# split the training data into positive and negative
rows_pos = df_train_all['OUTPUT_LABEL'] == 1
df_train_pos = df_train_all.loc[rows_pos]
df_train_neg = df_train_all.loc[~rows_pos]

print (np.unique(df_train_all['OUTPUT_LABEL']))
print (len(df_train_all), len(df_train_pos), len(df_train_neg))
# merge the balanced data
df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos), random_state = 42)],axis = 0)

# shuffle the order of training samples 
df_train = df_train.sample(n = len(df_train), random_state = 42).reset_index(drop = True)
print (len(df_train))

#print('Train balanced prevalence(n = %d):%.3f'%(len(df_train), calc_prevalence(df_train.OUTPUT_LABEL.values)))

[0 1]
69540 7883 61657
15766


In [None]:
print(len(df_data.columns))
print(len(col2use))

## Preparação dos dados de treino e validação para os modelos


In [24]:
print (len(df_train))
X_train = df_train[col2use].values
X_train_all = df_train_all[col2use].values
X_valid = df_valid[col2use].values

y_train = df_train['OUTPUT_LABEL'].values
y_valid = df_valid['OUTPUT_LABEL'].values

print('Training All shapes:',X_train_all.shape)
print('Training shapes:',X_train.shape, y_train.shape)
print('Validation shapes:',X_valid.shape, y_valid.shape)
print (X_train[0])

15766
Training All shapes: (69540, 143)
Training shapes: (15766, 143) (15766,)
Validation shapes: (14901, 143) (14901,)
[ 3 53  5 18  0  0  0  9  0  0  0  0  1  1  0  0  1  0  0  1  0  1  0  0
  1  0  0  1  0  0  1  0  0  1  0  0  0  1  0  0  1  0  0  0  1  0  0  1
  0  0  1  0  0  1  0  0  0  0  0  1  0  0  1  0  0  0  0  0  0  1  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  1 60  0]


In [26]:
# normalização dos dados para alguns modelos
from sklearn.preprocessing import StandardScaler

scaler  = StandardScaler()
scaler.fit(X_train_all)

X_train_tf = scaler.transform(X_train)
X_valid_tf = scaler.transform(X_valid)



## Modelos


### Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(max_depth = 6, random_state = 42)
rf.fit(X_train_tf, y_train)

y_train_preds = rf.predict_proba(X_train_tf)[:,1]
y_valid_preds = rf.predict_proba(X_valid_tf)[:,1]

thresh = 0.5

print('Random Forest')
print('Training:')
rf_train_auc, rf_train_accuracy, rf_train_recall, rf_train_precision, rf_train_specificity =print_report(y_train,y_train_preds, thresh)
print('Validation:')
rf_valid_auc, rf_valid_accuracy, rf_valid_recall, rf_valid_precision, rf_valid_specificity = print_report(y_valid,y_valid_preds, thresh)



Random Forest
Training:
AUC:0.681
accuracy:0.631
recall:0.591
precision:0.642
specificity:0.670
prevalence:0.500
 
Validation:
AUC:0.648
accuracy:0.631
recall:0.577
precision:0.169
specificity:0.638
prevalence:0.113
 


In [None]:
#threshold ótimo para os dados obtidos do modelo e analise
thresh_opt = get_thresh_opt(y_valid,y_valid_preds)
get_analysis(y_valid,y_valid_preds, thresh_opt)

In [28]:
report(y_valid, y_valid_preds)
#make_confusion_matrix(y_valid, y_valid_preds)

TypeError: report() missing 1 required positional argument: 'prefix'

### K Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors = 100)
knn.fit(X_train_tf, y_train)

y_train_preds = knn.predict_proba(X_train_tf)[:,1]
y_valid_preds = knn.predict_proba(X_valid_tf)[:,1]

print('KNN')
print('Training:')
knn_train_auc, knn_train_accuracy, knn_train_recall, \
    knn_train_precision, knn_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
knn_valid_auc, knn_valid_accuracy, knn_valid_recall, \
    knn_valid_precision, knn_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

In [None]:
#threshold ótimo para os dados obtidos do modelo e analise
thresh_opt = get_thresh_opt(y_valid,y_valid_preds)
get_analysis(y_valid,y_valid_preds, thresh_opt)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(random_state = 42)
lr.fit(X_train_tf, y_train)

y_train_preds = lr.predict_proba(X_train_tf)[:,1]
y_valid_preds = lr.predict_proba(X_valid_tf)[:,1]

print('Logistic Regression')
print('Training:')
lr_train_auc, lr_train_accuracy, lr_train_recall, \
    lr_train_precision, lr_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

In [None]:
#threshold ótimo para os dados obtidos do modelo e analise
thresh_opt = get_thresh_opt(y_valid,y_valid_preds)
get_analysis(y_valid,y_valid_preds, thresh_opt)

### Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import SGDClassifier
sgdc=SGDClassifier(loss = 'log',alpha = 0.1,random_state = 42)
sgdc.fit(X_train_tf, y_train)

y_train_preds = sgdc.predict_proba(X_train_tf)[:,1]
y_valid_preds = sgdc.predict_proba(X_valid_tf)[:,1]

print('Stochastic Gradient Descend')
print('Training:')
sgdc_train_auc, sgdc_train_accuracy, sgdc_train_recall, sgdc_train_precision, sgdc_train_specificity =print_report(y_train,y_train_preds, thresh)
print('Validation:')
sgdc_valid_auc, sgdc_valid_accuracy, sgdc_valid_recall, sgdc_valid_precision, sgdc_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

In [None]:
#threshold ótimo para os dados obtidos do modelo e analise
thresh_opt = get_thresh_opt(y_valid,y_valid_preds)
get_analysis(y_valid,y_valid_preds, thresh_opt)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train_tf, y_train)

y_train_preds = nb.predict_proba(X_train_tf)[:,1]
y_valid_preds = nb.predict_proba(X_valid_tf)[:,1]

print('Naive Bayes')
print('Training:')
nb_train_auc, nb_train_accuracy, nb_train_recall, nb_train_precision, nb_train_specificity =print_report(y_train,y_train_preds, thresh)
print('Validation:')
nb_valid_auc, nb_valid_accuracy, nb_valid_recall, nb_valid_precision, nb_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

In [None]:
#threshold ótimo para os dados obtidos do modelo e analise
thresh_opt = get_thresh_opt(y_valid,y_valid_preds)
get_analysis(y_valid,y_valid_preds, thresh_opt)

### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc =GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
     max_depth=3, random_state=42)
gbc.fit(X_train_tf, y_train)

y_train_preds = gbc.predict_proba(X_train_tf)[:,1]
y_valid_preds = gbc.predict_proba(X_valid_tf)[:,1]

print('Gradient Boosting Classifier')
print('Training:')
gbc_train_auc, gbc_train_accuracy, gbc_train_recall, gbc_train_precision, gbc_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
gbc_valid_auc, gbc_valid_accuracy, gbc_valid_recall, gbc_valid_precision, gbc_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

In [None]:
#threshold ótimo para os dados obtidos do modelo e analise
thresh_opt = get_thresh_opt(y_valid,y_valid_preds)
get_analysis(y_valid,y_valid_preds, thresh_opt)