## Import modules

In [1]:
# Installs joblib through the shell. The captured output below shows that
# joblib 0.13.2 was resolved when this cell was last run.
# NOTE(review): no visible cell imports joblib directly — presumably nhanes.py
# needs it; confirm, and drop this cell if it does not.
! pip install joblib

Collecting joblib
[?25l  Downloading https://files.pythonhosted.org/packages/cd/c1/50a758e8247561e58cb87305b1e90b171b8c767b15b12a1734001f41d356/joblib-0.13.2-py2.py3-none-any.whl (278kB)
[K    100% |████████████████████████████████| 286kB 20.6MB/s 
[31mfastai 1.0.52 requires nvidia-ml-py3, which is not installed.[0m
[31mthinc 6.12.1 has requirement msgpack<0.6.0,>=0.5.6, but you'll have msgpack 0.6.0 which is incompatible.[0m
[?25hInstalling collected packages: joblib
Successfully installed joblib-0.13.2
[33mYou are using pip version 10.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [20]:
import pdb
import glob
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

import importlib
import nhanes as nhanes
# Re-execute nhanes.py so edits to the module are picked up without restarting the kernel.
importlib.reload(nhanes)


# Select matplotlib's interactive notebook backend for the figures drawn below.
%matplotlib notebook

# Print the module object to show which nhanes.py file was actually loaded.
print(nhanes)

<module 'nhanes' from '/home/ec2-user/SageMaker/nhanes.py'>


In [3]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels; both are passed straight to
        ``sklearn.metrics.confusion_matrix``.
    classes : sequence
        Tick labels for both axes. ``confusion_matrix`` is called without an
        explicit label list, so it uses the sorted labels found in the data;
        ``classes`` must therefore contain one name per such label, in that order.
    normalize : bool, default False
        When True, every row is divided by its row sum (the number of true
        samples of that class). Rows with no true samples are left as zeros
        instead of turning into NaN.
    title : str or None
        Figure title; a default depending on ``normalize`` is used when empty.
    cmap : matplotlib colormap
        Colormap for the matrix image.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        # A label that only occurs in y_pred has an all-zero row; dividing it by
        # its sum would yield NaN and break cm.max() / the text-colour threshold
        # further down, so those rows are skipped and stay at 0.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell with its value; cells darker than half the maximum
    # get white text so the number stays readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

## Settings

In [4]:
# Location of the NHANES data files; passed to nhanes.Dataset in the loading cell below.
# DATA_PATH = '/home/orpgol/OpportunisticLearning/CDC/NHANES/'
DATA_PATH = 'CDC/NHANES/'
# Name of the dataset under study. No visible cell reads this constant — the
# loading cell calls ds.load_arthritis() directly.
DATASET = 'arthritis'

### Note: 
The code below loads the dataset into two arrays: `dataset_features_og` (features) and `dataset_targets` (targets).

Here, all datasets are defined explicitly (see nhanes.py).

In [52]:
# Pick up any edits to nhanes.py, then load the arthritis data from DATA_PATH.
importlib.reload(nhanes)
ds = nhanes.Dataset(DATA_PATH)
ds.load_arthritis()

# n_fe is the number of feature columns; n_classes is fixed at 2 here.
n_classes = 2
n_fe = ds.features.shape[1]

# Keep only the rows whose target value is not 3.
indx = np.argwhere(ds.targets != 3)
rows_kept = indx.flatten()
dataset_features_og, dataset_targets = ds.features[rows_kept], ds.targets[rows_kept]

# Report the container type of both arrays.
for loaded in (dataset_features_og, dataset_targets):
    print(type(loaded))

--------------------------------------------------
Processing: IMQ_H.XPT(92062, 199)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [None]:
# importlib.reload(nhanes)
# print((dataset_features.shape))
# print((dataset_targets.shape))

In [64]:
# Start again from the unreduced features. This binds a second name to the same
# array object; no copy is made.
dataset_features = dataset_features_og

In [47]:
# pca = PCA(n_components=25)
# dataset_features = copy.deepcopy(dataset_features_og)
# dataset_features = pca.fit_transform(dataset_features)

In [65]:
from sklearn import decomposition

# Reduce the features to n_comp components with a truncated SVD and print the
# fraction of variance those components explain.
# NOTE(review): the SVD is fitted on every row, before the train/test split
# further down, so test rows influence the projection — consider fitting on
# the training rows only.
n_comp = 10
svd = decomposition.TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd.fit(dataset_features_og)
print(svd.explained_variance_ratio_.sum())
# transform() returns a new array, so the original features can be passed in
# directly; a deep copy of the full matrix beforehand only costs memory.
dataset_features = svd.transform(dataset_features_og)
# test_features = svd.transform(test_features)

0.3533013334279683


## Train/Test Separation

In [66]:
importlib.reload(nhanes)
# Seed NumPy's global generator so that the shuffle below, and the
# np.random.permutation calls inside get_batch, give the same split on every
# Restart & Run All.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)
# Shuffle features and targets with one shared permutation so rows stay aligned.
perm = np.random.permutation(dataset_targets.shape[0])
dataset_features = dataset_features[perm]
dataset_targets = dataset_targets[perm]

def get_batch(n_size, phase):
    """
    Draw a random batch from one split of the module-level dataset.

    The rows of ``dataset_features`` / ``dataset_targets`` are divided by
    position: the first 15% form the test split, the next 15% the validation
    split and the remaining rows the training split.

    Parameters
    ----------
    n_size : int
        Requested batch size. Each class (0 .. max target) contributes at most
        ``n_size // n_classes`` rows; a class with fewer rows in the split
        contributes all it has, so the batch may be smaller than ``n_size``
        and is then not class-balanced.
    phase : str
        One of ``'train'``, ``'validation'`` or ``'test'``.

    Returns
    -------
    tuple
        ``(features, targets)`` for the selected rows, in random order.

    Raises
    ------
    NotImplementedError
        If ``phase`` is not one of the three supported names.
    """
    n_samples = dataset_features.shape[0]
    n_classes = int(dataset_targets.max() + 1)

    # Split boundaries, computed once instead of per branch.
    test_end = int(n_samples*0.15)
    validation_end = int(n_samples*0.30)
    if phase == 'test':
        inds_sel = np.arange(0, test_end, 1)
    elif phase == 'validation':
        inds_sel = np.arange(test_end, validation_end, 1)
    elif phase == 'train':
        inds_sel = np.arange(validation_end, n_samples, 1)
    else:
        raise NotImplementedError(
            "unknown phase %r; expected 'train', 'validation' or 'test'" % (phase,))

    # Randomise the order within the split so each call samples different rows.
    inds_sel = np.random.permutation(inds_sel)
    batch_inds = []
    for cl in range(n_classes):
        inds_cl = inds_sel[dataset_targets[inds_sel] == cl]
        batch_inds.extend(inds_cl[:n_size//n_classes])
    # Force an integer dtype: an empty list would otherwise become a float
    # array, which cannot be used as an index below.
    batch_inds = np.random.permutation(np.asarray(batch_inds, dtype=np.intp))

    return dataset_features[batch_inds], dataset_targets[batch_inds]
    
# Draw the training and test batches. get_batch caps each class at
# n_size // n_classes rows, and a class with fewer rows in the split
# contributes only what it has, so a batch can be smaller than n_size.
# The 'validation' split is not used by any visible cell.
features_trn, targets_trn = get_batch(n_size=20000, phase='train')
features_tst, targets_tst = get_batch(n_size=10000, phase='test')

## Classification

In [67]:
importlib.reload(nhanes)

# Classifiers to compare, paired with the label printed in front of their accuracy.
classifiers = [
    ('Random Forest Classifier',
     RandomForestClassifier(n_estimators=100, class_weight='balanced')),
    ('Support Vector Classifier',
     SVC(gamma='auto', class_weight='balanced')),
    ('Logistic Regression',
     LogisticRegression(solver='lbfgs', max_iter=200, class_weight='balanced')),
    ('Linear Discriminant Analysis',
     LinearDiscriminantAnalysis(solver='svd')),
]

# Fit each classifier on the training batch and report accuracy, the
# per-class report and the confusion matrix on the test batch.
# clf, preds_tst, accu and cm are plain cell-level names, so after the loop
# they hold the results of the last classifier in the list.
for clf_name, clf in classifiers:
    clf.fit(features_trn, targets_trn)
    preds_tst = clf.predict(features_tst)
    accu = np.mean(preds_tst == targets_tst)
    print(clf_name, accu)
    print(classification_report(targets_tst, preds_tst))
    cm = confusion_matrix(targets_tst, preds_tst)
    print(cm)

Random Forest Classifier 0.8826232985681457
              precision    recall  f1-score   support

           0       0.23      0.00      0.01       657
           1       0.88      1.00      0.94      5000

   micro avg       0.88      0.88      0.88      5657
   macro avg       0.56      0.50      0.47      5657
weighted avg       0.81      0.88      0.83      5657

[[   3  654]
 [  10 4990]]
Support Vector Classifier 0.592893759943433
              precision    recall  f1-score   support

           0       0.12      0.39      0.18       657
           1       0.89      0.62      0.73      5000

   micro avg       0.59      0.59      0.59      5657
   macro avg       0.50      0.50      0.46      5657
weighted avg       0.80      0.59      0.67      5657

[[ 255  402]
 [1901 3099]]
Logistic Regression 0.5059218667138059
              precision    recall  f1-score   support

           0       0.12      0.50      0.19       657
           1       0.89      0.51      0.64      5000

 

  'precision', 'predicted', average, warn_for)


In [60]:
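# example (preds_tst holds the predictions of the last classifier fitted above,
# i.e. Linear Discriminant Analysis; replace the class names with the real
# meanings of targets 0 and 1):
# plot_confusion_matrix(targets_tst, preds_tst, classes=['class 0', 'class 1'], title='Linear Discriminant Analysis')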