# Binary Example
A high dimensional data set has monte carlo integration applied.

Useful because of the ability to backtrack from a model, and identify what metrics are significant indicators of issues.

In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.svm import OneClassSVM
from itertools import compress

from highd import HighD

In [2]:
plt.style.use("illumina.mplstyle")

## Preprocessing

Load dataset. The breast cancer data set is used as it has a good number of samples for model training, and a number of dimensions that will result in a significantly difficult to interpret model.

In [3]:
iris = load_breast_cancer()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["species"] = pd.Categorical.from_codes(iris.target, iris.target_names)
print(df["species"].value_counts())
print("Shape:", df.shape)
df.head()

benign       357
malignant    212
Name: species, dtype: int64
Shape: (569, 31)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,species
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,malignant
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,malignant
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,malignant
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,malignant
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,malignant


Select (5%) positives then drop species as it is the target; this demonstrates the method dealing with class imbalance.

In [4]:
pos = df[df["species"] == "malignant"][:int(df.shape[0] * 0.05)]
neg = df[df["species"] == "benign"]
df = pd.concat([pos, neg])

Drop species as this is the target.

In [5]:
targets = df["species"]
df = df.drop(["species"], axis=1)

Scale the data.

In [6]:
hd = HighD(df, targets, "malignant", "benign")

Class balance fixed, Negatives: 216 , Positives: 216


In [7]:
# hd.scatter_plot_matrix()

In [8]:
# Train classifier and check accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    hd.scaled, hd.targets, test_size=0.33, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
preds = rf_clf.predict(X_test)
print("F1 score:", f1_score(y_test, preds, pos_label="malignant"))
print("Confusion matrix:")
print(confusion_matrix(y_test, preds))

F1 score: 0.8333333333333333
Confusion matrix:
[[114   1]
 [  3  10]]


## Classifier Training and Analysis

### Random Forests

In [9]:
def rf_clf_func(df):
    proba = rf_clf.predict_proba(df)
    return np.array([i[1] for i in proba])

In [10]:
hd.density_estimate(rf_clf_func, n=10000, k_dens=0.01, n_bins=50)

Select 5 most important features to visualize.

In [11]:
features = [(f, i) for f, i in zip(X_train.columns, rf_clf.feature_importances_)]   
features.sort(key=lambda x: x[1], reverse=True)
features = [i[0] for i in features[:5]]
print(features)

hd.select_1d_bins(features, n_bins=25)

['worst concave points', 'worst area', 'mean radius', 'worst radius', 'mean concave points']


In [12]:
hd.D.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,prediction
0,0.115811,0.259577,0.124645,0.068292,0.539846,0.234628,0.106018,0.125421,0.293672,0.539615,...,0.268194,0.101525,0.051523,0.738137,0.206277,0.202501,0.385618,0.271796,0.3085,0.0
1,0.426688,0.174395,0.400051,0.308913,0.275713,0.057861,0.003614,0.042854,0.196913,0.081661,...,0.179641,0.317376,0.192822,0.206964,0.015348,0.015864,0.131693,0.101476,0.061691,0.0
2,0.109147,0.195973,0.098429,0.057591,0.340782,0.095427,0.011991,0.018377,0.347612,0.335846,...,0.17003,0.051798,0.021419,0.33327,0.036006,-0.003972,-0.001979,0.189722,0.148415,0.0
3,0.300701,0.473614,0.307601,0.181644,0.320197,0.170337,0.13028,0.131855,0.226323,0.292904,...,0.585853,0.251878,0.121851,0.710979,0.183934,0.24442,0.293901,0.240135,0.226248,0.0
4,0.536793,0.736378,0.565665,0.417169,0.545009,0.521338,0.41213,0.496967,0.647713,0.430852,...,0.838857,0.554882,0.309333,0.705295,0.605194,0.552892,0.631812,0.520292,0.518733,0.52


In [99]:
res_vals = np.linspace(0.0, 1.0, 51)
cut_vals = pd.cut([], res_vals)
cols = hd.D.columns[:-1]
D = hd.D
tmp = D[["mean radius", "mean texture", "prediction"]]

def get_bin_index(val):
    # ━━☆ﾟ
    return np.where(cut_vals.categories.contains(val))[0][0]

def select_bin(row):
    row = get_bin_index(row["mean radius"]) * len(res_vals)
    col = get_bin_index(row["mean texture"])
    return row * col
    
tmp["bin"] = None
# tmp["mean radius bin"] = pd.cut(tmp["mean radius"], bins=res_vals)
# tmp["mean texture bin"] = pd.cut(tmp["mean texture"], bins=res_vals)
# # Create a cartesian product of other bins column.
# tmp["mixed"] = None
# tmp["mixed"] = tmp.apply(
#     lambda row: (row["mean radius bin"], row["mean texture bin"]),
#     axis=1
# )
# # Group by that bin and take mean, giving the mean prediction.
# tmp = tmp.dropna()
# tst = tmp.groupby("mixed").mean()

In [100]:
select_bin(tmp.loc[0])

mean radius     0.115811
mean texture    0.259577
prediction             0
bin                 None
Name: 0, dtype: object


IndexError: invalid index to scalar variable.

In [92]:
# Select index of bin.
ind = np.where(cut_vals.categories.contains(0.9))[0][0]
print(ind)

44


In [89]:
tmp.head()

Unnamed: 0,mean radius,mean texture,prediction,bin
0,0.115811,0.259577,0.0,
1,0.426688,0.174395,0.0,
2,0.109147,0.195973,0.0,
3,0.300701,0.473614,0.0,
4,0.536793,0.736378,0.52,


In [22]:
tst.head(100)

Unnamed: 0_level_0,mean radius,mean texture,prediction
mixed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"((0.0, 0.0204], (0.122, 0.143])",0.008982,0.137823,0.000000
"((0.0, 0.0204], (0.143, 0.163])",0.006132,0.155308,0.000000
"((0.0, 0.0204], (0.163, 0.184])",0.009113,0.165062,0.000000
"((0.0204, 0.0408], (0.143, 0.163])",0.026249,0.146688,0.000000
"((0.0204, 0.0408], (0.592, 0.612])",0.040545,0.610549,0.000000
"((0.0204, 0.0408], (0.633, 0.653])",0.037135,0.646590,0.000000
"((0.0204, 0.0408], (0.653, 0.673])",0.035917,0.659500,0.000000
"((0.0408, 0.0612], (0.592, 0.612])",0.051013,0.606357,0.000000
"((0.0408, 0.0612], (0.612, 0.633])",0.048737,0.621131,0.000000
"((0.0408, 0.0612], (0.633, 0.653])",0.052761,0.643640,0.000000


In [41]:
for i in tst.index:
    if 0.03 in i[0]:
        print(i)
        tst.loc[i]
#         print(i in tst.index)

(Interval(0.0204, 0.0408, closed='right'), Interval(0.143, 0.163, closed='right'))


KeyError: Interval(0.0204, 0.0408, closed='right')

In [24]:
tst.index

Index([   ((0.0, 0.0204], (0.122, 0.143]),    ((0.0, 0.0204], (0.143, 0.163]),
          ((0.0, 0.0204], (0.163, 0.184]), ((0.0204, 0.0408], (0.143, 0.163]),
       ((0.0204, 0.0408], (0.592, 0.612]), ((0.0204, 0.0408], (0.633, 0.653]),
       ((0.0204, 0.0408], (0.653, 0.673]), ((0.0408, 0.0612], (0.592, 0.612]),
       ((0.0408, 0.0612], (0.612, 0.633]), ((0.0408, 0.0612], (0.633, 0.653]),
       ...
          ((0.959, 0.98], (0.163, 0.184]),    ((0.959, 0.98], (0.184, 0.204]),
          ((0.959, 0.98], (0.204, 0.224]),    ((0.959, 0.98], (0.306, 0.327]),
          ((0.959, 0.98], (0.327, 0.347]),    ((0.959, 0.98], (0.347, 0.367]),
            ((0.98, 1.0], (0.327, 0.347]),      ((0.98, 1.0], (0.531, 0.551]),
            ((0.98, 1.0], (0.551, 0.571]),      ((0.98, 1.0], (0.571, 0.592])],
      dtype='object', name='mixed', length=772)

In [None]:
Zm = [[((i + j)/2)-0.5 for i in self.D_bins[cols[x]]] for
                          j in self.D_bins[cols[y]]]
                    axes[x, y].contourf(Xm, Ym, Zm, levels=np.linspace(-0.5,
                                        0.5, 41), cmap="seismic")

In [19]:
raise Exception("Stop!")

Exception: Stop!

In [None]:
hd.density_scatter("worst area")

In [None]:
hd.scatter_plot_matrix(features)

In [None]:
hd.vis_1d(figsize=(32, 8))

Everything red will be classified as malignant, everything blue will be classified as benign.

In [None]:
hd.vis_2d(title="Breast Cancer Classification")

In [None]:
tst = hd.D_bins

In [None]:
tst

### One Class SVM
Used for outlier detection/estimating a learning frontier. The SVM is set to use 10% of points estimated as outliers to set the learning frontier.

In [None]:
oc_clf = OneClassSVM(gamma='auto', nu=0.1).fit(X_train)

In [None]:
def oc_clf_func(df):
    pred = oc_clf.predict(df)
    return [(i + 1) / 2 for i in pred]

In [None]:
tst = oc_clf_func(X_train)
pd.Series(tst).value_counts()

In [None]:
hd.density_estimate(oc_clf_func, n=10000)

In [None]:
hd.density_scatter("mean smoothness")

In [None]:
hd.select_1d_bins(features, n_bins=25)

In [None]:
hd.scatter_plot_matrix(features)

In [None]:
hd.vis_1d(figsize=(32, 8))

Red will be classified as within range of the classifier, while blue will be classified as outliers.

In [None]:
hd.vis_2d(title="Breast Cancer Outlier Classification")

## Random Forest Classifier and Outlier Detection Superposition

In [None]:
def oc_rf_clf_func(df):
    oc_pred = oc_clf_func(df)
    rf_pred = rf_clf_func(df)
    return [(rf_pred[i] - 0.5) * i_val + 0.5 for i, i_val in enumerate(oc_pred)]

In [None]:
tst = oc_rf_clf_func(X_train[:30])
print(tst[:30])

In [None]:
hd.density_estimate(oc_rf_clf_func, n=10000)

In [None]:
hd.density_scatter("mean smoothness")

In [None]:
hd.select_1d_bins(features, n_bins=25)

In [None]:
hd.vis_1d(figsize=(32, 8))

The areas where the classifier is uncertain have been superposed with the areas where samples are taken where training data is not present. The dark colours are strong indicates of good quality accross dimensions.

In [None]:
hd.vis_2d(title="Breast Cancer Outlier Classification Certainty")

## Neural Network

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 30 dimensions go in, 2 go out. The 2 coming out are the two
        # different classes available.
        self.fc1 = nn.Linear(X_train.shape[1], 3)
        self.fc2 = nn.Linear(3, 2)

    def forward(self, x):
        x = self.fc1(x)
        x = F.tanh(x)
        x = self.fc2(x)
        return x

    def predict(self, x):
        # Calculate probabilities of each class.
        pred = F.softmax(self.forward(x))
        
        # Return the largest weighted class.
        return torch.tensor([0 if i[0] > i[1] else 1 for i in pred])
    
    def predict_proba(self, x):
        # Calculate probabilities of each class.
        pred = F.softmax(self.forward(x))
        
        # Return the largest weighted class.
        return torch.tensor([i[1] / (i[0] + i[1]) for i in pred])

This may need to be rerun to prevent the stochastic trap.

In [None]:
net = Net()

# Prepare data for neural network.
inputs = torch.FloatTensor(X_train.values)
labels = torch.LongTensor([1 if i == "malignant" else 0 for i in y_train])

# Optimizer to speed things up, cross entropy loss as it is a classifcation problem.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.01)
epochs = 30000

losses = []
for epoch in range(epochs):
    outputs = net.forward(inputs)
    loss = criterion(outputs, labels)
    losses.append(loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 1000 == 0:
        print("epoch:", epoch, "loss:", loss)
    
print('Finished Training ')

Test NN accuracy.

In [None]:
test_input = torch.FloatTensor(X_test.values)
preds = np.array(net.predict(test_input))
test_labels = [1 if i == "malignant" else 0 for i in y_test]
print(preds)
print("F1 score:", f1_score(test_labels, preds, pos_label=1))
print("Confusion matrix:")
print(confusion_matrix(test_labels, preds))

The actual classifier function uses the raw certainty of the model.

In [None]:
preds = np.array(net.predict_proba(test_input))
for i in zip(preds, test_labels):
    print(i)

In [None]:
# Certainty 
def nn_clf_func(df):
    test_tensor = torch.FloatTensor(df.values)
    return np.array(net.predict_proba(test_tensor))

In [None]:
hd.density_estimate(nn_clf_func, n=10000)

In [None]:
hd.density_scatter("worst area")

In [None]:
hd.select_1d_bins(features, n_bins=25)
hd.vis_1d(figsize=(32, 8))
hd.vis_2d(title="Breast Cancer Classification NN")

## NN Classifier and Outlier Detection Superposition

In [None]:
# Certainty 
def oc_nn_clf_func(df):
    oc_pred = oc_clf_func(df)
    nn_pred = nn_clf_func(df)
    return [(nn_pred[i] - 0.5) * i_val + 0.5 for i, i_val in enumerate(oc_pred)]

In [None]:
tst = oc_nn_clf_func(X_train)
print(tst[:30])

In [None]:
hd.density_estimate(oc_nn_clf_func, n=10000)
hd.density_scatter("mean smoothness")
hd.select_1d_bins(features, n_bins=50)
hd.vis_1d(figsize=(32, 8))
hd.vis_2d(title="Breast Cancer Classification NN")

In [None]:
hd.vis_1d_separate("Breast Cancer Metric Effects on Diagnosis")