In [1]:
import os
import random
import pandas as pd
import numpy as np

import torch
from torch import nn
import skorch

import matplotlib.pyplot as plt
import seaborn as sns
from jupyterthemes import jtplot

from scipy.io import arff

%matplotlib inline
jtplot.style('gruvboxd')

In [2]:
# Despite its name, DATA_DIR holds the full path of the dataset *file*.
# The path is derived from the working directory by swapping every
# 'Notebooks' substring for 'data'.
# NOTE(review): the file carries a .csv extension but is read with scipy's
# ARFF loader in the following cell — confirm the on-disk format.
_data_root = os.getcwd().replace('Notebooks', 'data')
DATA_DIR = os.path.join(_data_root, 'ct-dataset.csv')

In [3]:
### Load dataset
# loadarff returns a tuple; its first element is the NumPy record array that
# the later cells turn into a DataFrame via data[0].
data = arff.loadarff(DATA_DIR)

In [4]:
### Reproducibility
SEED = 2022

# NOTE: PYTHONHASHSEED is read when the interpreter starts, so setting it here
# only affects child processes, not string hashing in the running kernel.
os.environ['PYTHONHASHSEED'] = '0'
torch.manual_seed(SEED)
# Seed NumPy's *global* RNG. np.random.default_rng(seed) merely constructs a
# new, independent Generator (which was immediately discarded), leaving the
# global state used by pandas' .sample() and scikit-learn's
# train_test_split(random_state=None) unseeded.
np.random.seed(SEED)
random.seed(SEED)

In [5]:
# Inspect the container returned by loadarff (the output shows a tuple).
type(data)

tuple

In [6]:
# The first tuple element holds the records (the output shows a numpy.ndarray).
type(data[0])

numpy.ndarray

In [7]:
# One-dimensional array with one entry per row (the output shows 53500 records).
data[0].shape

(53500,)

In [8]:
# Turn the ARFF record array into a DataFrame (one column per ARFF attribute).
arff_records = data[0]
new_data = pd.DataFrame(arff_records)

In [9]:
# Per-column summary statistics (count, mean, std, quartiles, min/max).
new_data.describe()

Unnamed: 0,patientId,value0,value1,value2,value3,value4,value5,value6,value7,value8,...,value375,value376,value377,value378,value379,value380,value381,value382,value383,reference
count,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,...,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0
mean,47.075701,0.059627,0.071558,0.145819,0.218728,0.274762,0.276189,0.204531,0.062281,-0.042025,...,-0.029404,0.182913,0.320112,0.359373,0.342889,0.266091,0.083049,-0.031146,-0.154524,47.028039
std,27.41424,0.174243,0.196921,0.30027,0.359163,0.378862,0.369605,0.351294,0.292232,0.268391,...,0.085817,0.383333,0.463517,0.478188,0.471811,0.437633,0.279734,0.098738,0.122491,22.347042
min,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,-0.25,...,-0.25,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,1.738733
25%,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,29.891607
50%,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,43.987893
75%,70.0,0.0,0.0,0.0,0.446429,0.684477,0.662382,0.441412,0.0,0.0,...,0.0,0.0,0.996286,0.999677,0.99956,0.949478,0.0,0.0,0.0,63.735059
max,96.0,1.0,1.0,1.0,1.0,0.99879,0.996468,0.999334,1.0,1.0,...,0.961279,1.0,1.0,1.0,1.0,1.0,0.999857,0.996839,0.942851,97.489115


In [10]:
# Preview the first five rows.
new_data.head()

Unnamed: 0,patientId,value0,value1,value2,value3,value4,value5,value6,value7,value8,...,value375,value376,value377,value378,value379,value380,value381,value382,value383,reference
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.980381,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.803851
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.745726
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.6876
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.629474
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.976833,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.571348


In [11]:
# (rows, columns) of the full frame; the output shows 53500 x 386.
new_data.shape

(53500, 386)

In [12]:
# Column dtypes; the output lists float64 for every column shown, patientId included.
new_data.dtypes

patientId    float64
value0       float64
value1       float64
value2       float64
value3       float64
              ...   
value380     float64
value381     float64
value382     float64
value383     float64
reference    float64
Length: 386, dtype: object

In [13]:
### Possibly categorical features
# A column with fewer than MAX_CATEGORICAL_LEVELS distinct values is treated
# as a candidate categorical feature.
MAX_CATEGORICAL_LEVELS = 50

# nunique() scans every column of the frame, so compute it once and reuse the
# result instead of re-running it for each mask / lookup.
unique_counts = new_data.nunique()
low_cardinality = unique_counts[unique_counts < MAX_CATEGORICAL_LEVELS]

cat_features = low_cardinality.index
# Display a random handful of the candidates together with their cardinality.
low_cardinality.sample(n = 10)

value311     3
value68      2
value59      1
value168     3
value99      4
value262     6
value270    27
value209    13
value367     2
value48      2
dtype: int64

In [14]:
### Features and targets
# Features: every column except the patient identifier and the target.
X = new_data.drop(columns = ['patientId', 'reference'])
# Target: the last column of the frame, shaped as an (n_rows, 1) column
# vector because skorch's regressor requires a 2-D y.
y = new_data.iloc[:, -1].values.reshape(-1, 1)

In [15]:
### Drop invariant features
# A low-cardinality feature is flagged for removal when a single value
# accounts for more than 80% of its rows. The relative frequency of every
# distinct value is printed as a diagnostic along the way.
to_drop = []

for feature in cat_features:
    value_shares = X[feature].value_counts(normalize = True)
    print(f'Feature diagnostics (Feature {feature})')

    dominant_share = value_shares.max()
    if dominant_share > 0.80:
        to_drop.append(feature)

    for level, share in value_shares.items():
        print(f'\t{level} has a count(s) of {share : .4f}')

    print('+' * 100)

Feature diagnostics (Feature value37)
	0.0 has a count(s) of  0.8284
	-0.25 has a count(s) of  0.1710
	0.091011 has a count(s) of  0.0001
	0.61636 has a count(s) of  0.0000
	0.340495 has a count(s) of  0.0000
	0.54983 has a count(s) of  0.0000
	0.484688 has a count(s) of  0.0000
	0.536246 has a count(s) of  0.0000
	0.502759 has a count(s) of  0.0000
	0.581729 has a count(s) of  0.0000
	0.141197 has a count(s) of  0.0000
	0.668654 has a count(s) of  0.0000
	0.54172 has a count(s) of  0.0000
	0.106509 has a count(s) of  0.0000
	0.408232 has a count(s) of  0.0000
	0.1861 has a count(s) of  0.0000
	0.286726 has a count(s) of  0.0000
	0.231801 has a count(s) of  0.0000
	0.168574 has a count(s) of  0.0000
	0.684122 has a count(s) of  0.0000
	0.824634 has a count(s) of  0.0000
	0.566096 has a count(s) of  0.0000
	0.111724 has a count(s) of  0.0000
	0.335751 has a count(s) of  0.0000
	0.635486 has a count(s) of  0.0000
	0.38756 has a count(s) of  0.0000
	0.721604 has a count(s) of  0.0000
	0.6

	0.677379 has a count(s) of  0.0000
	0.516467 has a count(s) of  0.0000
	0.608466 has a count(s) of  0.0000
	0.721757 has a count(s) of  0.0000
	0.767868 has a count(s) of  0.0000
	0.719228 has a count(s) of  0.0000
	0.338098 has a count(s) of  0.0000
	0.3405 has a count(s) of  0.0000
	0.67286 has a count(s) of  0.0000
	0.55914 has a count(s) of  0.0000
	0.848795 has a count(s) of  0.0000
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Feature diagnostics (Feature value287)
	-0.25 has a count(s) of  0.9378
	0.0 has a count(s) of  0.0622
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Feature diagnostics (Feature value294)
	0.0 has a count(s) of  0.9146
	-0.25 has a count(s) of  0.0852
	0.575599 has a count(s) of  0.0000
	0.683816 has a count(s) of  0.0000
	0.807535 has a count(s) of  0.0000
	0.577095 has a count(s) of  0.0000
	0.30795 has a count(s) of  0.0000
	0.643405 has a count

In [16]:
# Number of features flagged for removal (the output shows 36).
len(to_drop)

36

In [17]:
# Remove the near-constant features flagged in to_drop from the feature matrix.
new_X = X.drop(columns = to_drop)

In [18]:
### Convert data to PyTorch tensors
# np.asarray accepts the DataFrame / ndarray built above as well as an
# already-converted CPU tensor, so this cell can be re-run safely. The
# previous `.values` / `from_numpy` form raised once new_X and y had been
# overwritten with tensors.
new_X = torch.as_tensor(np.asarray(new_X), dtype = torch.float32)
y = torch.as_tensor(np.asarray(y), dtype = torch.float32)

In [19]:
# List any column whose dtype is not float64; the empty Series in the output means there are none.
new_data.dtypes[new_data.dtypes != np.float64]

Series([], dtype: object)

In [20]:
# Total number of missing values across the whole frame (the output shows 0).
new_data.isnull().sum().sum()

0

In [21]:
# Exploration aid: list the names exposed by the skorch package.
dir(skorch)

['History',
 'MIN_TORCH_VERSION',
 'NeuralNet',
 'NeuralNetBinaryClassifier',
 'NeuralNetClassifier',
 'NeuralNetRegressor',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'callbacks',
 'classifier',
 'dataset',
 'exceptions',
 'history',
 'net',
 'parse_version',
 'pkg_resources',
 'regressor',
 'setter',
 'sys',
 'torch',
 'torch_version',
 'utils',

In [22]:
class MyNet(nn.Module):
    """Three-layer fully connected network for regression or classification.

    Layer widths scale with the input size:
    ``n_features -> 2*n_features -> 4*n_features -> output``, with a ReLU
    after each of the first two layers.

    Parameters
    ----------
    n_features : int
        Number of input features per sample.
    task : str, default 'classif'
        ``'classif'`` builds an ``n_classes``-wide output layer followed by a
        softmax; any other value builds a single linear output unit.
    n_classes : int or None, default None
        Number of classes. Must be a positive integer when
        ``task == 'classif'``; ignored otherwise.

    Raises
    ------
    ValueError
        If ``task == 'classif'`` and ``n_classes`` is not a positive integer.
    """

    def __init__(self, n_features, task = 'classif', n_classes = None):
        super().__init__()
        self.n_features = n_features
        self.task = task
        self.n_classes = n_classes

        if self.task == 'classif':
            # The previous check, `n_classes is not None & type(n_classes) == int`,
            # parsed as `n_classes is not (None & type(...)) == int` because `&`
            # binds tighter than comparisons, so it raised TypeError for every
            # classification model. bool is excluded because it subclasses int.
            n_classes_ok = (isinstance(self.n_classes, int)
                            and not isinstance(self.n_classes, bool)
                            and self.n_classes > 0)
            if not n_classes_ok:
                raise ValueError('Number of classes must be specified '
                                 'if task is a Classification.')

        self.layer1 = nn.Linear(self.n_features, 2*self.n_features)
        self.layer2 = nn.Linear(2*self.n_features, 4*self.n_features)

        # Output layer: one unit for regression, one unit per class otherwise.
        n_outputs = self.n_classes if self.task == 'classif' else 1
        self.layer3 = nn.Linear(4*self.n_features, n_outputs)

    def forward(self, x):
        """Run the network on ``x`` whose last dimension is ``n_features``.

        Returns class probabilities (softmax over the last dimension) when
        ``task == 'classif'``; otherwise the raw output of the final layer.
        """
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = self.layer3(x)

        if self.task == 'classif':
            # torch.softmax needs an explicit dim; normalise over the class axis.
            return torch.softmax(x, dim = -1)

        return x

In [23]:
# Exploration aid: print the NeuralNetRegressor docstring (it notes that y must be 2d).
print(skorch.NeuralNetRegressor.__doc__)

NeuralNet for regression tasks

    Use this specifically if you have a standard regression task,
    with input data X and target y. y must be 2d.

    In addition to the parameters listed below, there are parameters
    with specific prefixes that are handled separately. To illustrate
    this, here is an example:

    >>> net = NeuralNet(
    ...    ...,
    ...    optimizer=torch.optimizer.SGD,
    ...    optimizer__momentum=0.95,
    ...)

    This way, when ``optimizer`` is initialized, :class:`.NeuralNet`
    will take care of setting the ``momentum`` parameter to 0.95.

    (Note that the double underscore notation in
    ``optimizer__momentum`` means that the parameter ``momentum``
    should be set on the object ``optimizer``. This is the same
    semantic as used by sklearn.)

    Furthermore, this allows to change those parameters later:

    ``net.set_params(optimizer__momentum=0.99)``

    This can be useful when you want to change certain parameters
    using a callback,

In [24]:
### Instantiate Skorch model
# Size the input layer from the feature matrix instead of hard-coding 348, so
# a change to the feature-dropping rule cannot cause a silent shape mismatch.
# train_split = None disables skorch's internal validation split.
net = skorch.NeuralNetRegressor(module = MyNet(n_features = new_X.shape[1],
                                               task = 'regression'),
                                optimizer = torch.optim.SGD, optimizer__lr = 0.001,
                                max_epochs = 20, batch_size = 16, train_split = None)

In [25]:
### Split dataset for training, validation, and testing
from sklearn.model_selection import train_test_split

# Fixed random_state: without it the partitions (and every score computed on
# them) change on each run. The value matches the seed used for torch/random.
# Resulting proportions: 20% test, then 35% of the remaining 80% for
# validation -> 52% train / 28% validation / 20% test.
X_trainval, X_test, y_trainval, y_test = train_test_split(new_X, y, test_size = 0.2,
                                                          random_state = 2022)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size = 0.35,
                                                  random_state = 2022)

In [26]:
# Size of the training partition after both splits (the output shows 27820 rows x 348 features).
X_train.shape

torch.Size([27820, 348])

In [27]:
# Train on the training partition only; the net was built with train_split = None,
# and the per-epoch log in the output reports train_loss only.
net.fit(X_train, y_train)

  epoch    train_loss      dur
-------  ------------  -------
      1       [36m62.3015[0m  47.5030
      2       [36m11.5590[0m  44.7033
      3        [36m5.6381[0m  45.6393
      4        [36m3.4205[0m  46.7653
      5        [36m2.3414[0m  50.2796
      6        [36m1.7144[0m  47.3859
      7        [36m1.3254[0m  49.5965
      8        [36m1.0648[0m  46.1936
      9        [36m0.8855[0m  46.1685
     10        [36m0.7521[0m  44.9505
     11        [36m0.6544[0m  46.7621
     12        [36m0.5738[0m  46.5219
     13        [36m0.5123[0m  47.5097
     14        [36m0.4604[0m  46.6405
     15        [36m0.4173[0m  46.0819
     16        [36m0.3808[0m  46.9239
     17        [36m0.3520[0m  44.5908
     18        [36m0.3254[0m  40.1903
     19        [36m0.3028[0m  48.5742
     20        [36m0.2820[0m  50.6496


<class 'skorch.regressor.NeuralNetRegressor'>[initialized](
  module_=MyNet(
    (layer1): Linear(in_features=348, out_features=696, bias=True)
    (layer2): Linear(in_features=696, out_features=1392, bias=True)
    (layer3): Linear(in_features=1392, out_features=1, bias=True)
  ),
)

In [28]:
from sklearn.metrics import r2_score

In [29]:
### Train R^2 (coefficient of determination — a regression score, not an accuracy)
train_predictions = net.predict(X_train)
print(r2_score(y_train, train_predictions))

0.9993076654371598


In [30]:
### Validation R^2 (coefficient of determination — a regression score, not an accuracy)
val_predictions = net.predict(X_val)
print(r2_score(y_val, val_predictions))

0.9979037567794474


In [31]:
### Test R^2 (coefficient of determination — a regression score, not an accuracy)
test_predictions = net.predict(X_test)
print(r2_score(y_test, test_predictions))

0.9982135908127876


In [32]:
# Exploration aid: names of the optimizers registered on the net (the output shows ['optimizer']).
# NOTE(review): _optimizers is a private attribute (leading underscore) — avoid relying on it.
net._optimizers

['optimizer']

# max_epochs = 20 repeats the value already passed when net was constructed.
# NOTE(review): this cell has no execution count or output, so it appears not
# to have been run. fit() presumably starts another full 20-epoch training
# pass — confirm whether continuing training (e.g. partial_fit) was intended.
net.set_params(max_epochs = 20)
net.fit(X_train, y_train)

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline