# PySDDR Advanced Tutorial

Advanced tutorial that shows mixed effects of images and tabular features.

We will use the MNIST data set as the source for the image part and generate additional tabular features.
We won’t tell the model the number shown in each MNIST picture; instead, we feed the original image into the model to check whether it can learn a latent effect representing the number.

In [1]:
# import the sddr module
from sddr import Sddr
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import numpy as np
import torch


import matplotlib.pyplot as plt
import seaborn as sns


# set seeds for reproducibility: seed the global random number generators of PyTorch and NumPy
torch.manual_seed(1)
np.random.seed(1)

## Preparation step

### load data

In [2]:
# location of the tabular part of the data set
data_path = '../data/mnist_data/tab.csv'

data = pd.read_csv(data_path, delimiter=',')

# append a column for the numbers: each data point contains the file name of the
# corresponding image. The column is built in one assignment rather than with one
# .loc write per row, which is slow on large frames.
data['numbers'] = [f'img_{i}.jpg' for i in data.index]

### define SDDR parameters

In [3]:
# statistical model: one formula per parameter of the chosen distribution
formulas = dict(
    loc='~ -1 + spline(x1, bs="bs", df=10) + x2 + dnn(numbers) + spline(x3, bs="bs", df=10)',
    scale='~1',
)
distribution = 'Normal'

# network behind the dnn(...) term of the 'loc' formula: the input is flattened
# and mapped from 28*28 values to 128 features, followed by a ReLU
deep_models_dict = {
    'dnn': dict(
        model=nn.Sequential(
            nn.Flatten(1, -1),
            nn.Linear(28*28, 128),
            nn.ReLU(),
        ),
        output_shape=128,
    ),
}

# settings used during training
train_parameters = dict(
    batch_size=8000,
    epochs=1000,
    degrees_of_freedom=dict(loc=9.6, scale=9.6),
    optimizer=optim.Adam,
)

# location and type of the unstructured data used by the 'numbers' feature
unstructured_data = dict(
    numbers=dict(
        path='../data/mnist_data/mnist_images',
        datatype='image',
    ),
)

output_dir = './outputs'


## Initialization and training of SDDR

In [None]:
# build the SDDR model from the formulas, distribution, deep model and
# training settings defined above
sddr = Sddr(
    formulas=formulas,
    distribution=distribution,
    deep_models_dict=deep_models_dict,
    train_parameters=train_parameters,
    output_dir=output_dir,
)

# train on the tabular data with the 'y_gen' column as target; the images are
# supplied through unstructured_data
sddr.train(
    target="y_gen",
    structured_data=data,
    unstructured_data=unstructured_data,
    plot=True,
)

  return torch._C._cuda_getDeviceCount() > 0


Using device:  cpu
Beginning training ...
Train Epoch: 0 	 Loss: 131.097122
Train Epoch: 100 	 Loss: 101.242302
Train Epoch: 200 	 Loss: 76.320885
Train Epoch: 300 	 Loss: 57.496658


## Model saving and loading

In [None]:
# the trained model can also be saved under the given file name
# NOTE(review): the loading cell reads './outputs/temp_simple_gam.pth', so the name is
# presumably resolved relative to output_dir — confirm
sddr.save('temp_simple_gam.pth')


In [None]:
## a new model can be created using the saved parameters

# load trained SDDR and predict
# increase the number of epochs that the model should train for
train_parameters['epochs'] = 1100

# create a new model; the class is imported as `Sddr`, so `SDDR` would raise a NameError
sddr_resume = Sddr(output_dir=output_dir,
                   distribution=distribution,
                   formulas=formulas,
                   deep_models_dict=deep_models_dict,
                   train_parameters=train_parameters)

# load the saved network; the training data is needed so that the
# preprocessing steps can be set up again
sddr_resume.load('./outputs/temp_simple_gam.pth', data)

In [None]:
# after loading, the training can be resumed. The target column and the image
# data are the same as in the first training run; no variable named `target` exists.
sddr_resume.train(target="y_gen",
                  structured_data=data,
                  unstructured_data=unstructured_data,
                  resume=True)

## Evaluation

In [None]:
# compute and plot the partial effects for each distribution parameter,
# first 'loc' and then 'scale'
partial_effects_loc, partial_effects_scale = (
    sddr.eval(param_name, plot=True) for param_name in ('loc', 'scale')
)

In [None]:
# compare prediction of neural network with ground truth

data_pred = data.loc[:,:]
ground_truth  = data.loc[:,'groundtruth']
# predict returns a distribution layer that gives statistical information about the prediction
distribution_layer, partial_effect = sddr.predict(data_pred,
                                                  clipping=True,
                                                  plot=False,
                                                  unstructured_data = unstructured_data)
predicted_mean = distribution_layer.loc[:,:].T
# `scale` of a Normal distribution is the standard deviation, so it is squared
# to obtain the variance that is reported below
predicted_variance = distribution_layer.scale[0] ** 2

plt.scatter(ground_truth, predicted_mean)
print(f"Predicted variance for first sample: {predicted_variance}")

In [None]:
# check whether the model learned the correct correspondence of images and numbers

# work on a copy of the data in which every input except the images is set to zero
data_pred2 = data.copy()
for tabular_column in ('x1', 'x2', 'x3'):
    data_pred2.loc[:, tabular_column] = 0

distribution_layer, partial_effect = sddr.predict(
    data_pred2,
    clipping=True,
    plot=False,
    unstructured_data=unstructured_data,
)

# store the predicted location parameter as a flat array next to the inputs
predicted_loc = distribution_layer.loc[:, :]
data_pred2['predicted_number'] = predicted_loc.numpy().flatten()

# compare the true number on the images with the predicted number
sns.boxplot(x="y_true", y="predicted_number", data=data_pred2)
