In [1]:
import os
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import pickle
from torchvision.datasets import MNIST, EMNIST
from torch.utils.data import DataLoader
from torchvision.models import resnet50
import CandC


  from .autonotebook import tqdm as notebook_tqdm


## Set-up

Unfortunately, torch is not perfect and we cannot directly download the EMNIST data. We recommend downloading the binaries directly from the EMNIST website with the `Binary format as the original MNIST dataset` selected (https://www.nist.gov/itl/products-and-services/emnist-dataset), (https://biometrics.nist.gov/cs_links/EMNIST/gzip.zip) or from the following link (https://marvinschmitt.com/blog/emnist-manual-loading/) (use wget -p ./data/EMNIST 'address'> or  curl -p ./data/EMNIST 'address').

Finally run:
`for file in *.gz; do
  gunzip -c "$file" > /path/to/destination/"${file%.gz}"
done`

If errors occur, make sure the unzipped files are inside ./data/EMNIST/raw


In [2]:
# Define transformations (e.g., normalization)
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # ResNet50 requires 224x224 input size
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])


# Load MNIST dataset (digits)
train_mnist_data = MNIST(root='./data', train=True, download=True, transform=transform)
train_mnist_loader = DataLoader(train_mnist_data, batch_size=64, shuffle=True)
test_mnist_data = MNIST(root='./data', train=False, download=True, transform=transform)
test_mnist_loader = DataLoader(test_mnist_data, batch_size=64, shuffle=False)

# Load EMNIST dataset (letters)
# Ensure 'split' is set to 'letters' for alphabet characters

emnist_data = EMNIST(root=os.path.join(os.getcwd(),'data'), 
                     split='letters',
                     train=False,
                     download=False, 
                     transform=transform)
emnist_loader = DataLoader(emnist_data, batch_size=64, shuffle=False)


In [3]:
emnist_data

Dataset EMNIST
    Number of datapoints: 20800
    Root location: /home/jovyan/CandC_Framework/data
    Split: Test
    StandardTransform
Transform: Compose(
               Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=warn)
               Grayscale(num_output_channels=1)
               ToTensor()
               Normalize(mean=(0.5,), std=(0.5,))
           )

In [4]:
model = resnet50(pretrained=True)
model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)

# Modify the final fully connected layer to output 10 classes (for MNIST digits 0-9)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 10)




In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [6]:
def train_model(model, train_loader, criterion, optimizer, num_epochs=5, device='cpu'):
    model.to(device)
    model.train()  # Set the model to training mode

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        total = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Accumulate loss and accuracy
            running_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        epoch_loss = running_loss / total
        epoch_accuracy = correct / total

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

    print("Training complete.")

In [7]:
def test_model(model, test_loader, criterion, device='cpu'):
    model.to(device)
    model.eval()  # Set the model to evaluation mode

    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Accumulate loss and accuracy
            running_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    test_loss = running_loss / total
    test_accuracy = correct / total

    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
    return test_loss, test_accuracy

## Train and test model on the MNIST data

In [8]:
# Train the model
if not 'mnist_ex.pt' in os.listdir(os.path.join(os.getcwd(),'tutorial')):
    train_model(model, train_mnist_loader, criterion, optimizer, num_epochs=5, device=device)
else:
    model = torch.load(os.path.join(os.getcwd(),'tutorial','mnist_ex.pt'))

In [9]:

# Test the model
test_model(model, test_mnist_loader, criterion, device=device)

Test Loss: 0.0281, Test Accuracy: 0.9905


(0.028100473149446772, 0.9905)

## Set up model uq

Now that we have our in-distribution and out-of-distribution test samples, let's run through the model uq interface.

First, let's set up the model uq environment


In [10]:
model_uq_params = dict({ 'name': 'mnist_example',
                        'device': device,
                        'model': model,
                        'data_address': os.path.join(os.getcwd(),'data'),
                        'model_uq_address': os.path.join(os.getcwd(),'example_model_uq'),
                        'tpr_threshold': 0.95,
                        'n_class' : 10, # This is because the MNIST dataset is our baseline, and this consists of the 10 digits
                       })

model_uq = CandC.model_uq.Model_UQ(**model_uq_params)

Now, importantly, omitted the model_address key-value in the model_uq_params. Since we want to access the model we have just trained, let's go ahead and save this model so that we may be able to load it in as needed.

Further, even though we omitted the model_address, the model_uq default creates on for us. Let's use that.

The default can be found at os.path.join(os.getcwd(),'model_address'). Let's update this as follows to save in the tutorial folder.

In [11]:
model_uq.model_address = os.path.join(os.getcwd(),'tutorial')

In [12]:
torch.save(obj=model,f=os.path.join(os.getcwd(),'tutorial','mnist_ex.pt'))

### Formatting the Input_Data and Output_Data

The Input_class has the following possible attributes:

1. 'name'
2. 'input_data_features'
3. 'input_dataloader'
4. 'classification_categories'
5. 'labeled_data'
6. 'input_data_labeled'
7. 'classification_scheme'
8. 'safevalues'

 
There are several ways to encode input_data and output_data for the Model_UQ object. The first decision one has to make is whether the input data is stored with a dataloader or with features and labels separately loaded as np.array or torch.Tensor objects.

Since we have used dataloaders, we'll go ahead and load in the objects with the dataloader as follows:



In [13]:
classification_categories=dict({
    0:0,
    1:1,
    2:2,
    3:3,
    4:4,
    5:5,
    6:6,
    7:7,
    8:8,
    9:9})
input_data=dict({
    'name': 'mnist_example',
    'input_dataloader':test_mnist_loader,
    'classification_categories': classification_categories,
    'labeled_data':True,
})

In [14]:
model_data= CandC.model_uq.data.Model_Data(model=model,data=input_data)


Initializing with provided data and model.
Filling in data labels from dataloader.


We have loaded in the model_data. This took the input data ditionary and the model we have trained, and produced an object which also contains our output data. Further, this data is softmax adjusted by default. The full suite of attributes generated is as follows:

In [15]:
model_data.to_dict().keys()

dict_keys(['name', 'input_dataloader', 'classification_categories', 'labeled_data', 'input_data_labeled', 'output', 'prediction'])

In [16]:
model_data.output

tensor([[1.4089e-12, 2.7384e-08, 9.7237e-08,  ..., 1.0000e+00, 3.5737e-14,
         1.9997e-07],
        [8.3562e-09, 3.5649e-09, 1.0000e+00,  ..., 1.6583e-09, 7.1121e-09,
         1.6743e-08],
        [1.3383e-08, 9.9996e-01, 3.3536e-05,  ..., 2.9982e-07, 3.9025e-07,
         1.0370e-08],
        ...,
        [8.4576e-11, 1.4826e-08, 3.5245e-08,  ..., 3.5793e-08, 2.5801e-07,
         1.3964e-07],
        [2.4115e-09, 1.2591e-09, 1.9071e-09,  ..., 1.0270e-08, 1.2999e-06,
         6.9387e-07],
        [1.5548e-07, 8.2569e-10, 1.4350e-06,  ..., 1.3982e-14, 3.2639e-06,
         3.7087e-07]])

In [17]:
model_data.prediction

tensor([7, 2, 1,  ..., 4, 5, 6])

In [18]:
model_data.labeled_data

True

In [19]:
model_data.input_data_labeled

tensor([7, 2, 1,  ..., 4, 5, 6])

Let's save this object in the data folder:


In [20]:
model_data.save(address=os.path.join(os.getcwd(),'data'))

In [21]:
model_data = CandC.model_uq.data.Model_Data(model_data_address=os.path.join(os.getcwd(),'data'))

Loading model_data object attributes.


Now that we've explored the model_data, let's generate the model uq interface

In [22]:
model_uq_params = dict({
    'name': 'mnist_example',
    'device': device,
    'model' : model,
    'model_address': os.path.join(os.getcwd(),'tutorial','ex_mnist.pt'),
    'data_address': os.path.join(os.getcwd(),'data'),
    'model_uq_address':os.path.join(os.getcwd(),'tutorial','model_uq'),
    'tpr_threshold':0.95,
    'n_class':10, # This technically should correspond to the number of keys in the classification_cat we use for the corresponding model_data we use as our baseline for statistics
})

In [23]:
model_uq = CandC.model_uq.Model_UQ(**model_uq_params)

In [24]:
alloutputs = model_uq.fill_uq(model_data=model_data,
                              certainty_dist_name= "mnist_example_certainty_dist",
                              return_assignment_df=True,
                              return_model_data=True,
                              return_certainties=True,
                              return_certainty_dist =True,
                              return_scores = True,
                              return_omicrons=True,
                             verbose=False)

Assignment DF saved at /home/jovyan/CandC_Framework/data
Loading model_data object attributes.
Now gathering certainties
Original certainty shape is torch.Size([10000, 10, 10]) from predictions shape torch.Size([10000, 10])
Finished gathering certainties.
The length of classification_cat is 10000
 the length of predictions is 10000
 the length of certainty scores is 10000
Generating scores object
Attempting to load assignment_df saved at /home/jovyan/CandC_Framework/data
Loading model_data object attributes.
Gathering APS
Gathering RAS
Gathering CK
Gathering PE
Gathering MIP
Gathering CC
Gathering EC
The length of classification_cat is 10000
 the length of predictions is 10000
 the length of certainty scores is 10000
Attempting to load assignment_df saved at /home/jovyan/CandC_Framework/data
Gathering Certainty Score stats
	 -Gathering in sample certainty scores


100%|██████████| 33/33 [00:02<00:00, 12.37it/s]


	 -Gathering MWU Results


100%|██████████| 9/9 [00:01<00:00,  5.20it/s]


The omicron gatherlist has 10 many categories
Gathering the omicrons for all observed labels


Gathering omicron data: 100%|██████████| 10/10 [00:00<00:00, 13.38it/s]


dict_keys(['input', 'distribution_status', 'tpr_threshold', 'logistic_params'])
There are 10000 total items.
 There are 9905 in-distribution items.
 There are 95 out-of-distribution/FP items
The accuracy of the log omicron model after fitting is 0.9907
Model saved at /home/jovyan/CandC_Framework/tutorial/model_uq


Now that we have filled in the initial uncertainty quantification information, we can explore the outputs that we have loaded into memory with the alloutputs object. 

By default no outputs are made, but we can specify that when applying the fill_uq methods that we can return any of the following as part of a key-value pair in a dictionary:
1. model_data
2. assignment_df
3. certainties
4. certainty distribution
5. scores
6. omicrons

In particular, we might want to see what the scores for our model are in order to gauge it's quality.

In [25]:
scores = alloutputs['scores'].scores

In order to simplify our work, we've just extracted the saved scores dictionary that is an attribute of the Scores object stored in the alloutputs object returned by the model_uq.fill_uq method.

Let's now examine the scores of the model.

In [26]:
scores

{'MCA': 0.9905,
 'TP': 9905,
 'FP': 95,
 'F1': array([0.9959142 , 0.99296394, 0.98800959, 0.993083  , 0.98680203,
        0.99159664, 0.99112272, 0.98321816, 0.99282051, 0.98956781]),
 'APS': 0.9905537373639556,
 'RAS': 0.9999566569432103,
 'CK': 0.9894403117344469,
 'PE': None,
 'MIP': None,
 'CC': array(0.8860517, dtype=float32),
 'EC': 0.9742994140625}

We observe the following:

Accuracy is .9905

There are 9905 TP

There are 95 FP

The F1 scores within each category are superlative: 
\n [0.9959142 , 0.99296394, 0.98800959, 0.993083  , 0.98680203,
        0.99159664, 0.99112272, 0.98321816, 0.99282051, 0.98956781],
        
The average precision score 'APS' is 0.9905537373639556,

The average AUROC score ('RAS') is 0.9999566569432103

Cohen's Kappa 'CK' is 0.9894403117344469

The component competence  'CC' is 0.8861,

whereas the empirical 'EC' is 0.9742994140625


Since there are 10 categories, we conclude that the model is Expert, but not prescient, as the empirical competence is above the component competence.

Now, let's dig in a little further to see the MWU test scores relative to the TP and FP for each category.

In [27]:
empirical_competencies=alloutputs['scores'].empirical_competencies
in_sample_cert_scores = alloutputs['scores'].in_sample_cert_scores


In [28]:
empirical_competencies

{0: 0.9877104651952326,
 1: 0.9751871458996378,
 2: 0.9625806962436194,
 3: 0.9835620406111316,
 4: 0.9652657450934653,
 5: 0.9836474308641867,
 6: 0.9834291145221068,
 7: 0.9611088020768098,
 8: 0.9831020167616548,
 9: 0.9595922948829682}

In [29]:
in_sample_cert_scores['MWU Results']

{'1 Global scores': MannwhitneyuResult(statistic=array([6753.]), pvalue=array([2.63747956e-05])),
 '2 Global scores': MannwhitneyuResult(statistic=array([2044.]), pvalue=array([0.0159561])),
 '3 Global scores': MannwhitneyuResult(statistic=array([5016.]), pvalue=array([0.00010729])),
 '4 Global scores': MannwhitneyuResult(statistic=array([9622.]), pvalue=array([6.83349024e-08])),
 '5 Global scores': MannwhitneyuResult(statistic=array([6079.]), pvalue=array([1.13116159e-05])),
 '6 Global scores': MannwhitneyuResult(statistic=array([8499.]), pvalue=array([2.27317547e-07])),
 '7 Global scores': MannwhitneyuResult(statistic=array([30893.]), pvalue=array([1.46844476e-19])),
 '8 Global scores': MannwhitneyuResult(statistic=array([5786.]), pvalue=array([2.49008035e-05])),
 '9 Global scores': MannwhitneyuResult(statistic=array([12405.]), pvalue=array([1.33959478e-08]))}

First, we see that category by category, we have extremely high empirical competence scores. It is worth pointing out that there are two contributors to this high empirical competence score, the first is that certainty scores are extremely high in most cases, and FP occur extremely infrequently. Relying solely on the certainty score on a one-off basis is not the best guarantee of detection for a FP. 

When looking at the distribution of certainty scores for TP and FPs within each category, we find the MWU statistic and associated in p-values for all categories are sufficiently high and low respectively that we may reject the null-hypothesis that TP and FP are drawn from the same distribution.


#### Deeper Dive Into FP Certainties

At this point we have several objects that we can examine to get a better sense of how the model fails.

We will first look into the category which has the most FPs.


In [30]:
assignment_df=alloutputs['assignment_df'].data
assignment_df_fp = assignment_df.loc[assignment_df.predictive_status=='FP']

In [31]:
assignment_df_fp.prediction.value_counts()

prediction
2    23
4    16
1    10
3     9
5     8
8     8
9     8
6     8
0     3
7     2
Name: count, dtype: int64

We see that the '2' digit has the most FPs (23). Let's now examine the certainties for the 2 category

In [32]:
tp_2_indices = assignment_df.loc[(assignment_df.prediction==2) & (assignment_df.predictive_status=='TP')].index.to_list()
fp_2_indices=assignment_df_fp.loc[assignment_df_fp.prediction==2].index.to_list()

In [33]:
certainties = alloutputs['certainties']

In [34]:
tp_2_output = certainties.output[tp_2_indices]
tp_2_certainties = certainties.certainty[tp_2_indices]
fp_2_output = certainties.output[fp_2_indices]
fp_2_certainties = certainties.certainty[fp_2_indices]

In [35]:
tp_2_output[0]

tensor([8.3562e-09, 3.5649e-09, 1.0000e+00, 1.3180e-08, 8.3749e-10, 1.1831e-09,
        2.5862e-09, 1.6583e-09, 7.1121e-09, 1.6743e-08])

In [36]:
fp_2_output[0]

tensor([4.1300e-07, 2.0458e-04, 8.4215e-01, 1.0680e-06, 1.5226e-01, 1.8312e-05,
        5.2740e-03, 1.5429e-06, 8.5194e-05, 1.2366e-06])

We observe that the first TP example provides an effective absolute confidence that the input is a '2', whereas the first FP example, although strongly predicting the input is a 2 (conditioned on the input features, the softmax adjusted probability is that .84215 that this input should be a 2), we nonetheless see at least one other alternative that has roughly the same order of magnitude, the '4' digit.

We can expand this analysis out further, using the flattened certainties.  Since these are reduced from a 10x10 matrices to a vector of length 45, let's check the first tp_2 certainty and the first fp_2 certainty

In [37]:
tp_2_certainties[0]

tensor([ 4.7913e-09, -1.0000e+00, -4.8236e-09,  7.5188e-09,  7.1731e-09,
         5.7701e-09,  6.6979e-09,  1.2442e-09, -8.3871e-09, -1.0000e+00,
        -9.6150e-09,  2.7274e-09,  2.3818e-09,  9.7873e-10,  1.9066e-09,
        -3.5472e-09, -1.3178e-08,  1.0000e+00,  1.0000e+00,  1.0000e+00,
         1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,  1.2342e-08,
         1.1997e-08,  1.0594e-08,  1.1522e-08,  6.0678e-09, -3.5635e-09,
        -3.4561e-10, -1.7487e-09, -8.2084e-10, -6.2746e-09, -1.5906e-08,
        -1.4031e-09, -4.7523e-10, -5.9290e-09, -1.5560e-08,  9.2785e-10,
        -4.5259e-09, -1.4157e-08, -5.4538e-09, -1.5085e-08, -9.6312e-09])

In [38]:
fp_2_certainties[0]

tensor([-2.0416e-04, -8.4215e-01, -6.5502e-07, -1.5226e-01, -1.7899e-05,
        -5.2736e-03, -1.1299e-06, -8.4781e-05, -8.2363e-07, -8.4195e-01,
         2.0351e-04, -1.5206e-01,  1.8627e-04, -5.0694e-03,  2.0303e-04,
         1.1938e-04,  2.0334e-04,  8.4215e-01,  6.8989e-01,  8.4213e-01,
         8.3688e-01,  8.4215e-01,  8.4207e-01,  8.4215e-01, -1.5226e-01,
        -1.7244e-05, -5.2729e-03, -4.7484e-07, -8.4126e-05, -1.6860e-07,
         1.5224e-01,  1.4699e-01,  1.5226e-01,  1.5218e-01,  1.5226e-01,
        -5.2557e-03,  1.6769e-05, -6.6882e-05,  1.7075e-05,  5.2724e-03,
         5.1888e-03,  5.2727e-03, -8.3651e-05,  3.0623e-07,  8.3958e-05])

We get some sense immediately that the true positive has more entries whose absolute value is near 1, indicating near absolute certainty of the given prediction, whereas the FP certainty has no clear indication of that being the case. 

Setting up our motivation for the omicron score, let's do a quick comparison between the first pair of TP certainties, FP certainties and pair of TP against FP certainties

In [39]:
for i in range(3):
    print("The norm of the {} and {} TP is {}".format(i,i+1, torch.norm(tp_2_certainties[i]-tp_2_certainties[i+1])))
    print("The norm of the {} and {} FP is {}".format(i,i+1, torch.norm(fp_2_certainties[i]-fp_2_certainties[i+1])))
    print("The norm of the {} TP against the {} FP is {}".format(i,i, torch.norm(tp_2_certainties[i]-fp_2_certainties[i])))

The norm of the 0 and 1 TP is 5.3419153545064546e-08
The norm of the 0 and 1 FP is 0.9168991446495056
The norm of the 0 TP against the 0 FP is 0.6937397718429565
The norm of the 1 and 2 TP is 0.0004051334981340915
The norm of the 1 and 2 FP is 0.36251553893089294
The norm of the 1 TP against the 1 FP is 1.0553745031356812
The norm of the 2 and 3 TP is 0.00932549312710762
The norm of the 2 and 3 FP is 0.7669994235038757
The norm of the 2 TP against the 2 FP is 1.4160065650939941


From a relatively small sample, we see that the certainties of the pairs of TPs are relatively close to one another in a large dimensional space, whereas the FP pairs are are several of magnitudes farther apart than their TP counterparts. 

Finally, the pairs of TP and FP certainties are similarly spaced farther apart than the TP pairs, and several of the FP pairs.

We may start to anticipate that 'average' distance between the two predictive statuses should be detectable. Further, by the central limit theorem, the 'averages' of these norms ought to be normally distributed, so that one can distinguish distributions of high-dimensional data with respect to a univariate distribution. This is, in effect, the entire motivation for the calculation and application of the omicron statistic.

In [40]:
omicrons = alloutputs['omicrons'].omicrons

Now that we've loaded in the omicrons, let's compute the distance between the two subsamples of  the certainties for the TP and FP examples:

In [41]:
tp_fp_2_omicrons = CandC.oodd.omicrons.omicron_fn(omicrons[2]['TP']['certainty_sample'],omicrons[2]['FP']['certainty_sample'])

Now let's look at the mean and standard deviation of each respective omicron sample

In [42]:
print("\tMean\t| Var\n TP:\t{}\t|\t {}\t\n FP:\t {}\t|\t {} \n TPvFP:\t{} \t|\t{}\n".format(omicrons[2]['TP']['omicrons'].mean(),
                 omicrons[2]['TP']['omicrons'].std()**2,
                 omicrons[2]['FP']['omicrons'].mean(),
                 omicrons[2]['FP']['omicrons'].std()**2,
                 tp_fp_2_omicrons.mean(),
                 tp_fp_2_omicrons.std()**2))

	Mean	| Var
 TP:	0.03432707488536835	|	 0.01810862496495247	
 FP:	 1.1486209630966187	|	 0.09058060497045517 
 TPvFP:	1.438718557357788 	|	0.0001247012405656278



Examination of the omicron scores for the '2' digit provides the following insights:

1. Within the TP predictions, the average distance from the certainties is quite low, although there is significant variance given how close the mean is to 0;
2. Within the FP predictions, the average distance from other TP certainties is quite high, indicating both that the model is not consistently replicating the errors that lead to a False Prediction, and does so because it considers multiple distinct alternatives, in contrast to the TP cases where there is little variation in the plausible alternatives; 
3. The average distance of the TP predictions from the FP predictions is even higher, with an extremely low variance relative to this distance. This lends credibility to the idea that we may use the omicron scores to distinguish a TP from a FP, or an out of distribution example, relative to the use of known in-distribution TPs and an unknown input, indicating that although the observed FPs are far apart from one another, they are further from the TPs, even if predicted to belong to the same label as the TPs.

Before proceeding with the out of distribution examples, let's look at the omicrons across all categories

In [43]:
for cat in sorted(list(omicrons.keys())):
    tp_fp_omicrons = CandC.oodd.omicrons.omicron_fn(omicrons[cat]['TP']['certainty_sample'],omicrons[cat]['FP']['certainty_sample'])
    print("{}\tMean\t| Var\n TP:\t{}\t|\t {}\t\n FP:\t {}\t|\t {} \n TPvFP:\t{} \t|\t{}\n".format(cat,
                                                                                                  omicrons[cat]['TP']['omicrons'].mean(),
                 omicrons[cat]['TP']['omicrons'].std()**2,
                 omicrons[cat]['FP']['omicrons'].mean(),
                 omicrons[cat]['FP']['omicrons'].std()**2,
                 tp_fp_omicrons.mean(),
                 tp_fp_omicrons.std()**2))

0	Mean	| Var
 TP:	0.03686104342341423	|	 0.017980149015784264	
 FP:	 1.0735294818878174	|	 0.029328159987926483 
 TPvFP:	2.0284767150878906 	|	0.0004666907771024853

1	Mean	| Var
 TP:	0.05027231201529503	|	 0.02661757729947567	
 FP:	 1.021998643875122	|	 0.0815143957734108 
 TPvFP:	0.9597761034965515 	|	0.00021724987891502678

2	Mean	| Var
 TP:	0.03432707488536835	|	 0.01810862496495247	
 FP:	 1.1486209630966187	|	 0.09058060497045517 
 TPvFP:	1.438718557357788 	|	0.0001247012405656278

3	Mean	| Var
 TP:	0.012892150320112705	|	 0.005865542218089104	
 FP:	 1.1047720909118652	|	 0.06275364756584167 
 TPvFP:	0.926129162311554 	|	5.230105671216734e-06

4	Mean	| Var
 TP:	0.03816358000040054	|	 0.015980929136276245	
 FP:	 0.9698021411895752	|	 0.09351544082164764 
 TPvFP:	0.7662228941917419 	|	0.0001458017504774034

5	Mean	| Var
 TP:	0.021444860845804214	|	 0.007656562607735395	
 FP:	 1.1257102489471436	|	 0.021330269053578377 
 TPvFP:	1.7247533798217773 	|	8.657295984448865e-05

6	Mean	| Va

Finally, we do not look at omicrons 'globally'. These statistics will be less reliable because the certainties for each predicted category are necessarily concentrated around the values predicted by the category, and so there will be a natural distance betweeen all true positive examples that will necessarily skew the sample.

## Examination of EMNIST data

So far we have looked at the model we have trained and its performance on data which we know is a priori drawn from the training and validation distribution. Further, we have seen that this model performs extremely well on the in-distribution data. What we want now are to see how the model performs on data that is a priori out-of-distribution due to belonging to different labels/unknown labels, with features that are sufficiently different from the training features. 

Specifically, these will be the 'letters' or 'alphabetical' characters in the EMNIST data set.

In [44]:
emnist_model_data_params=dict({
    'name': 'emnist_example',
    'input_dataloader':emnist_loader,
    'labeled_data':True,
})

emnist_model_data = CandC.model_uq.data.Model_Data(model=model,data=emnist_model_data_params)


Initializing with provided data and model.
Filling in data labels from dataloader.


In [45]:
emnist_model_data.save(address=os.path.join(os.getcwd(),'data'),filename="emnist_data")

With the emnist_model_data saved, we first show how to get the certainties, certainty_score, and predictions independently of a Certainties object:

In [46]:
emnist_certainties,emnist_certainty_scores, emnist_predictions = CandC.candc.get_certainty(emnist_model_data.output)

In [47]:
emnist_certainty_scores.histogram()

torch.return_types.histogram(
hist=tensor([ 137.,  129.,  136.,  128.,  133.,  133.,  139.,  125.,  106.,  133.,
         127.,  126.,  129.,  108.,  129.,  119.,  133.,  101.,  121.,  120.,
         111.,  115.,  101.,  112.,  106.,  124.,  108.,  106.,  110.,  129.,
         104.,  129.,  117.,  104.,  120.,  125.,  103.,  109.,  117.,  122.,
         130.,  117.,  125.,  111.,  107.,  123.,  120.,  138.,  120.,  123.,
         108.,  110.,  127.,  100.,  126.,  114.,  140.,  127.,  121.,  103.,
         129.,  142.,  119.,  131.,  130.,  119.,  147.,  118.,  134.,  125.,
         129.,  130.,  162.,  143.,  165.,  148.,  138.,  147.,  167.,  212.,
         173.,  174.,  173.,  170.,  178.,  206.,  232.,  214.,  233.,  246.,
         280.,  312.,  330.,  354.,  366.,  420.,  541.,  706., 1060., 4423.]),
bin_edges=tensor([2.8133e-05, 1.0028e-02, 2.0028e-02, 3.0027e-02, 4.0027e-02, 5.0027e-02,
        6.0026e-02, 7.0026e-02, 8.0026e-02, 9.0026e-02, 1.0003e-01, 1.1003e-01,
        1.200

A preliminary look at the histogram data shows that there is some concentration towards the upper bounds of the certainty score, but otherwise the values are diffused across 0 to 1. Let's look at the MNIST certainty score histogram in contrast

In [48]:
torch.Tensor(alloutputs['certainty_dist'].data.certainty_score).histogram()

torch.return_types.histogram(
hist=tensor([3.0000e+00, 3.0000e+00, 4.0000e+00, 3.0000e+00, 3.0000e+00, 4.0000e+00,
        4.0000e+00, 2.0000e+00, 4.0000e+00, 2.0000e+00, 2.0000e+00, 2.0000e+00,
        3.0000e+00, 1.0000e+00, 0.0000e+00, 5.0000e+00, 1.0000e+00, 2.0000e+00,
        2.0000e+00, 0.0000e+00, 2.0000e+00, 3.0000e+00, 6.0000e+00, 2.0000e+00,
        1.0000e+00, 2.0000e+00, 3.0000e+00, 5.0000e+00, 5.0000e+00, 1.0000e+00,
        0.0000e+00, 3.0000e+00, 3.0000e+00, 2.0000e+00, 3.0000e+00, 0.0000e+00,
        1.0000e+00, 2.0000e+00, 3.0000e+00, 4.0000e+00, 1.0000e+00, 0.0000e+00,
        1.0000e+00, 2.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 2.0000e+00,
        2.0000e+00, 2.0000e+00, 2.0000e+00, 2.0000e+00, 2.0000e+00, 4.0000e+00,
        3.0000e+00, 4.0000e+00, 3.0000e+00, 4.0000e+00, 3.0000e+00, 1.0000e+00,
        4.0000e+00, 3.0000e+00, 4.0000e+00, 4.0000e+00, 3.0000e+00, 3.0000e+00,
        1.0000e+00, 0.0000e+00, 1.2000e+01, 1.0000e+01, 4.0000e+00, 1.0000e+01,
     

Despite being half the size of the EMNIST data, we find that the MNIST data is substantially clustered within the right-most certainty score bin (>99%). In fact, over 91% of the MNIST data is highly certain, whereas only approximately 22% of the out-of-distribution EMNIST data is similarly (mis)classified with high certainty.



Let's look in detail at the EMNIST data which has the highest certainty scores.

In [49]:
labels,counts=emnist_model_data.input_data_labeled[(emnist_certainty_scores>9.9e-01).flatten()].unique(return_counts=True)

In [50]:
print("The {} letter is the one that is (mis)classified {} times with the highest certainty".format(counts.argmax().item(),counts.max().item()))

The 22 letter is the one that is (mis)classified 714 times with the highest certainty


We see that the most confidently assigned out-of-distribution letter is 'v'

In [51]:
llabels,lcounts=emnist_model_data.input_data_labeled[(emnist_certainty_scores<1e-02).flatten()].unique(return_counts=True)

In [52]:
print("The {} letter is the one that is (mis)classified {} times with the lowest certainty".format(lcounts.argmax().item(),lcounts.max().item()))

The 12 letter is the one that is (mis)classified 11 times with the lowest certainty


On the otherhand, the letter 'l' is (mis)classified with certainty below 0.01 most frequently.

Now let's proceed to first us the above model_uq's associated internal tests. In order to do this, we'll need to form the 'external_data', in this case, the Certainties for the emnist_model_data 

In [53]:
emnist_certainties = CandC.model_uq.data.Certainties()
emnist_certainties.gather_certainties(output_data=emnist_model_data)

Now gathering certainties
Original certainty shape is torch.Size([20800, 10, 10]) from predictions shape torch.Size([20800, 10])
Finished gathering certainties.


In [54]:
oodd_params=dict({'external_data_name':'emnist_data',
                 'internal_certainties':alloutputs['certainties'],
                 'internal_omicrons':alloutputs['omicrons'],
                 'external_data':emnist_certainties,
                 'scores':alloutputs['scores']})

model_uq.run_oodd_tests_internal(**oodd_params)
                                 

Loading in the external data
Starting Omicron tests
There are 30800 total items.
 There are 10000 in-distribution items.
 There are 20800 out-of-distribution/FP items
The accuracy of the log omicron model after fitting is 0.7636038961038961
INTERNAL OMICRON TEST
----------------------------
The scores for the omicron test are tensor([0.0034, 0.0034, 0.0034,  ..., 0.9483, 0.9505, 0.9533])
 The labels are tensor([0, 0, 0,  ..., 0, 0, 0])
The tpr tensor is tensor([0.0000e+00, 4.7858e-05, 9.5717e-05,  ..., 9.9990e-01, 9.9995e-01,
        1.0000e+00])
INTERNAL ONLY: With cut-off at 0.95, 
 fpr tensor([0., 0., 0.,  ..., 1., 1., 1.]) 
 tpr tensor([0.0000e+00, 4.7858e-05, 9.5717e-05,  ..., 9.9990e-01, 9.9995e-01,
        1.0000e+00]), the corresponding idx is 24820, 
with fpr95tpr 1.0
Omicron Test Results 
{'AUROC': tensor(0.8150), 'AUPR-IN': tensor(0.9214), 'AUPR-OUT': tensor(0.5276), 'FPR95TPR': tensor(1.)}
EXTERNAL OMICRON TEST
----------------------------
The scores for the omicron test ar

We see that running the 'internal' OODD tests produces results for two types of Omicron test models. The first uses the internal omicron test that is calibrated to distinguish TP from FP values, whereas the second test, the External test, is calibrated to distinguish in-distribution (MNIST) data from out-of-distribution, or novel sample data (here labeled from the EMNIST dataset). Importantly, the accuracy of the log omicron model that is reported above describes the External model's accuracy.  To examine the accuracy of the Internal model, let's run the following commands:

In [55]:
with open(os.path.join(os.getcwd(),'tutorial','model_uq','emnist_data_oodd_test_results'),'rb') as handle:
    oodd_test_results = pickle.load(handle)

In [56]:
emnist_omicron_internal_test_scores=oodd_test_results['Omicron Test Results']['internal test scores']

As the omicron test is an oodd test, FP and OOD labels are valued at 1 and in-distribution or TPs are labeled as 0 when training the logistic model. Further, the outputs are determined straightforwardly by projecting down the output probability vector onto the first probability component. For this reason, we can, if we use a .5 threshold, identify the accuracy of the internal model by the following command

In [57]:
(emnist_omicron_internal_test_scores<.5).float().mean().item()

0.90064936876297

That is, with respect to the EMNIST data, a little over 90% of the EMNIST data using the Internal Omicron test, can be identified relative to the in-predicted category, in-distribution TP MNIST data, as being a FP. This is determined exactly as 90.06% of the EMNIST sample is seen to have a pseudo-probability for the 'TP' label under 50%, whence the pseudo-probability for FP is at least 50%. Let's look at the external model in contrast:

In [58]:
print("Only {} of the out-of-distribution data was identified\n Only {} of the in-distribution data was identified".format((oodd_test_results['Omicron Test Results']['external test scores'][:-10000]<.5).float().mean().item(),
      (oodd_test_results['Omicron Test Results']['external test scores'][-10000:]<.5).float().mean().item()))

Only 0.2358173131942749 of the out-of-distribution data was identified
 Only 0.7623999714851379 of the in-distribution data was identified


In contrast with the Internal Test, we find that the external test does rather poorly with the default .5 cutoff.

The intuitive reason for this is that we find the EMNIST data is similar enough to the omicrons of the FPs relative to the omicrons of the TPs. When we collapse the entire MNIST sample to the 0 label, the inclusion of the FP certainties in the sample we use when computing the omicrons brings the out of distribution data 'closer' on average to the MNIST data.

For this reason, we should generally prefer to use the Internal Omicron Test.