In [None]:
import numpy as np
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, r'C:\Users\sab\Downloads\AI Testing\Source\Dorefanet\tensorpack\FullPrecisionModels')
import save_restore_images
import visualize_data
from tensorpack.dataflow.dataset.mnist import Mnist
import norm_distances_l2_ba as lpnorm 
from DataSets.mnist import GetMnist
import pandas as pd
import scipy.stats as stats
from matplotlib import pyplot
import scipy.stats as stats
from DataSets.cifar import Cifar10, get_cifar10_data, getaugmenteddata_with_all_images, _parse_meta
import logging
import os

In [None]:
# Input of this notebook: folder and file name of the .npz written by the Boundary Attack run.
image_folder = r"C:\Users\sab\Downloads\AI Testing\Source\Dorefanet\tensorpack\FullPrecisionModels\logs\trained_images\BA\NEW CODE\MNIST\MODEL_B_REGULARIZED_ONLY_CH_32\15_ITT\mnist-16,16,32"
image_name = r"mnist_conv_adv_pre-16,16,32--run-2.npz"


# One logger (and one log file) per source .npz; the logger name is derived from folder + file stem.
ba_logger = logging.getLogger("conversion_logger_{}".format(image_folder + image_name.split(".")[0]))
ba_logger.setLevel(logging.INFO)

# logging.getLogger() hands back the same logger object for the same name, so re-running
# this cell used to attach one more FileHandler each time and every message was then
# written to the log file several times. Detach and close handlers left by a previous run.
for stale_handler in list(ba_logger.handlers):
    ba_logger.removeHandler(stale_handler)
    stale_handler.close()

# The log file lives next to the source .npz: conversion_log_<stem>.log
fh = logging.FileHandler(os.path.join(image_folder, "conversion_log_{}.log".format(image_name.split(".")[0])))
fh.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s %(message)s')
fh.setFormatter(formatter)
ba_logger.addHandler(fh)

# Log the actual path of the source file. Plain concatenation of folder and name dropped
# the path separator and produced a path that does not exist.
ba_logger.info("source file: {}".format(os.path.join(image_folder, image_name)))


# Filter all images that did not converge. 
## Basic algorithm

1. The basic idea here is that, since most of the images converged and only some were distorted beyond acceptance, we can assume that the data follows the "Normal Distribution", meaning that most of the data points lie near the mean value while the completely distorted ones are outliers and lie far from the mean value.

2. We checked for 10, 20 and 50 iterations for 100, 100 and 2000 data points and the data does follow the Normal Distribution. This means that the images that are completely distorted lie far away from the mean of the data.  

3. Thus, we first set a certain number of iterations and find adversarial samples as usual using __Boundary Attack Algorithm__ from the APP. This creates a .npz file containing the adversarial image, correct label and the image index (image index here is the index signifying image location (index) in the main db "MNIST" or "CIFAR" and not on the subsets).  

4. We then use the saved npz file to find l2 distance (because algorithm uses l2 norm for distance calculation) of each image with its adversarial counter-part (the perturbed image being generated from the Boundary Attack Algorithm for a certain amount of iterations).  

5. We then get an array of l2 distances (this array is in same sequence as the image_index).  

6. We then convert all the l2 distances to the standard normal distribution using the stats.zscore function provided by Python's scipy library. Here, _"conversion to standard normal distribution"_ means that we take the array of l2 distances and use the z-score formula to get the z-score of each l2 distance. The z-score, as we know, tells us how far a data point is from the mean in terms of standard deviations (basically, how many standard deviations fit into the difference between the data point and the mean).  

7. Since the data is stored sequentially, each l2 distance corresponds to the sequential image index  and each image index then corresponds to the corresponding image and labels.  

8. We then group each l2 distance with corresponding image_index and form a tuple of values.  

9. We then filter out (reject) all the image indexes that have a z-score of 1 or more (we take 1 because the data we are operating on is quite small (2000), and it is better to remove highly distorted but still recognizable images than to include images for which convergence failed). So we keep everything on the left side, where the z-score is less than 1. We keep everything on the left side because a z-score below 0 is even better: it means the l2 distance is smaller than the mean, i.e. there is less distortion.

10. This gives us a set of image_indices whose l2 distances are close to the mean value, meaning only images that are not very highly distorted.

11. We then use the image index to filter the images and labels.

12. We then save the image, labels and image index to a new npz file which is then used for analysis.   

## Check data distribution

In [None]:
# Load the .npz named in the configuration cell and unpack its three arrays.
npz_image = save_restore_images.save_or_load_image__npz("load", image_folder + "\\"+ image_name)
adv_images, labels, img_index = npz_image.images, npz_image.labels, npz_image.image_index

ba_logger.info("########################## check data distribution###########################")

# Array shapes: first on the console ...
print("Image Count: ", adv_images.shape)
print("Labels Count: ", labels.shape)
print("Index Count: ", img_index.shape)
# ... then mirrored into the conversion log.
ba_logger.info("Image Count: {}".format(adv_images.shape))
ba_logger.info("Labels Count: {} ".format(labels.shape))
ba_logger.info("Index Count: {}".format(img_index.shape))

# l2_f is reported below as the average distance, l2_dist is the per-image array fed to
# zscore; image_index is expected to come back identical to img_index (checked right after).
l2_f, l2_dist, image_index = lpnorm.get_lp_norm_distances(adv_images, img_index)
returned_index_msg = "Number of image_index returned from function:  {}".format(image_index.shape)
print(returned_index_msg)
ba_logger.info(returned_index_msg)

# Sanity check: prints True when the helper returned the indices unchanged.
print(np.array_equal(img_index, image_index))

# Average distance rounded to 4 decimals for reporting.
l2 = float("{0:.4f}".format(l2_f))
print("Average l2 distance of the data points:", l2)
ba_logger.info("Average l2 distance of the data points:  {}".format(l2))

# Standardise the distances and show the adversarial images.
z_scores = stats.zscore(l2_dist)
visualize_data.plot_images(adv_images)

# Debugging aid: enumerate z_scores / l2_dist and print "<position> : <value>" per entry
# to inspect individual data points.

# Histogram of the z-scores. pyplot.hist(l2_dist) gives the same picture, only with the
# x axis in raw distances instead of standard deviations.
n, bins, pack = pyplot.hist(z_scores)
print("n:", n)
print("bins:", bins)
ba_logger.info("n :  {}".format(n))
ba_logger.info("bins :  {}".format(bins))

print(image_index)

ba_logger.info("########################## data distribution check ended ###########################")

## Perform filtering of images

In [None]:
ba_logger.info("########################## Filtering images ###########################")

# Images whose l2-distance z-score is at or above this value are treated as
# "did not converge" and are dropped. Everything below it (including all negative
# z-scores, i.e. less distortion than average) is kept.
Z_SCORE_THRESHOLD = 1

npz_image = save_restore_images.save_or_load_image__npz("load", image_folder + "\\"+ image_name)

adv_images = npz_image.images
labels = npz_image.labels
img_index = npz_image.image_index

print("Image Count: ", adv_images.shape)
print("Labels Count: ", labels.shape)
print("Index Count: ", img_index.shape)
ba_logger.info("Image Count:  {} ".format(adv_images.shape))
ba_logger.info("Labels Count:  {} ".format(labels.shape))
ba_logger.info("Index Count:   {}".format(img_index.shape))

# Get an array containing the l2 distances. "get_lp_norm_distances" is the same as
# norm_distances.py but additionally returns the l2_distance array.
l2_f, l2_dist, image_index = lpnorm.get_lp_norm_distances(adv_images, img_index)

# Check how many image indices are returned and how many elements there are in l2_dist.
# BOTH SHOULD BE THE SAME, and image_index should equal img_index.
print("Number of image_index returned from function: ", image_index.shape)
print(np.array_equal(img_index, image_index))
l2 = float("{0:.4f}".format(l2_f))
print("Total number of l2 distances: ", len(l2_dist))
ba_logger.info("Total number of l2 distances: {}".format(len(l2_dist)))

# Average l2 distance of the data points
print("Average l2 distance of the data points:", l2)
ba_logger.info("Average l2 distance of the data points: {}".format(l2))

# Compute z scores
z_scores = stats.zscore(l2_dist)

# Pair every image index with its z-score, e.g. (983, -3.144): the adversarial image of
# the 983rd image in the dataset lies about 3 standard deviations below the mean l2
# distance (the mean is zero in the standard normal distribution).
index_zscore = list(zip(image_index, z_scores))

# Boolean mask of the entries to keep, and the row positions it selects.
keep_mask = z_scores < Z_SCORE_THRESHOLD
keep_positions = np.flatnonzero(keep_mask)

# z-scores that survive the filter (debugging aid)
filtered_zscore = z_scores[keep_mask]
print("Number of allowed z_scores: ", len(filtered_zscore))
ba_logger.info("Number of allowed z_scores:  {}".format(len(filtered_zscore)))

# Note: in a non-skewed standard normal distribution about 68% of the points lie within
# one standard deviation of the mean; we keep more than that because the whole left side
# of the curve is accepted. The initial npz must therefore contain more than 2000 data
# points if 2000 are needed after filtering.

# (image_index, z-score) tuples that survive the filter (debugging aid)
filtered_indices = [pair for pair in index_zscore if pair[1] < Z_SCORE_THRESHOLD]
print("Filtered z_scores and index count:", len(filtered_indices))
ba_logger.info("Filtered z_scores and index count:  {}".format(len(filtered_indices)))

# Image indices that survive the filter.
selected_image_index = image_index[keep_positions]
# Note: the filtered index count and the filtered z-scores count should be the same.
# This reports the length of selected_image_index itself; reporting
# len(filtered_indices) a second time made the comparison meaningless.
print("Filtered index count:", len(selected_image_index))
ba_logger.info("Filtered index count:  {}".format(len(selected_image_index)))

# Select images, labels and indices by row position. This replaces a per-index
# np.where(image_index == idx)[0] lookup, which was quadratic and always returned the
# first matching row, so a repeated index would have been paired with the wrong image.
img_valid_z_score_img = adv_images[keep_positions]      # filtered images
img_valid_z_score_label = labels[keep_positions]        # corresponding labels
img_valid_z_score_index = img_index[keep_positions]     # corresponding image indices

print("number of filtered images", img_valid_z_score_img.shape)
print("number of filtered labels", img_valid_z_score_label.shape)
print("number of indixes", img_valid_z_score_index.shape)

ba_logger.info("number of filtered images  {}".format(img_valid_z_score_img.shape))
ba_logger.info("number of filtered labels  {}".format(img_valid_z_score_label.shape))
ba_logger.info("number of indixes  {}".format(img_valid_z_score_index.shape))

# Cross-check: indices taken from the loaded file (img_index) must equal the indices
# taken from the helper's return value (image_index) for the kept rows.
print(len(img_valid_z_score_index))
print(len(selected_image_index))
print(np.array_equal(img_valid_z_score_index, selected_image_index))

ba_logger.info("is the computed index count matches with the index count added later:  {} ".format(np.array_equal(img_valid_z_score_index, selected_image_index)))

visualize_data.plot_images(img_valid_z_score_img)

ba_logger.info("########################## images filtered ###########################")

## Get the updated l2 distance after filtering

In [None]:
# Recompute the l2 statistics, this time on the filtered images only.
updated_l2_f, updated_l2_dist, updated_image_index = lpnorm.get_lp_norm_distances(
    img_valid_z_score_img,
    img_valid_z_score_index,
)
updated_l2 = float("{0:.4f}".format(updated_l2_f))

# How many distances came back
updated_distance_count = len(updated_l2_dist)
print("Total number of --updated-- l2 distances: ", updated_distance_count)
ba_logger.info("Total number of --updated-- l2 distances: {}".format(updated_distance_count))

# Average l2 distance after filtering (rounded to 4 decimals)
print("Average --updated after filter-- l2 distance of the data points:", updated_l2)
ba_logger.info("Average --updated after filter-- l2 distance of the data points: {}".format(updated_l2))

# Prints True when the helper returned the filtered indices unchanged.
print(np.array_equal(updated_image_index, img_valid_z_score_index))

In [None]:
# Check individual images for QC (MNIST): show one filtered adversarial image next to
# the clean test image it was generated from.
np.set_printoptions(threshold=sys.maxsize)

# Row of the filtered arrays to inspect.
QC_POSITION = 96
# Index stored at the same row of the *unfiltered* index array, for reference.
print(image_index[QC_POSITION])

Mnist_Data_Dir = r"C:\Users\sab\Downloads\AI Testing\_Tools\DataSets\MNIST\Data"
data_test = GetMnist('test', dir=Mnist_Data_Dir)

visualize_data.plot_image(img_valid_z_score_img[QC_POSITION])
print(img_valid_z_score_label[QC_POSITION])
print(img_valid_z_score_index[QC_POSITION])

# Look the clean image up through the dataset index saved with the adversarial image.
# This used to be a hand-copied constant (3412), which silently pointed at an unrelated
# image as soon as the input file or the filter result changed.
original_position = int(img_valid_z_score_index[QC_POSITION])
visualize_data.plot_image(data_test.images[original_position])

In [None]:
## Check individual indices for QC (CIFAR-10): show an unfiltered sample, a filtered
## sample, and the clean image the filtered sample was generated from.
np.set_printoptions(threshold=sys.maxsize)

ds = get_cifar10_data("test",dir= r"C:\Users\sab\Downloads\AI Testing\_Tools\DataSets\CIFAR10")
data_set = getaugmenteddata_with_all_images(ds)

# One sample straight from the unfiltered npz.
visualize_data.plot_image(npz_image.images[4])
print(npz_image.labels[4])
print(npz_image.image_index[4])

# Row of the filtered arrays to inspect.
QC_POSITION = 96
visualize_data.plot_image(img_valid_z_score_img[QC_POSITION])
print(img_valid_z_score_label[QC_POSITION])
print(img_valid_z_score_index[QC_POSITION])

# Look the clean image up through the dataset index saved with the adversarial image.
# This used to be a hand-copied constant (9407), which went stale whenever the input
# file or the filter result changed.
# NOTE(review): assumes data_set.images keeps the dataset's original ordering - confirm
# against getaugmenteddata_with_all_images.
original_position = int(img_valid_z_score_index[QC_POSITION])
visualize_data.plot_image(data_set.images[original_position])

## Save to a new file

In [None]:
ba_logger.info("########################## saving file ###########################")

# Output file: <source stem>__filtered.npz, written into the same folder as the source.
save_file_name = image_name.split(".")[0] + "__filtered.npz"

# Arrays produced by the filtering cell, passed under the keyword names the helper is called with.
filtered_payload = dict(
    image=img_valid_z_score_img,
    labels=img_valid_z_score_label,
    image_index=img_valid_z_score_index,
)
save_restore_images.save_or_load_image__npz("save", image_folder + "\\"+ save_file_name, **filtered_payload)

ba_logger.info("saved to file:  {}".format(save_file_name))
ba_logger.info("########################## file saved ###########################")

## TEST THE SAVED RESULTS

In [None]:
# Reload a saved "__filtered.npz" and spot-check one entry against the clean dataset.
# For MNIST use instead:
#   Mnist_Data_Dir = r"C:\Users\sab\Downloads\AI Testing\_Tools\DataSets\MNIST\Data"
#   data_test = GetMnist('test', dir=Mnist_Data_Dir)

ds = get_cifar10_data("test",dir= r"C:\Users\sab\Downloads\AI Testing\_Tools\DataSets\CIFAR10")
data_set = getaugmenteddata_with_all_images(ds)

# File under test (stated once instead of repeating the literal for every load).
FILTERED_NPZ_PATH = r"C:\Users\sab\Downloads\AI Testing\Source\Dorefanet\tensorpack\FullPrecisionModels\logs\trained_images\BA\NEW CODE\CIFAR\RESNET_5\12_ITT_3000_SAMPLES\cifar10-16,16,32\mnist_conv_adv_pre-16,16,32--run-2__filtered.npz"
# Row of the saved arrays to inspect.
SAVED_QC_POSITION = 281

f_npz_image = save_restore_images.save_or_load_image__npz("load", FILTERED_NPZ_PATH)
print(f_npz_image.images.shape)
print(f_npz_image.labels.shape)
print(f_npz_image.image_index.shape)
f_images = f_npz_image.images
f_labels = f_npz_image.labels
f_image_index = f_npz_image.image_index

# Second, independent load of the same file; point it at another path to compare two files.
f_npz_image1 = save_restore_images.save_or_load_image__npz("load", FILTERED_NPZ_PATH)

f_images1 = f_npz_image1.images
f_labels1 = f_npz_image1.labels
f_image_index1 = f_npz_image1.image_index

visualize_data.plot_image(f_images1[SAVED_QC_POSITION])

print(f_labels1[SAVED_QC_POSITION])
print(f_image_index1[SAVED_QC_POSITION])

# Optional in-memory round-trip checks (only valid right after running the save cell
# on the same file):
#print(np.array_equal(f_labels, img_valid_z_score_label))
#print(np.array_equal(f_images, img_valid_z_score_img))
#print(np.array_equal(f_image_index, selected_image_index))

# Look the clean image and label up through the dataset index stored in the file.
# This used to be a hand-copied constant (8334), which went stale whenever the file or
# the inspected row changed.
# NOTE(review): assumes data_set keeps the dataset's original ordering - confirm
# against getaugmenteddata_with_all_images.
original_position = int(f_image_index1[SAVED_QC_POSITION])
visualize_data.plot_image(data_set.images[original_position])
print(data_set.labels[original_position])
#visualize_data.plot_images(f_images)

In [None]:
# Scratch cell: toy histogram plus a check of indexing a NumPy array with a list of positions.
# NOTE(review): this rebinds `l2`, which the analysis cells above use for the rounded
# average l2 distance - re-run those cells before relying on `l2` again.
from matplotlib import pyplot
l2 = [1,2,2,2,2,2,2,2,2,2,2,2,2,3,4]
l3 = np.array([2,4])  # not referenced again in the visible notebook
pyplot.hist(l2)
print(l2)
l2 = np.array(l2)  # convert to an array so a list of positions can be used as an index
ind = [0,1]
print(l2[ind])  # elements at positions 0 and 1


In [None]:
def test (inp, i1="1",i2=2):
    print(inp,i1,i2)

test("input", i2=2)

In [None]:
# Environment check: print the version of the installed `art` package.
# NOTE(review): `art` is not used anywhere else in the visible notebook, and the import
# sits outside the import cell at the top.
import art

print(art.__version__)