In [None]:
import torch
import torchvision
from google.colab import drive
drive.mount('/content/drive')
from PIL import Image
import torchvision.datasets as datasets
import torchvision.transforms.functional as TF
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

Mounted at /content/drive


In [None]:
import numpy

def binarize_image(image_tensor):
    """Threshold a tensor to the values 0 and 1, in place.

    Entries strictly greater than 0.5 become 1 and entries less than or
    equal to 0.5 become 0. No copy is made: the argument itself is modified
    and that same tensor object is returned.
    """
    # First pass: everything above the threshold saturates to 1.
    image_tensor.masked_fill_(image_tensor > 0.5, 1)
    # Second pass: what is still <= 0.5 drops to 0; the 1s written above
    # are not touched because they no longer satisfy the condition.
    image_tensor.masked_fill_(image_tensor <= 0.5, 0)
    return image_tensor

def get_labels_probabilities(labels_occurences, num_classes=None):
  """Estimate the prior probability of each label as its relative frequency.

  Args:
    labels_occurences: 1-D tensor holding one label per observation.
    num_classes: optional total number of classes. When given, labels are
      treated as integer class ids starting at 0 and the result has at
      least ``num_classes`` entries, with 0 for classes that never occur,
      so entry ``k`` always refers to class ``k``. When omitted, only the
      labels actually present are reported, in ascending label order, so
      a missing class shifts the position of every later class.

  Returns:
    1-D float tensor with the relative frequency of each label.
  """
  total_observations = labels_occurences.shape[0]
  if num_classes is None:
    # torch.unique returns the distinct labels sorted, so the counts follow
    # ascending label order.
    _, counts = labels_occurences.unique(return_counts=True)
  else:
    # bincount only accepts integer input; labels stored as floats are cast.
    counts = torch.bincount(labels_occurences.long(), minlength=num_classes)
  return torch.div(counts, total_observations)

def generate_probabilities_matrix(train_data_tensor, labels_training, labels_probabilities, num_classes, smoothing=0.0):
  """Estimate P(pixel = 1 | class) for every class and every pixel.

  Args:
    train_data_tensor: 2-D tensor indexed as [pixel, observation] (one
      observation per column). Each row is averaged per class, so the
      entries are expected to be 0/1 values.
    labels_training: 1-D tensor with the label of each column.
    labels_probabilities: not used by this function; kept in the signature
      so existing callers keep working.
    num_classes: number of classes; classes are the integers
      0 .. num_classes - 1.
    smoothing: additive pseudo-count. The estimate for class k is
      (count_of_ones + smoothing) / (n_k + 2 * smoothing). The default 0.0
      gives the plain relative frequency; a positive value (1.0 is Laplace
      smoothing) keeps every estimate strictly between 0 and 1, which
      avoids -inf once the probabilities are passed through a logarithm.

  Returns:
    Float tensor of shape (num_classes, num_pixels). The row of a class
    that has no observations is left at 0 when ``smoothing`` is 0.
  """
  num_pixels = train_data_tensor.shape[0]
  complete_matrix = torch.zeros(num_classes, num_pixels)

  for k in range(num_classes):
    # Columns (observations) whose label is k.
    k_type_observations = train_data_tensor[:, labels_training == k]
    denominator = k_type_observations.shape[1] + 2 * smoothing
    if denominator == 0:
      # No observations and no pseudo-counts: 0 / 0 would fill the row with
      # NaN, so keep the zeros it was initialised with.
      continue
    ones_per_pixel = torch.sum(k_type_observations, 1)
    complete_matrix[k] = (ones_per_pixel + smoothing) / denominator
  return complete_matrix

def test_model(input_torch, p_m_pix_val_given_k, p_t_tensor, num_classes = 10):
    #assumes that the input comes in a row
    probs = torch.zeros(num_classes, input_torch.shape[0])
    idxsOnes = torch.nonzero(input_torch)
    idxsZeros = (input_torch == 0).nonzero()
    probs[:, idxsZeros] = torch.log(p_m_pix_val_given_k[0][:, idxsZeros])
    probs[:, idxsOnes] = torch.log(p_m_pix_val_given_k[1][:, idxsOnes])
    probs = probs + torch.log(p_t_tensor.view(num_classes, 1))
    scores_classes = torch.sum(probs, 1)
    return (torch.argmax(scores_classes).item(), scores_classes)
    

def test_model_batch(test_set, labels, p_m_pix_val_given_k, p_t_tensor, num_classes = 10):
    """Return the fraction of observations whose predicted label is correct.

    Args:
        test_set: 2-D tensor indexed as [pixel, observation] (one
            observation per column).
        labels: expected label of each column of ``test_set``.
        p_m_pix_val_given_k: conditional pixel probabilities, forwarded
            unchanged to ``test_model``.
        p_t_tensor: class priors, forwarded unchanged to ``test_model``.
        num_classes: number of classes, forwarded to ``test_model``.
            Defaults to 10, the value that used to be hard-coded here.

    Returns:
        Accuracy as a float in [0, 1].

    Raises:
        ZeroDivisionError: if ``test_set`` has no columns.
    """
    num_observations = test_set.shape[1]
    right_predictions = 0
    for image in range(num_observations):
        # Only the predicted label matters here; the per-class scores are ignored.
        predicted_label, _ = test_model(test_set[:, image], p_m_pix_val_given_k, p_t_tensor, num_classes)
        if predicted_label == labels[image]:
            right_predictions += 1
    return right_predictions / num_observations


def train_model(train_data_tensor, labels_training, num_classes = 10):
    """Fit the naive Bayes parameters from the training data.

    Args:
        train_data_tensor: training observations, passed through to
            ``generate_probabilities_matrix`` together with the labels.
        labels_training: label of each training observation.
        num_classes: number of classes.

    Returns:
        Tuple (p_m_pix_val_given_k, labels_probabilities) where
        p_m_pix_val_given_k is the list [P(pixel = 0 | k), P(pixel = 1 | k)]
        and labels_probabilities holds the class priors.
    """
    priors = get_labels_probabilities(labels_training)
    p_on_given_k = generate_probabilities_matrix(train_data_tensor, labels_training, priors, num_classes)
    # A pixel is either on or off, so P(off | k) is the complement of P(on | k).
    p_off_given_k = 1 - p_on_given_k
    return ([p_off_given_k, p_on_given_k], priors)



def load_dataset(path = "/content/drive/MyDrive/Colab Notebooks/mnist_dataset/train"):
    """Load every image under ``path`` as one column of a binarized matrix.

    The folder is listed with ``torchvision.datasets.ImageFolder``; each
    listed (file path, label) pair is opened, converted to a tensor,
    binarized with ``binarize_image`` and flattened into a column.

    Args:
        path: root folder of the dataset, in the layout ImageFolder expects.

    Returns:
        Tuple (train_data_tensor, labels): the matrix with one image per
        column (``None`` if no image was listed) and a 1-D tensor with the
        label of each column.
    """
    # Open up the dataset
    dataset = torchvision.datasets.ImageFolder(path)
    columns = []
    labels_training = []

    for image_path, label in dataset.imgs:
        # The context manager closes the file handle as soon as the pixels
        # have been copied into the tensor.
        with Image.open(image_path) as image:
            x_tensor = TF.to_tensor(image).squeeze()
        x_tensor_bin = binarize_image(x_tensor)

        # Flatten the first two dimensions into a single column vector.
        columns.append(x_tensor_bin.view(x_tensor_bin.shape[0] * x_tensor_bin.shape[1], -1))
        labels_training.append(label)

    # Concatenate once at the end: growing the matrix inside the loop copies
    # everything accumulated so far on every iteration.
    train_data_tensor = torch.cat(columns, 1) if columns else None
    return (train_data_tensor, torch.tensor(labels_training))

    
# Load every training image as one column of the data matrix, with its label.
train_data_tensor, labels_tensor = load_dataset("/content/drive/MyDrive/Colab Notebooks/mnist_dataset/train")

# Fit the model: conditional pixel probabilities and class priors.
p_m_pix_val_given_k, p_t_tensor = train_model(train_data_tensor, labels_tensor)

# Predict the label of a single observation: observation #500.
predicted_label, scores_classes = test_model(train_data_tensor[:, 500], p_m_pix_val_given_k, p_t_tensor, 10)

# Accuracy measured on the same data the model was fitted on (training accuracy).
accuracy = test_model_batch(train_data_tensor, labels_tensor, p_m_pix_val_given_k, p_t_tensor)

print("The accuracy is", accuracy)

The accuracy is 0.9166666666666666


## Aciertos para 10 corridas
Particione los datos de forma aleatoria con 70 % de las observaciones para entrenamiento y 30 % para prueba (a partir de la carpeta train). Calcule la tasa de aciertos para 10 corridas, cada una con una partición de entrenamiento y otra de prueba distintas.

In [None]:
# train_test_split partitions along the first axis, so put one observation per row.
X = train_data_tensor.transpose(0,1)

# Ten independent 70 % / 30 % train/test partitions.
# For reproducibility each run seeds the split with its own index: the
# partitions differ from run to run but are identical every time the
# notebook is re-executed.
partitions_acc = []
for i in range(0, 10):
  X_train, X_test, y_train, y_test = train_test_split(X, labels_tensor, test_size=0.3, random_state=i)
  # The model functions expect one observation per column, hence the transposes.
  p_m_pix_val_given_k, p_t_tensor = train_model(X_train.transpose(0,1), y_train)
  accuracy = test_model_batch(X_test.transpose(0,1), y_test, p_m_pix_val_given_k, p_t_tensor)
  print('epoch :%s, accuracy: %s'%(i, accuracy))

  partitions_acc.append(accuracy)

# Summary statistics over the ten runs.
partitions_acc = np.array(partitions_acc)
print('')
print('partitions stats')
print('accuracy mean: %s'%(partitions_acc.mean()))
print('accuracy std: %s'%(partitions_acc.std()))

epoch :0, accuracy: 0.5388888888888889
epoch :1, accuracy: 0.45
epoch :2, accuracy: 0.5111111111111111
epoch :3, accuracy: 0.5055555555555555
epoch :4, accuracy: 0.46111111111111114
epoch :5, accuracy: 0.4166666666666667
epoch :6, accuracy: 0.5722222222222222
epoch :7, accuracy: 0.5166666666666667
epoch :8, accuracy: 0.4444444444444444
epoch :9, accuracy: 0.4166666666666667

partitions stats
accuracy mean: 0.4833333333333334
accuracy std: 0.0503690086961917


# Efecto del desbalanceo de los datos

In [None]:
# One observation per row, so a whole observation can be picked by row index.
X = train_data_tensor.transpose(0,1)

# The numbers of observations per class, for this scenario.
total_num_observations_per_class_train = [22, 22, 22, 22, 22, 42, 42, 42, 42, 42]
total_num_observations_per_class_test = [18, 18, 18, 18, 18, 18, 18, 18, 18, 18]

# Dedicated, seeded generator: the ten shuffles differ from each other but are
# the same on every execution of the notebook, and the global RNG is untouched.
shuffle_generator = torch.Generator().manual_seed(0)

partitions_acc = []
for i in range(0, 10):
  # Shuffle the data before getting the observations
  indexes = torch.randperm(labels_tensor.shape[0], generator=shuffle_generator)
  shuffle_X, shuffle_labels_tensor = X[indexes], labels_tensor[indexes]

  num_observations_per_class_train = [0] * len(total_num_observations_per_class_train)
  num_observations_per_class_test = [0] * len(total_num_observations_per_class_test)
  train_indexes, test_indexes = [], []

  # Create the training and test sets: walking the shuffled data, each
  # observation fills the training quota of its class first, then the test
  # quota; observations beyond both quotas are left out.
  for j, label in enumerate(shuffle_labels_tensor.tolist()):
    if num_observations_per_class_train[label] < total_num_observations_per_class_train[label]:
      train_indexes.append(j)
      num_observations_per_class_train[label] += 1
    elif num_observations_per_class_test[label] < total_num_observations_per_class_test[label]:
      test_indexes.append(j)
      num_observations_per_class_test[label] += 1

  # Gather each partition with a single indexing operation instead of growing
  # the tensors row by row; the labels keep their original integer dtype.
  X_train, y_train = shuffle_X[train_indexes], shuffle_labels_tensor[train_indexes]
  X_test, y_test = shuffle_X[test_indexes], shuffle_labels_tensor[test_indexes]

  # Train the model with the partition
  p_m_pix_val_given_k, p_t_tensor = train_model(X_train.transpose(0,1), y_train)
  # Get the accuracy of the model
  accuracy = test_model_batch(X_test.transpose(0,1), y_test, p_m_pix_val_given_k, p_t_tensor)
  print('epoch :%s, accuracy: %s'%(i, accuracy))

  partitions_acc.append(accuracy)

# Summary statistics over the ten runs.
partitions_acc = np.array(partitions_acc)
print('')
print('partitions stats')
print('accuracy mean: %s'%(partitions_acc.mean()))
print('accuracy std: %s'%(partitions_acc.std()))



epoch :0, accuracy: 0.4777777777777778
epoch :1, accuracy: 0.43333333333333335
epoch :2, accuracy: 0.38333333333333336
epoch :3, accuracy: 0.4444444444444444
epoch :4, accuracy: 0.4166666666666667
epoch :5, accuracy: 0.42777777777777776
epoch :6, accuracy: 0.4388888888888889
epoch :7, accuracy: 0.45
epoch :8, accuracy: 0.4888888888888889
epoch :9, accuracy: 0.42777777777777776

partitions stats
accuracy mean: 0.43888888888888894
accuracy std: 0.02832788618662658
