In dataPreprocessingNOTNIST.ipynb data preprocessing is done
in this notebook we start with normalization and model creation 

### Importing Module

In [1]:
import os
import pickle
from typing import List, Dict, Tuple
import numpy as np
import hashlib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import tensorflow as tf
import matplotlib.pyplot as plt

In [2]:
### set constants
NUM_CLASSES = 10 # total number of classes
IMAGE_SIZE = 28  # Pixel width and height.
PIXEL_DEPTH = 255.0  # Number of levels per pixel
SEED = 42 # for permutation in data shuffuling
TRAIN_SIZE_PER_LABEL = 30000 # take TRAIN_SIZE_PER_LABEL data for training from each sample
VALIDATION_SIZE_PER_LABEL = 20000 # take VALIDATION_SIZE_PER_LABEL amount for validation from each sample
TEST_SIZE = -1 # Negative Number represent All Data will be taken 
test_suffix = 'all' if TEST_SIZE < 0 else TEST_SIZE

In [3]:

ds_pickle_filepath = 'tr_set_{0}_val_{1}_test_{2}.pickle'.format(TRAIN_SIZE_PER_LABEL,
                                                                 VALIDATION_SIZE_PER_LABEL,
                                                                 test_suffix)

In [4]:
print(ds_pickle_filepath)

tr_set_30000_val_20000_test_all.pickle


### Load Data from already preprocessed pickle file

In [5]:
def load_dataset(filepath: str)->object:
    """
    load the data set from given pickle file
    
    paramter
    --------
        filepath: str
        pickle file path
    
    return pickle object
    """
    if not os.path.exists(filepath):
        raise Exception('pickle file is not present..')
        
    dataset = None
    with open(filepath, 'rb') as f:
        try:
            dataset = pickle.load(f)
        except EOFError:
            raise
    return dataset
# End

#### dataset comes in dictionary form Like
<pre>
{
    'trainingSet30000': training_set, # 30000 represent number of data of each sample
    'trainingLabel30000': training_label, # 30000 represent number of label of each sample
    'validationSet20000': validation_set, # 20000 represent number of data of each sample
    'validationLabel20000': validation_label, # 20000 represent number of label of each sample
    'testSetall': test_set, # all test data sample is used
    'testLabelall': test_label,  # all test data labels sample is used
    'classLabelEncodingMapping': CLASS_MAP # represent class label to integer mapping in label encoding
}
</pre>

In [6]:
dataset = load_dataset(ds_pickle_filepath)

In [7]:
training_set = dataset['trainingSet'+ str(TRAIN_SIZE_PER_LABEL)]
training_label = dataset['trainingLabel'+ str(TRAIN_SIZE_PER_LABEL)]
validation_set = dataset['validationSet'+ str(VALIDATION_SIZE_PER_LABEL)]
validation_label = dataset['validationLabel'+ str(VALIDATION_SIZE_PER_LABEL)]
test_set = dataset['testSetall']
test_label = dataset['testLabelall']
CLASS_MAP = dataset['classLabelEncodingMapping']
CLASS_INV_MAP = {val: key for key, val in CLASS_MAP.items()}

In [8]:
print('shape of training set    :-',training_set.shape)
print('shape of training label  :-', training_label.shape)
print('shape of validation set  :-',validation_set.shape)
print('shape of validation label:-', validation_label.shape)
print('shape of test set        :-',test_set.shape)
print('shape of training label  :-', test_label.shape)

shape of training set    :- (300000, 28, 28)
shape of training label  :- (300000, 10)
shape of validation set  :- (200000, 28, 28)
shape of validation label:- (200000, 10)
shape of test set        :- (18724, 28, 28)
shape of training label  :- (18724, 10)


datasets are randomize but not normalized we have to normalized all the data

In [9]:
def normalised_data(dataset: np.array)->np.array:
    """
     normalized and centered to have Local Centering mean is very small
     centers the pixel values
     
     parameter
     ---------
     
    dataset: np.array
         data in numpy array
    
    returns
    -------
        normalized np.array 
    
    """
    global PIXEL_DEPTH
    
    def norm(x: np.array)->np.array:
        """
        function for normalising each image data
        """
        x_ = x.astype('float64')/255.0
        nrm = x_ - x_.mean()
        return nrm
    # End
        
    data = np.array(list(map(norm, dataset)))
    return data
# End

Normalize and center the pixels values the train, validation and test data

In [10]:
X_tr, X_val, X_test =  normalised_data(training_set), normalised_data(validation_set), normalised_data(test_set)

we have to flaten the image array data since it is 2 dimension data for each image (IMAGE_SIZE, IMAGE_SIZE)

In [11]:
X_tr_flat = X_tr.reshape(X_tr.shape[0], IMAGE_SIZE*IMAGE_SIZE)
X_val_flat = X_val.reshape(X_val.shape[0], IMAGE_SIZE*IMAGE_SIZE)
X_test_flat = X_test.reshape(X_test.shape[0], IMAGE_SIZE*IMAGE_SIZE)

In [12]:
print('shape of training set   :-', X_tr_flat.shape)
print('shape of validation set :-', X_val_flat.shape)
print('shape of test set       :-', X_test_flat.shape)

shape of training set   :- (300000, 784)
shape of validation set :- (200000, 784)
shape of test set       :- (18724, 784)


### Logistic regression Model SkLearn

We will use Logistic regression first to see How traditional algorithm perform

since label is one hot encoded but we need label encoding
so we convert label data in label encoding

In [13]:
tr_int_label = np.array(list(map(np.argmax, training_label)))
val_int_label =  np.array(list(map(np.argmax, validation_label)))
test_int_label = np.array(list(map(np.argmax, test_label)))

In [14]:
def get_log_reg_filename(model: LogisticRegression, train_size: int)->str:
    """
    return the filename with path to save logistic regression model
    
    paramter
    --------
        model: sklearn.linear_model._logistic.LogisticRegression
            sklearn logistic regression model
        train_size: int train sample size of each class
    
    return
    ------
        str -> pickle file path filepath
    """
    hs = hashlib.sha256(str(model.get_params()).encode('utf-8')).hexdigest()
    filename = 'logistic_reg_{0}_{1}.pickle'.format(train_size, hs)
    return filename
# End

In [15]:
def save_logistic_regression_model(model: LogisticRegression, train_size: int,
                                   isalreadyfitted: bool, force: bool=False)->None:
    """
    save pickled logistic model information into file
    
    paramter
    --------
    model:  sklearn.linear_model._logistic.LogisticRegression
        sklearn logistic regression model
    
    train_size: int  
        number of training samples (per labels)
    
    """
    
    filepath = get_log_reg_filename(model, train_size)
    if os.path.exists(filepath) and not force:
        print('{0} file is already present skipping it...'.format(filepath))
        return
    
    data =  {
                'trainSampleSize': train_size,
                'modelParameter': model.get_params(),
                'model': model,
                'isAlreadyFitted': isalreadyfitted
            }
    print('saving data to pickle file')
    with open(filepath, 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
# End

In [16]:
def get_logistic_regression_model(params: Dict[str, object], train_size: str)->Tuple[LogisticRegression, bool]:
    """
    return logistic regression model If pickled model already found return from file else create new one
        
    parameter
    ---------
        params: Dict[str, object]
            parameter for logistic regression model
        train_size: int
            number of training samples (per labels)
        
    return
    -------
        Tuple[sklearn.linear_model._logistic.LogisticRegression. bool] -> sklearn logistic regression model, is already fitted
    """
    
    logistic_reg_model = LogisticRegression()
    logistic_reg_model.set_params(**params)
    filename = get_log_reg_filename(logistic_reg_model, train_size)
    
    if os.path.exists(filename):
        data = None
        print('saved Model Found loading from pickle file..')
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        
        return (data['model'], data['isAlreadyFitted'])
    else:    
        print('saved model not found returning new model..')
        return (logistic_reg_model, False)

setting up the required parameter for Logistic regression

In [17]:
parameter = {
                'penalty':'l2',
                 'C':1.0,
                 'solver':'lbfgs',
                 'max_iter':1000,
                 'multi_class':'multinomial',
                 'verbose':1,
                 'n_jobs':-1
            }

loading the model and check if loaded model is already trained or not

In [18]:
logistic_reg_model, is_fitted = get_logistic_regression_model(parameter, TRAIN_SIZE_PER_LABEL)
print(logistic_reg_model)

saved Model Found loading from pickle file..
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=1,
                   warm_start=False)


if model is not trained before train it

In [19]:
if not is_fitted:
    logistic_reg_model.fit(X_tr_flat, tr_int_label)

saving model into file if already saved then it will skip, set force = True to overide it

In [20]:
save_logistic_regression_model(model=logistic_reg_model, train_size=TRAIN_SIZE_PER_LABEL, isalreadyfitted=True, force=False)

logistic_reg_30000_ebc80f6d2e18afef7fba8e3cbcc7c4c9df2152b31f4472d7045801787c845943.pickle file is already present skipping it...


we have validation and test set seperately because I am skipping hyper parameter tuning which 
require validation set and its label since it is very much time consuming you can create a
function to parameter tuning and save only the best model to disk

In [21]:
def get_score_and_predicted_label(data: np.array, true_label: np.array)->Tuple[np.array, float]:
    """
    return predicted labels and accuracy score 
    
    paramter
    --------
        data np.array
            flatted and normalised np.array of IMAGE_SIZE*IMAGE*SIZE dimesion
        true_label: np.array:
            flattern label encoded true label
    
    return
    -------
        Tuple[np.array, float]- > (predicted label encoded array, accuracy score)
    """
    y_pred = logistic_reg_model.predict(data)
    score = accuracy_score(true_label, y_pred)
    return (y_pred, score)
# End

In [22]:
validation_pred, score_validation = get_score_and_predicted_label(X_val_flat, val_int_label)

In [23]:
print('accuracy is {0}%'.format(score_validation*100))
misclassified_validation = np.sum(val_int_label != validation_pred)
correct_classified_validation = len(val_int_label) - misclassified_validation
print('total misclassified sample ', misclassified_validation)
print('total correct classified sample ', correct_classified_validation)

accuracy is 81.1635%
total misclassified sample  37673
total correct classified sample  162327


In [24]:
test_pred, score_test = get_score_and_predicted_label(X_test_flat, test_int_label)

In [25]:
print('accuracy is {0}'.format(score_test*100))
misclassified_test = np.sum(test_int_label != test_pred)
correct_classified_test = len(test_int_label) - misclassified_test
print('total misclassified sample ', misclassified_test)
print('total correct classified sample ', correct_classified_test)

accuracy is 89.43067720572527
total misclassified sample  1979
total correct classified sample  16745


### Multinomial Logistic regression using simplae gradient descent in tensorflow

In [26]:
graph = tf.Graph()

In [27]:
with graph.as_default():
    
    global IMAGE_SIZE, NUM_CLASSES, SEED
    learning_rate = 0.5
    ############### Data sets conversion into tensorflow constant ##########################
    tf_train_dataset = tf.constant(X_tr_flat, dtype=tf.float64)
    tf_valid_dataset = tf.constant(X_val_flat, dtype=tf.float64)
    tf_test_dataset = tf.constant(X_test_flat, dtype=tf.float64)
    tf_train_label = tf.constant(training_label, dtype=tf.float64)
    tf_validation_label = tf.constant(validation_label, dtype=tf.float64)
    
    ###################### weight matrix and Biases initialization ####################################
    
    # Initialized By using random values following a (truncated) normal distribution
    weights = tf.Variable(tf.random.truncated_normal((IMAGE_SIZE * IMAGE_SIZE, NUM_CLASSES),
                                                     seed=tf.random.set_seed(SEED), dtype=tf.float64),
                                                     dtype=tf.float64, trainable=True)
    
    biases = tf.Variable(tf.zeros([NUM_CLASSES] , dtype=tf.float64), trainable=True)
    
    # multiply weight matrix and add Bias
    logit = tf.add(tf.matmul(tf_train_dataset, weights), biases)
    
    # generate loss function
    # labels: Each row labels[i] must be a valid probability distribution
    # logits: Unscaled log probabilities.
    loss =  lambda : tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_label, logits=logit))
    
    # optimizer
    #optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate).minimize(loss)
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate).minimize(loss, var_list=[weights,biases])
    
    train_prediction = tf.nn.softmax(logit)
    valid_prediction = tf.nn.softmax(tf.nn.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn,softmax(tf.nn.matmul(tf_test_dataset, weights) + biases)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


ValueError: No gradients provided for any variable: ['Variable:0', 'Variable_1:0'].

In [28]:
def build_computation_graph(X, y, X_val, y_val, X_test):
    graph = tf.Graph()
    IMAGE_SIZE, NUM_CLASSES, SEED = 28, 10, 42
    learning_rate = 0.5
    
    with graph.as_default():    
        ############### Data sets conversion into tensorflow constant ##########################
        tf_train_dataset = tf.constant(X, dtype=tf.float64)
        tf_valid_dataset = tf.constant(X_val, dtype=tf.float64)
        tf_test_dataset = tf.constant(X_test, dtype=tf.float64)
        tf_train_label = tf.constant(y, dtype=tf.float64)
        tf_validation_label = tf.constant(y_val, dtype=tf.float64)

        ###################### weight matrix and Biases initialization ####################################

        # Initialized By using random values following a (truncated) normal distribution
        weights = tf.Variable(tf.random.truncated_normal((IMAGE_SIZE * IMAGE_SIZE, NUM_CLASSES),
                                                         seed=tf.random.set_seed(SEED), dtype=tf.float64),
                                                         dtype=tf.float64, trainable=True)

        biases = tf.Variable(tf.zeros([NUM_CLASSES] , dtype=tf.float64), trainable=True)

        # multiply weight matrix and add Bias
        logit = tf.add(tf.matmul(tf_train_dataset, weights), biases)

        # generate loss function
        # labels: Each row labels[i] must be a valid probability distribution
        # logits: Unscaled log probabilities.
        loss =  lambda : tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_label, logits=logit))

        # optimizer
        #optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate).minimize(loss)
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate).minimize(loss, var_list=[weights,biases])

        train_prediction = tf.nn.softmax(logit)
        valid_prediction = tf.nn.softmax(tf.nn.matmul(tf_valid_dataset, weights) + biases)
        test_prediction = tf.nn,softmax(tf.nn.matmul(tf_test_dataset, weights) + biases)
        return graph
# End

In [29]:
X, y, X_v, y_v, X_t = X_tr_flat[:10000], training_label[: 10000],  X_val_flat[: 2000], validation_label[: 2000], X_test_flat[: 4000]

In [50]:
print('training data shape    ', X.shape)
print('training labels shape  ', y.shape)
print('validation data shape  ', X_v.shape)
print('validation labels shape', y_v.shape)
print('test data shape        ', X_t.shape)

training data shape     (10000, 784)
training labels shape   (10000, 10)
validation data shape   (2000, 784)
validation labels shape (2000, 10)
test data shape         (4000, 784)


In [51]:
build_computation_graph(X, y, X_v, y_v, X_t)

ValueError: No gradients provided for any variable: ['Variable:0', 'Variable_1:0'].

In [53]:
with open('Data_Sub/data_subset.pkl', 'rb') as f:
    data = pickle.load(f)

In [48]:
X, y, X_v, y_v, X_t = data['X'], data['y'], data['X_v'], data['y_v'], data['X_t']

In [46]:
datal['X'].shape

(10000, 784)

In [37]:
tf.__version__

'2.1.0'

In [40]:
import sys
print(sys.version)

3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)]
