<img src="../rsag_convex.png" alt="algoconvex" />
<img src="../x_update.png" alt="x_update" />
<img src="../mean.png" alt="mean" />
<img src="../rsag_composite.png" alt="algo" />

__Parameters :__
- $\alpha$: (1-$\alpha$) weight of aggregated x on current state, i.e. momentum
- $\lambda$: learning rate
- $\beta$: change for aggregated x
- $p_k$ termination probability



In [131]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
import warnings
warnings.filterwarnings('ignore')
import time
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

In [31]:
import path
import sys
sys.path.append('../')
from util import DataLoader


In [260]:
# packaging it all into a function
def preprocess_fashion_mnist():
  import random as rand


  (x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()
  mean_mat = np.mean(x_train, axis=0)

  # centering the data by removing the pixel wise mean from every pixel in every image
  x_train_centered = x_train - mean_mat
  x_test_centered = x_test - mean_mat

  # normalizing the grayscale values to values in interval [0,1]
  x_train_normalized = x_train_centered/255.0
  x_test_normalized = x_test_centered/255.0

  #finally, flattening the data
  x_train = np.reshape(x_train_normalized, (60000,784))
  x_test = np.reshape(x_test_normalized, (10000, 784))
  
  #converting the test data to one hot encodings
  y_train = keras.utils.to_categorical(y_train, num_classes=10)
  y_test = keras.utils.to_categorical(y_test, num_classes=10)

  return x_train, y_train, x_test, y_test
x_train, y_train, x_test, y_test = preprocess_fashion_mnist()

x_train, x_valid, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# Model Implementation - Softmax

In [19]:
def evaluate_acc(pred, truth):
  counter =0

  for i in range(len(pred)):
    maxVal = np.where(pred[i] == np.amax(pred[i]))
    counter += 1 if maxVal == np.where(truth[i]==1) else 0
  return counter * 100.0 / float(len(pred))

Activation Function

In [9]:
#activation functions
softmax1D = lambda z: np.exp(z)/float(sum(np.exp(z)))
softmax2D = lambda z: np.array([np.exp(i)/float(sum(np.exp(i))) for i in z])
# relu = lambda y: y[y <= 0]=0
def relu(x):
  alpha = 0.1
  x=np.array(x).astype(float)
  # x[x<=0]=0.1*x
  np.putmask(x, x<0, alpha*x)
  return x
def relu_grad(x):
  alpha = 0.1
  x=np.array(x).astype(float)
  x[x>0]=1
  x[x<=0]=alpha
  return x

MLP

In [262]:
# sklearn.neural_network.MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, batch_size=64, learning_rate='constant', learning_rate_init=0.001, verbose=True)
logistic = lambda z: 1./ (1 + np.exp(-z))

class MLP:

    def __init__(self, M = 128, num_classes = 10, rsag=False):
        self.M = M
        self.num_classes = num_classes
        self.rsag = rsag
        self.params = None
        self.aggr_params = None

    def fit(self, x, y, optimizer):
        N,D = x.shape
        def gradient(x, y, params):
            w = params[0] # v.shape = (D, M), w.shape = (M)
            z = np.dot(x, w)
            yh = softmax2D(z)#N
            dy = yh - y #N
            train_acc = evaluate_acc(yh, y)
            dw = np.dot(x.T, dy)/N #M
            dparams = [dw]
            return dparams ,train_acc
        
        if self.params is None:
            initializer = keras.initializers.GlorotNormal()
            w = initializer(shape=(D, self.num_classes))
            self.params = [w]
            if self.rsag:
                self.aggr_params = [np.copy(w)]
            print('params initialized')

        if self.rsag:
            self.params, self.aggr_params, train_accs, batch_train_acc = optimizer.mini_batch_step(gradient, x, y, self.params, self.aggr_params)
        else:
            self.params, train_accs, batch_train_acc = optimizer.mini_batch_step(gradient, x, y, self.params)

        return self, train_accs, batch_train_acc

    def predict(self, x):
        # print('self:',self)
        # print('self==None:',self==None)
        w = self.params[0]
        # print(w.shape)
        # z = relu(np.dot(x, w)) #N x M
        yh = softmax2D(np.dot(x, w))#N
        return yh

### Mini batcher

In [10]:
def mini_batcher(x, y, mini_batch_size):
  zipped = np.hstack( (x, y ) )
  np.random.shuffle(zipped)
  x_batches, y_batches = [], []
  mini_batches = []
  batch_num = x.shape[0] // mini_batch_size
  for i in range(batch_num):
    x_batch = zipped[ i * mini_batch_size : (i+1) * mini_batch_size, :-10]
    y_batch = zipped[ i * mini_batch_size : (i+1) * mini_batch_size, -10:]
    mini_batches.append( ( x_batch, y_batch) )
    # mini_batches.append( ( x_batch, np.argmax(y_batch,axis=1)[:,None] ) )
  if x.shape[0] % mini_batch_size != 0:
    x_batch = zipped[ batch_num * mini_batch_size :, :-10]
    y_batch = zipped[ batch_num * mini_batch_size :, -10:]
    # print("Length of last mini-batch =", y_batch.shape[0])
    mini_batches.append( ( x_batch, y_batch ) )
    # mini_batches.append( ( x_batch, np.argmax(y_batch,axis=1) ) )
  # print(mini_batches[0])
  # print("yShape = ",y.shape)
  return mini_batches

In [226]:
lr_lamda = lambda lr, t: lr/(1+t)

In [263]:
class GradientDescent:

    def __init__(self, 
                 learning_rate=.001, 
                 max_iters=2e4, 
                 epsilon=1e-8,
                 lr_fn = None,
                 batch_size=32):
        self.learning_rate = learning_rate
        self.max_iters = max_iters
        self.epsilon = epsilon
        self.lr_fn = lr_fn
        self.t = 0

    # def run(self, gradient_fn, x, y, params):
    #     norms = np.array([np.inf])
    #     t = 1
    #     while np.any(norms > self.epsilon) and t < self.max_iters:
    #         grad = gradient_fn(x, y, params)
    #         # print(grad[0].shape)
    #         # print(params[0].shape)
    #         for p in range(len(params)):
    #             params[p] -= self.learning_rate * grad[p]
    #         t += 1
    #         norms = np.array([np.linalg.norm(g) for g in grad])
    #     print(t)
    #     return params

    def mini_batch_step(self, gradient_fn, x, y, params, batch_size=32):
        train_acc, batch_train_acc, chunk = [], [], []
        # norms = np.array([np.inf])

        mini_batches = mini_batcher(x, y, batch_size)
    
        if self.lr_fn is not None:
            self.learning_rate = self.lr_fn(self.learning_rate, self.t)
            print('New learning rate:', self.learning_rate)

        for x_temp, y_temp in mini_batches:
            # x_temp, y_temp = mini_batches[t % ( len(mini_batches)-1 ) ][0], mini_batches[t % ( len(mini_batches)-1 ) ][1]
            grad, temp_acc = gradient_fn(x_temp, y_temp, params)

            for p in range(len(params)):
                params[p] -= self.learning_rate * grad[p]
                
            chunk.append(temp_acc)
            # print(f"Epoch{t}:{temp_acc}%")
            train_acc.append( ( self.t, temp_acc ) )
            
        self.t += 1
        return params, train_acc, batch_train_acc

# RSAG

In [253]:
class RSAG:

    def __init__(self, 
                 learning_rate=.001, 
                 alpha=0.009, 
                 beta=.000009, 
                 max_iters=2e4, 
                 epsilon=1e-8, 
                 batch_size=32,
                 lr_fn = None
                 ):
        self.learning_rate = learning_rate
        self.max_iters = max_iters
        self.epsilon = epsilon
        self.alpha = alpha  # momentum
        self.beta = beta 
        self.lr_fn = lr_fn
        self.t = 0


    # def run(self, gradient_fn, x, y, params, agg_params):
    #     norms = np.array([np.inf])
    #     t = 1
    #     while np.any(norms > self.epsilon) and t < self.max_iters:
            

    #         proj_params = [(1-self.alpha) * a_p + self.alpha * p for p, a_p in zip(params, agg_params)]
    #         grad = gradient_fn(x, y, proj_params)
            
    #         for p in range(len(params)):
    #             agg_params[p] -= self.beta * grad[p]
    #             params[p] -= self.learning_rate * grad[p]
    #         t += 1
    #         norms = np.array([np.linalg.norm(g) for g in grad])
    #     print(t)
    #     return params, agg_params

    def mini_batch_step(self, 
                       gradient_fn,
                       x, 
                       y,
                       params, 
                       agg_params,
                       batch_size=32,
                       ):
        
        train_acc, batch_train_acc, chunk = [], [], []
        v_acc, v_mean_acc, v_chunk = [],  [], []
        norms = np.array([np.inf])
        stable_cnt, base = 0, 0.0

        mini_batches = mini_batcher(x, y, batch_size)
        grad = None
        # while np.any(norms > self.epsilon) and t < self.max_iters * len(mini_batches):
        # print('len(mini_batches)', len(mini_batches))
        # print('')

        if self.lr_fn is not None:
            self.learning_rate = self.lr_fn(self.learning_rate, self.t)
            print('New learning rate:', self.learning_rate)

        for x_temp, y_temp in mini_batches:
                # x_temp, y_temp = mini_batches[t %  len(mini_batches) ][0], mini_batches[t % len(mini_batches) ][1]
                # print(x_temp.shape)
            # x_val, y_val = mini_batches[t %  len(mini_batches)]

            proj_params = [(1-self.alpha) * a_p + self.alpha * p for p, a_p in zip(params, agg_params)]

            grad, temp_acc = gradient_fn(x_temp, y_temp, proj_params)
            # if grad == None: grad = temp_grad 
            # else:
            # for p in range(len(params)):
            #     grad[p] += grad[p]
                                

            # v_chunk.append(evaluate_acc())
            chunk.append(temp_acc)
            train_acc.append( ( self.t, temp_acc ) )

            # if t%batch_size ==0:
            # for p in range(len(params)):
            #     agg_params[p] -= self.beta * (grad[p]/batch_size)
            #     params[p] -= self.learning_rate * (grad[p]/batch_size)
            for p in range(len(params)):
                agg_params[p] -= self.beta * (grad[p])
                params[p] -= self.learning_rate * (grad[p])

            
        self.t += 1
            
        return params, agg_params, train_acc, batch_train_acc

In [267]:
model = MLP(M=128, num_classes=10)
optimizer = GradientDescent(learning_rate=.006, max_iters=2000, batch_size=64)
y_pred, train_accs, batch_train_accs = train_model(model, optimizer, x_train, y_train, x_valid, y_test, print_every=10)

params initialized
acc: 3.1119791666666665
Epoch 0: 3.5416666666666665%
Epoch 10: 9.205729166666666%
Epoch 20: 20.9765625%
Epoch 30: 32.135416666666664%
Epoch 40: 40.677083333333336%
Epoch 50: 46.6796875%
Epoch 60: 51.119791666666664%


KeyboardInterrupt: 

In [265]:
model = MLP(M=128, num_classes=10, rsag=True)
optimizer = RSAG(learning_rate=.006, alpha=0.009, beta=0.000009, max_iters=2000, batch_size=64)
x_train, x_valid, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
y_pred, train_accs, batch_train_accs = train_model(model, optimizer, x_train, y_train, x_valid, y_test, print_every=10)

params initialized
acc: 11.692708333333334
Epoch 0: 12.161458333333334%
Epoch 10: 18.958333333333332%
Epoch 20: 28.658854166666668%
Epoch 30: 36.875%
Epoch 40: 42.8125%
Epoch 50: 47.265625%


KeyboardInterrupt: 

In [245]:
def hyper_tuning(x_train, y_train, lr_fn):
  from sklearn.model_selection import KFold
  import pandas as pd
  kf = KFold(5)
  acc_vals = []
  # hidden_units = [64, 128, 256, 512]
  # activations = [relu] #,leaky_relu, tanh ]
  learning_rate = [0.001, 0.002, 0.004]
  batch_size = [16, 32, 64]
  for btch in batch_size:
    print('batchsize:',btch)
    for lr in learning_rate:
      
      
      print('--------New Model----------')
      print(f"learning rate: {lr}\t Batch Size:{btch}")

      optimizer = GradientDescent(learning_rate = lr, batch_size=btch, lr_lamda=lr_fn)
      
      avg_acc = 0;       
      # print(f"for M=128, nonlinearity={activ}, lr={lr}, batch size={btch}.")
      start = time.time()
      for k, (train, test) in enumerate(kf.split(x_train, y_train)):
          print('k:',k)
          temp_model = MLP(M=128)
          temp_model, temp_acc = train_model(temp_model, optimizer, x_train[train], y_train[train])
          avg_acc += temp_acc
      avg_acc = avg_acc/5
      acc_vals.append(avg_acc)
      end = time.time()
      print('time elapsed:',(end-start)/60/60,"hrs")
      print('acc:',avg_acc)
      
  data = {'learningRate' : [0.001, 0.002, 0.004, 0.001, 0.002, 0.004, 0.001, 0.002, 0.004], 
          'batchSize':[16, 16, 16, 32, 32, 32, 64, 64, 64],
          'accuracies': acc_vals
          }
  acc = pd.DataFrame(data)
  print(acc)
  return acc

In [246]:
def hyper_tuning_gd(x_train, y_train, print_every=100, lr_fn=None):
  from sklearn.model_selection import KFold
  import pandas as pd
  kf = KFold(5)
  acc_vals = []
  # hidden_units = [64, 128, 256, 512]
  # activations = [relu] #,leaky_relu, tanh ]
  learning_rate = [0.001, 0.002, 0.004]
  batch_size = [16, 32, 64]
  for btch in batch_size:
    print('batchsize:',btch)
    for lr in learning_rate:
      
      
      print('--------New Model----------')
      print(f"learning rate: {lr}\t Batch Size:{btch}")

      optimizer = GradientDescent(learning_rate = lr, batch_size=btch, lr_fn=lr_fn)
      
      avg_acc = 0;       
      # print(f"for M=128, nonlinearity={activ}, lr={lr}, batch size={btch}.")
      start = time.time()
      for k, (train, valid) in enumerate(kf.split(x_train, y_train)):
          print('k:',k)
          temp_model = MLP(M=128)
          temp_model, temp_accs, max_acc = train_model(temp_model, optimizer, x_train[train], y_train[train], x_train[valid], y_train[valid], print_every=print_every)
          avg_acc += max_acc
      avg_acc = avg_acc/5
      acc_vals.append(avg_acc)
      end = time.time()
      print('time elapsed:',(end-start)/60/60,"hrs")
      print('acc:',avg_acc)
      
  data = {'learningRate' : [0.001, 0.002, 0.004, 0.001, 0.002, 0.004, 0.001, 0.002, 0.004], 
          'batchSize':[16, 16, 16, 32, 32, 32, 64, 64, 64],
          'accuracies': acc_vals
          }
  acc = pd.DataFrame(data)
  print(acc)
  return acc

In [248]:
hyper_tuning_gd(x_train=x_train[:10000], y_train=y_train[:10000], print_every=10) #x_valid=x_train[2000:2200], y_valid=y_train[2000:2200])

batchsize: 16
--------New Model----------
learning rate: 0.001	 Batch Size:16
k: 0
params initialized
acc: 9.6
Epoch 0: 9.6%
Epoch 100: 9.65%
Epoch 200: 9.65%
Epoch 300: 9.65%


KeyboardInterrupt: 

In [222]:
def hyper_tuning_rsag(x_train, 
                      y_train ,
                      x_valid=None,
                      y_valid=None):
  from sklearn.model_selection import KFold
  import pandas as pd
  kf = KFold(5)
  acc_vals = []
  # hidden_units = [64, 128, 256, 512]
  # activations = [relu] #,leaky_relu, tanh ]
  learning_rate = [0.001, 0.002, 0.004]
  alphas = [.9, .75, .7, .5]
  betas = [.001, .002, 0.004]
#   batch_size = [16, 32, 64]
  for alpha in alphas:
    print('alpha:',alpha)
    for beta in betas:
        for lr in learning_rate:
            print('--------New Model----------')
            print(f"learning rate: {lr}\t alpha: {alpha}\t beta:{beta}")
            optimizer = RSAG(learning_rate = lr, alpha=alpha, beta=beta, batch_size=64)
            # for activ in activations:
            # for hu in hidden_units:   
            avg_acc = 0;       
            # print(f"for M=128, nonlinearity={activ}, lr={lr}, batch size={btch}.")
            start = time.time()
            for k, (train, valid) in enumerate(kf.split(x_train, y_train)):
                print('k:',k)
                temp_model = MLP(M=128, rsag=True)
                temp_model, temp_accs, max_acc = train_model(temp_model, optimizer, x_train[train], y_train[train], x_train[valid], y_train[valid])
                avg_acc += max_acc
            avg_acc = avg_acc/5
            acc_vals.append(avg_acc)
            end = time.time()
            print('time elapsed:',(end-start)/60/60,"hrs")
            print('acc:',avg_acc)
  data = {'learningRate' : [0.001, 0.002, 0.004, 0.001, 0.002, 0.004, 0.001, 0.002, 0.004], 
          'batchSize':[16, 16, 16, 32, 32, 32, 64, 64, 64],
          'accuracies': acc_vals
          }
  acc = pd.DataFrame(data)
  print(acc)
  return acc

In [255]:
def train_model(model, optimizer, x_train, y_train, x_valid, y_valid, print_every=100):
    # x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

    model.fit(x_train, y_train, optimizer)

    y_test_pred = model.predict(x_valid)
    temp_v_acc = evaluate_acc(y_test_pred, y_valid)
    v_acc = [temp_v_acc]
    print('acc:',temp_v_acc)

    t=0
    while t < optimizer.max_iters:
        model.fit(x_train, y_train, optimizer)
        # print('here')
        y_test_pred = model.predict(x_valid)
        temp_v_acc = evaluate_acc(y_test_pred, y_valid)
        v_acc.append(temp_v_acc)
        
        if t%print_every == 0:
            print(f"Epoch {t}: {v_acc[-1]}%")
        t+=1
        # if np.abs(v_acc[-1]-v_acc[-2]) < optimizer.epsilon:
            
        #     print('hfdsaafda')
        #     break
    return model, temp_v_acc, max(v_acc)

In [223]:
hyper_tuning_rsag(x_train=x_train[:2000], y_train=y_train[:2000]) #, x_valid=x_train[2000:2200], y_valid=y_train[2000:2200])

alpha: 0.9
--------New Model----------
learning rate: 0.001	 alpha: 0.9	 beta:0.001
k: 0
params initialized
acc: 13.0
Epoch0: 13.0%
Epoch100: 30.0%
Epoch200: 36.5%
Epoch300: 45.0%
Epoch400: 48.5%
Epoch500: 50.0%


KeyboardInterrupt: 