<img src="../rsag_convex.png" alt="algoconvex" />
<img src="../x_update.png" alt="x_update" />
<img src="../mean.png" alt="mean" />
<img src="../rsag_composite.png" alt="algo" />

__Parameters :__
- $\alpha$: (1-$\alpha$) weight of aggregated x on current state, i.e. momentum
- $\lambda$: learning rate
- $\beta$: change for aggregated x
- $p_k$ termination probability



In [30]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
import warnings
warnings.filterwarnings('ignore')
import time
import numpy as np
import sklearn

In [31]:
import path
import sys
sys.path.append('../')
from util import DataLoader


In [8]:
# packaging it all into a function
def preprocess_fashion_mnist():
  import random as rand


  (x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()
  mean_mat = np.mean(x_train, axis=0)

  # centering the data by removing the pixel wise mean from every pixel in every image
  x_train_centered = x_train - mean_mat
  x_test_centered = x_test - mean_mat

  # normalizing the grayscale values to values in interval [0,1]
  x_train_normalized = x_train_centered/255.0
  x_test_normalized = x_test_centered/255.0

  #finally, flattening the data
  x_train = np.reshape(x_train_normalized, (60000,784))
  x_test = np.reshape(x_test_normalized, (10000, 784))
  
  #converting the test data to one hot encodings
  y_train = keras.utils.to_categorical(y_train, num_classes=10)
  y_test = keras.utils.to_categorical(y_test, num_classes=10)

  return x_train, y_train, x_test, y_test
x_train, y_train, x_test, y_test = preprocess_fashion_mnist()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


# Model Implementation - Softmax

In [19]:
def evaluate_acc(pred, truth):
  counter =0

  for i in range(len(pred)):
    maxVal = np.where(pred[i] == np.amax(pred[i]))
    counter += 1 if maxVal == np.where(truth[i]==1) else 0
  return counter * 100.0 / float(len(pred))

Activation Function

In [9]:
#activation functions
softmax1D = lambda z: np.exp(z)/float(sum(np.exp(z)))
softmax2D = lambda z: np.array([np.exp(i)/float(sum(np.exp(i))) for i in z])
# relu = lambda y: y[y <= 0]=0
def relu(x):
  alpha = 0.1
  x=np.array(x).astype(float)
  # x[x<=0]=0.1*x
  np.putmask(x, x<0, alpha*x)
  return x
def relu_grad(x):
  alpha = 0.1
  x=np.array(x).astype(float)
  x[x>0]=1
  x[x<=0]=alpha
  return x

MLP

In [109]:
# sklearn.neural_network.MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, batch_size=64, learning_rate='constant', learning_rate_init=0.001, verbose=True)
logistic = lambda z: 1./ (1 + np.exp(-z))

class MLP:

    def __init__(self, M = 128, num_classes = 10, rsag=False):
        self.M = M
        self.num_classes = num_classes
        self.rsag = rsag

    def fit(self, x, y, optimizer, x_valid=None, y_valid=None):
        N,D = x.shape
        def gradient(x, y, params):
            w = params[0] # v.shape = (D, M), w.shape = (M)
            z = np.dot(x, w)
            yh = softmax2D(z)#N
            dy = yh - y #N
            train_acc = evaluate_acc(yh, y)
            dw = np.dot(x.T, dy)/N #M
            dparams = [dw]
            return dparams ,train_acc

        initializer = keras.initializers.GlorotNormal()
        w = initializer(shape=(D, self.num_classes))
        params0 = [w]

        if self.rsag:
            a_w = initializer(shape=(D, self.num_classes))
            aggr_params = [a_w]
            self.params, self.aggr_params, train_accs, batch_train_accs = optimizer.mini_batch_step(gradient, x, y, params0, aggr_params, x_val=x_valid, y_val=y_valid)
        else:
            self.params, train_accs, batch_train_accs = optimizer.run_mini_batch(gradient, x, y, params0)

        return self, train_accs, batch_train_accs

    def predict(self, x):
        # print('self:',self)
        # print('self==None:',self==None)
        w = self.params[0]
        # print(w.shape)
        # z = relu(np.dot(x, w)) #N x M
        yh = softmax2D(np.dot(x, w))#N
        return yh

### Mini batcher

In [10]:
def mini_batcher(x, y, mini_batch_size):
  zipped = np.hstack( (x, y ) )
  np.random.shuffle(zipped)
  x_batches, y_batches = [], []
  mini_batches = []
  batch_num = x.shape[0] // mini_batch_size
  for i in range(batch_num):
    x_batch = zipped[ i * mini_batch_size : (i+1) * mini_batch_size, :-10]
    y_batch = zipped[ i * mini_batch_size : (i+1) * mini_batch_size, -10:]
    mini_batches.append( ( x_batch, y_batch) )
    # mini_batches.append( ( x_batch, np.argmax(y_batch,axis=1)[:,None] ) )
  if x.shape[0] % mini_batch_size != 0:
    x_batch = zipped[ batch_num * mini_batch_size :, :-10]
    y_batch = zipped[ batch_num * mini_batch_size :, -10:]
    # print("Length of last mini-batch =", y_batch.shape[0])
    mini_batches.append( ( x_batch, y_batch ) )
    # mini_batches.append( ( x_batch, np.argmax(y_batch,axis=1) ) )
  # print(mini_batches[0])
  # print("yShape = ",y.shape)
  return mini_batches

In [25]:
class GradientDescent:

    def __init__(self, learning_rate=.001, max_iters=2e4, epsilon=1e-8, batch_size=32):
        self.learning_rate = learning_rate
        self.max_iters = max_iters
        self.epsilon = epsilon

    def run(self, gradient_fn, x, y, params):
        norms = np.array([np.inf])
        t = 1
        while np.any(norms > self.epsilon) and t < self.max_iters:
            grad = gradient_fn(x, y, params)
            # print(grad[0].shape)
            # print(params[0].shape)
            for p in range(len(params)):
                params[p] -= self.learning_rate * grad[p]
            t += 1
            norms = np.array([np.linalg.norm(g) for g in grad])
        print(t)
        return params

    def run_mini_batch(self, gradient_fn, x, y, params, batch_size=32):
        train_acc, batch_train_acc, chunk = [], [], []
        norms = np.array([np.inf])
        t=1
        mini_batches = mini_batcher(x, y, batch_size)
        while np.any(norms > self.epsilon) and t < self.max_iters * len(mini_batches):
            x_temp, y_temp = mini_batches[t % ( len(mini_batches)-1 ) ][0], mini_batches[t % ( len(mini_batches)-1 ) ][1]
            grad, temp_acc = gradient_fn(x_temp, y_temp, params)
            for p in range(len(params)):
                params[p] -= self.learning_rate * grad[p]
            chunk.append(temp_acc)
            print(f"Epoch{t}:{temp_acc}%")
            train_acc.append( ( t, temp_acc ) )
            t += 1
            if t%len(mini_batches) == 2:
              batch_train_acc.append(np.mean(chunk))
              chunk = []
            norms = np.array([np.linalg.norm(g) for g in grad])
        return params, train_acc, batch_train_acc

# RSAG

In [117]:
class RSAG:

    def __init__(self, learning_rate=.001, alpha=0.009, beta=.000009, max_iters=2e4, epsilon=1e-8, batch_size=32):
        self.learning_rate = learning_rate
        self.max_iters = max_iters
        self.epsilon = epsilon
        self.alpha = alpha  # momentum
        self.beta = beta 


    def run(self, gradient_fn, x, y, params, agg_params):
        norms = np.array([np.inf])
        t = 1
        while np.any(norms > self.epsilon) and t < self.max_iters:
            

            proj_params = [(1-self.alpha) * a_p + self.alpha * p for p, a_p in zip(params, agg_params)]
            grad = gradient_fn(x, y, proj_params)
            
            for p in range(len(params)):
                agg_params[p] -= self.beta * grad[p]
                params[p] -= self.learning_rate * grad[p]
            t += 1
            norms = np.array([np.linalg.norm(g) for g in grad])
        print(t)
        return params, agg_params

    def mini_batch_step(self, 
                       gradient_fn,
                       x, 
                       y,
                       params, 
                       agg_params, 
                       batch_size=32,
                       ):
        
        train_acc, batch_train_acc, chunk = [], [], []
        v_acc, v_mean_acc, v_chunk = [],  [], []
        norms = np.array([np.inf])
        t=0
        stable_cnt, base = 0, 0.0

        mini_batches = mini_batcher(x, y, batch_size)
        grad = None
        while np.any(norms > self.epsilon) and t < self.max_iters * len(mini_batches):

            x_temp, y_temp = mini_batches[t %  len(mini_batches) ][0], mini_batches[t % len(mini_batches) ][1]
            # x_val, y_val = mini_batches[t %  len(mini_batches)]

            proj_params = [(1-self.alpha) * a_p + self.alpha * p for p, a_p in zip(params, agg_params)]

            temp_grad, temp_acc = gradient_fn(x_temp, y_temp, proj_params)
            if grad == None: grad = temp_grad 
            else:
                for p in range(len(params)):
                    grad[p] += temp_grad[p]
                                
 
            # v_chunk.append(evaluate_acc())
            chunk.append(temp_acc)
            train_acc.append( ( t, temp_acc ) )

            if t%batch_size ==0:
                for p in range(len(params)):
                    agg_params[p] -= self.beta * (grad[p]/batch_size)
                    params[p] -= self.learning_rate * (grad[p]/batch_size)
                
            t += 1
            
            if t % (len(mini_batches)*batch_size) == 0: break # After an Epoch
           
            norms = np.array([np.linalg.norm(g) for g in grad])
        return params, agg_params, train_acc, batch_train_acc

In [105]:
model = MLP(M=128, num_classes=10)
optimizer = GradientDescent(learning_rate=.004, max_iters=2000, batch_size=64)
y_pred, train_accs, batch_train_accs = model.fit(x_train, y_train, optimizer)

Epoch1:15.625%
Epoch2:6.25%
Epoch3:18.75%
Epoch4:21.875%
Epoch5:6.25%
Epoch6:21.875%
Epoch7:15.625%
Epoch8:6.25%
Epoch9:12.5%
Epoch10:3.125%
Epoch11:12.5%
Epoch12:18.75%
Epoch13:3.125%
Epoch14:9.375%
Epoch15:9.375%
Epoch16:9.375%
Epoch17:21.875%
Epoch18:3.125%
Epoch19:9.375%
Epoch20:9.375%
Epoch21:21.875%
Epoch22:12.5%
Epoch23:12.5%
Epoch24:18.75%
Epoch25:9.375%
Epoch26:9.375%
Epoch27:9.375%
Epoch28:6.25%
Epoch29:9.375%
Epoch30:12.5%
Epoch31:9.375%
Epoch32:6.25%
Epoch33:6.25%
Epoch34:12.5%
Epoch35:9.375%
Epoch36:15.625%
Epoch37:6.25%
Epoch38:6.25%
Epoch39:9.375%
Epoch40:12.5%
Epoch41:9.375%
Epoch42:18.75%
Epoch43:12.5%
Epoch44:6.25%
Epoch45:12.5%
Epoch46:12.5%
Epoch47:6.25%
Epoch48:9.375%
Epoch49:18.75%
Epoch50:15.625%
Epoch51:15.625%
Epoch52:21.875%
Epoch53:3.125%
Epoch54:9.375%
Epoch55:6.25%
Epoch56:18.75%
Epoch57:3.125%
Epoch58:6.25%
Epoch59:3.125%
Epoch60:21.875%
Epoch61:6.25%
Epoch62:12.5%
Epoch63:9.375%
Epoch64:9.375%
Epoch65:18.75%
Epoch66:9.375%
Epoch67:21.875%
Epoch68:15.625%


KeyboardInterrupt: 

In [35]:
def hyper_tuning(x_train, y_train):
  from sklearn.model_selection import KFold
  import pandas as pd
  kf = KFold(5)
  acc_vals = []
  # hidden_units = [64, 128, 256, 512]
  # activations = [relu] #,leaky_relu, tanh ]
  learning_rate = [0.001, 0.002, 0.004]
  batch_size = [16, 32, 64]
  for btch in batch_size:
    print('batchsize:',btch)
    for lr in learning_rate:
      print('learningrate:',lr)
      optimizer = GradientDescent(learning_rate = lr, batch_size=btch)
      # for activ in activations:
      # for hu in hidden_units:   
      avg_acc = 0;       
      # print(f"for M=128, nonlinearity={activ}, lr={lr}, batch size={btch}.")
      start = time.time()
      for k, (train, test) in enumerate(kf.split(x_train, y_train)):
          print('k:',k)
          temp_model = MLP(M=128)
          temp_model.fit(x_train[train], y_train[train], optimizer)
          y_test_pred = temp_model.predict(x_train[test])
          temp_acc = evaluate_acc(y_test_pred, y_train[test])
          avg_acc += temp_acc
      avg_acc = avg_acc/5
      acc_vals.append(avg_acc)
      end = time.time()
      print('time elapsed:',(end-start)/60/60,"hrs")
      print('acc:',avg_acc)
      
  data = {'learningRate' : [0.001, 0.002, 0.004, 0.001, 0.002, 0.004, 0.001, 0.002, 0.004], 
          'batchSize':[16, 16, 16, 32, 32, 32, 64, 64, 64],
          'accuracies': acc_vals
          }
  acc = pd.DataFrame(data)
  print(acc)
  return acc

In [None]:
def train_rsag():
     
     

        batch_train_acc.append(np.mean(chunk))

        print(f"Epoch {t/(len(mini_batches)*batch_size)}: {batch_train_acc[-1]}%")

        if len(batch_train_acc)>1 and (np.abs(batch_train_acc[-1]-base) < .01):
            if stable_cnt>=2:
                print('STABLE TRAIN ACCURACY')
                break
            else: stable_cnt+=1
        else: 
            base = batch_train_acc[-1]
            stable_cnt = 0 
        
        # # Validation
        # if x_val is not None:
        #     val_acc = []
        #     for (x_v, y_v)  in zip(x_val, y_val):
        #         y_pred = predict_fn(x_v)
        #         val_acc.append(evaluate_acc(y_pred, y_v))
        #     v_mean_acc.append(np.mean(val_acc))
        #     v_acc.append(val_acc)
        #     print(f"MEAN VAL ACC: {v_mean_acc[-1]}")



        chunk = []

In [118]:
def hyper_tuning_rsag(x_train, 
                      y_train ,
                      x_valid=None,
                      y_valid=None):
  from sklearn.model_selection import KFold
  import pandas as pd
  kf = KFold(5)
  acc_vals = []
  # hidden_units = [64, 128, 256, 512]
  # activations = [relu] #,leaky_relu, tanh ]
  learning_rate = [0.001, 0.002, 0.004]
  alphas = [.9, .75, .7, .5]
  betas = [.001, .002, 0.004]
#   batch_size = [16, 32, 64]
  for alpha in alphas:
    print('alpha:',alpha)
    for beta in betas:
        for lr in learning_rate:
            print('--------New Model----------')
            print(f"learning rate: {lr}\t alpha: {alpha}\t beta:{beta}")
            optimizer = RSAG(learning_rate = lr, alpha=alpha, beta=beta, batch_size=64)
            # for activ in activations:
            # for hu in hidden_units:   
            avg_acc = 0;       
            # print(f"for M=128, nonlinearity={activ}, lr={lr}, batch size={btch}.")
            start = time.time()
            for k, (train, test) in enumerate(kf.split(x_train, y_train)):
                print('k:',k)
                temp_model = MLP(M=128, rsag=True)
                temp_model.fit(x_train[train], y_train[train], optimizer)

                temp_model = train(temp_model, optimizer, x_train[train], y_train[train])

                y_test_pred = temp_model.predict(x_train[test])
                temp_acc = evaluate_acc(y_test_pred, y_train[test])
                avg_acc += temp_acc
            avg_acc = avg_acc/5
            acc_vals.append(avg_acc)
            end = time.time()
            print('time elapsed:',(end-start)/60/60,"hrs")
            print('acc:',avg_acc)
  data = {'learningRate' : [0.001, 0.002, 0.004, 0.001, 0.002, 0.004, 0.001, 0.002, 0.004], 
          'batchSize':[16, 16, 16, 32, 32, 32, 64, 64, 64],
          'accuracies': acc_vals
          }
  acc = pd.DataFrame(data)
  print(acc)
  return acc

In [119]:
hyper_tuning_rsag(x_train=x_train[:2000], y_train=y_train[:2000], x_valid=x_train[2000:2200], y_valid=y_train[2000:2200])

alpha: 0.9
--------New Model----------
learning rate: 0.001	 alpha: 0.9	 beta:0.001
k: 0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
2


KeyboardInterrupt

