In [8]:
import gzip
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
import dataloader as dl
import random
import pickle
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split
from torchvision.transforms import Compose, ToTensor, Resize, Normalize
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn

from utils import *
from models import *
from DFA2SRN import machine_proces, dfa2srn

Target parameters 

In [9]:
# Hand-written target parameter set; `target` is what the training routines
# below receive as their `target` argument.
# J is the common scale factor log(2) * 128 applied to the non-zero blocks.
# NOTE(review): the names suggest U = input weights (4x2), W = recurrent
# weights (4x4), V = read-out weights (4,), and the *b entries their biases.
# Confirm against the SRN class in `models`.
J = np.log(2)*128
T_U  = J*torch.tensor([[2,0],[0,2],[2,0],[0,2]])
T_Ub =  torch.tensor([0,0,0,0])
T_W  = -J*torch.tensor([[3,3,1,1],[1,1,3,3],[1,1,3,3],[3,3,1,1]])
T_Wb =  torch.tensor([0,0,0,0])
T_V  = J*torch.tensor([-1,-1,1,1]) 
T_Vb = torch.tensor([0])

# Order of the entries: U, U-bias, W, W-bias, V, V-bias.
target = [T_U,T_Ub,T_W,T_Wb,T_V,T_Vb]

Generating the training data for: "the language of all words with an odd number of a's" on the alphabet {a,b}

In [None]:
train_data = []

# Build the training set one word length at a time (lengths 1 to 49).
# The second argument given to random_strings depends on the length:
# 2 for length 1, 4 for lengths 2-5, and 10 for every longer length.
# NOTE(review): that argument is presumably the number of strings drawn per
# length; confirm against random_strings in utils.
training_set = []
for k in range(1,50):
    per_length = 2 if k == 1 else (4 if k <= 5 else 10)
    training_set.extend(random_strings(k, per_length))
print(len(training_set))


Training loop 

In [None]:

# Running histories; each call to gD_regular returns one chunk per statistic
# and the chunks are concatenated in call order.
reg_pl_loss, reg_pl_norm_2, reg_pl_norm_inf = [], [], []
reg_pl_dist, reg_pl_para = [], []

q_h = Quasy_H()
net = SRN(4,2,1,num_layers=1, activation= 'sig').to(dtype = torch.float32)
A = True
B = False

for j in range(20000): ## the article uses 20000 batches
    # progress message every 100 iterations
    if not j % 100:
        print("Epoch :"+str(j))
    (plot_loss,plot_norm_2,plot_norm_inf,plot_dist,plot_para,A) = gD_regular(
        net, target, q_h, training_set, 0.001, 60, B)
    reg_pl_loss.extend(plot_loss)
    reg_pl_norm_2.extend(plot_norm_2)
    reg_pl_norm_inf.extend(plot_norm_inf)
    reg_pl_dist.extend(plot_dist)
    reg_pl_para.extend(plot_para)

Plotting the training statistics

In [None]:
# Four-panel summary of the regular-SGD run: loss, step length, gradient
# infinity norm and distance to the target parameters.

# Set the font size before the figure exists so the text created for this
# figure already uses it, on the first run of the cell as well as on re-runs.
plt.rcParams.update({'font.size': 15})

fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4,figsize=(28,6))

K = 26  # font size of the panel titles

# The LaTeX titles are raw strings: the backslashes reach matplotlib unchanged
# and the code no longer relies on invalid escape sequences such as "\|",
# "\m" or "\i", which newer Python versions warn about.
# Blue curve: raw statistic. Orange curve: rolling_avg(series, 3).
ax1.plot(reg_pl_loss, color='#007291')
ax1.plot(rolling_avg(reg_pl_loss,3),color='#d8460b')
ax1.set_title("Loss", fontsize = K)

ax2.plot(reg_pl_norm_2, color='#007291')
ax2.plot(rolling_avg(reg_pl_norm_2,3),color='#d8460b')
ax2.set_title(r"$\|\mathbf{p}_{k} - \mathbf{p}_{k-1}\|$", fontsize = K)

ax3.plot(reg_pl_norm_inf, color='#007291')
ax3.plot(rolling_avg(reg_pl_norm_inf,3),color='#d8460b')
ax3.set_title(r"$ \|\nabla \mathcal{L}(\mathcal{R}_{\mathbf{p}_k})\|_\infty $", fontsize = K)

ax4.plot(reg_pl_dist,color='#007291')
ax4.set_title(r"$\|\mathbf{p}_k - \mathbf{p}_{Target}\|$", fontsize = K)

# Lay the panels out once the curves and titles exist, so the spacing accounts
# for them; this has to happen before the figure is saved.
fig.tight_layout(w_pad=2)

plt.savefig('Regular_SGD_these.pdf')
plt.show()

SGD experiment on the ML_Reg_Test data set.

In [21]:
# Tensor dtype and device for the experiment: the first CUDA device when one
# is available, otherwise the CPU.
dtype = torch.float
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Data directory and the file that lang_names reads.
path = "data/Small/"
names_path = "names.txt"

names, ext = lang_names(names_path)

In [None]:
# Training statistics of every language, keyed by the language name.
dc = {}
for j, name in enumerate(names):
    print("In processe "+ name)
    train_loader, val_loader, test_loader, input_length, seq_leng = data_creator2(name, path, ext)
    print('The data is done')

    ### target parameters, derived from the machine file of this language
    machine_path = "data/machines/"
    machine_name = name[:-1]+'.att'  # the last character of the name is dropped
    T_m, D_m, num_state, alphabet_size = machine_proces(machine_path, machine_name)
    target = dfa2srn(T_m, D_m)

    print('Starting the training')
    model = customSRN(
        hidden_dim=num_state*alphabet_size,
        input_dim=input_length,
        output_dim=1,
        seq_length=seq_leng,
        device=device,
        dtyp=32,
    )
    criterion = nn.CrossEntropyLoss()
    # plain SGD with momentum; any other optimizer can be substituted here
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.87)
    ls_names = ['plot_loss','plot_norm','plot_norm2','plot_dist']
    stats_data = STATS(ls_names)
    train_stats(model,target, stats_data, optimizer, 32, train_loader, val_loader, 20000, 10**(-5), 0.01, dtype, device)
    dc[name] = stats_data.get_data()

Plotting the data obtained from training

In [None]:
# One row of four panels per language: loss, step length, gradient infinity
# norm and distance to the target parameters.

# Set the font size before the figure exists so the text created for this
# figure already uses it, on the first run of the cell as well as on re-runs.
plt.rcParams.update({'font.size': 15})

n_rows = 1  # number of languages (rows) to plot
# squeeze=False keeps `ax` two-dimensional even for a single row. With the
# default, plt.subplots(1, 4) returns a 1-D array and ax[j][0] raises
# "TypeError: 'Axes' object is not subscriptable".
fig, ax = plt.subplots(n_rows,4,figsize=(16.18,3.3), squeeze=False)

# The LaTeX titles are raw strings so the backslashes reach matplotlib without
# relying on invalid escape sequences such as "\|" or "\m".
for j in range(n_rows):
    name = names[j]
    (plot_loss,plot_norm,plot_norm2,plot_dist) = dc[name]

    ## evolution of the loss
    ax[j][0].set_yscale("log")
    ax[j][0].plot(plot_loss, color='#007291')
    ax[j][0].plot(rolling_avg(plot_loss,3),color='#d8460b')
    ax[j][0].set_title("Loss")
    # Row label. The two consecutive set_ylabel calls of the earlier version
    # ended up showing `name` at size 18; this single call gives that result.
    ax[j][0].set_ylabel(name, size=18)

    ## distance between two consecutive parameter vectors
    ax[j][1].set_yscale("log")
    ax[j][1].plot(plot_norm2, color='#007291')
    ax[j][1].plot(rolling_avg(plot_norm2,3),color='#d8460b')
    ax[j][1].set_title(r"$\|\mathbf{p}_{k+1}- \mathbf{p}_k\|_2$")

    ## infinity norm of the gradient
    ax[j][2].set_yscale("log")
    ax[j][2].plot(plot_norm, color='#007291')
    ax[j][2].plot(rolling_avg(plot_norm,3),color='#d8460b')
    ax[j][2].set_title(r"$\|\nabla\mathcal{L}(\mathcal{R}_{\mathbf{p}_k})\|_\infty$")

    ## distance to the target
    ax[j][3].plot(plot_dist, color='#007291')
    ax[j][3].set_title(r"$\|\mathbf{p}_{k}- \mathbf{p}_{Target}\|_2$")

# Lay the panels out once titles and labels exist so the spacing accounts for them.
fig.tight_layout(w_pad=2)
# plt.savefig('SGD_9_FL.pdf')
plt.show()

SGD experiment on the ML_Reg_Test data set: plots of the training statistics that we obtained.

In [None]:
# Plot the saved training statistics: one row of four panels per language
# (loss, step length, gradient infinity norm, distance to the target).

# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file.
# Only load a stats file that you produced yourself or that comes from a
# trusted source.
with open("stats_20k.pkl", 'rb') as picklefile:
    data_dict = pickle.load(picklefile)

# Set the font size before the figure exists so the text created for this
# figure already uses it, on the first run of the cell as well as on re-runs.
plt.rcParams.update({'font.size': 15})

n_rows = 9  # number of languages (rows) to plot; needs at least 9 entries in `names`
# squeeze=False keeps `ax` two-dimensional whatever n_rows is set to.
fig, ax = plt.subplots(n_rows,4,figsize=(16.18,30), squeeze=False)

# The LaTeX titles are raw strings so the backslashes reach matplotlib without
# relying on invalid escape sequences such as "\|" or "\m".
for j in range(n_rows):
    name = names[j]
    (plot_loss,plot_norm,plot_norm2,plot_dist) = data_dict[name]

    ## evolution of the loss
    ax[j][0].set_yscale("log")
    ax[j][0].plot(plot_loss, color='#007291')
    ax[j][0].plot(rolling_avg(plot_loss,3),color='#d8460b')
    ax[j][0].set_title("Loss")
    # Row label. The two consecutive set_ylabel calls of the earlier version
    # ended up showing `name` at size 18; this single call gives that result.
    ax[j][0].set_ylabel(name, size=18)

    ## distance between two consecutive parameter vectors
    ax[j][1].set_yscale("log")
    ax[j][1].plot(plot_norm2, color='#007291')
    ax[j][1].plot(rolling_avg(plot_norm2,3),color='#d8460b')
    ax[j][1].set_title(r"$\|\mathbf{p}_{k+1}- \mathbf{p}_k\|_2$")

    ## infinity norm of the gradient
    ax[j][2].set_yscale("log")
    ax[j][2].plot(plot_norm, color='#007291')
    ax[j][2].plot(rolling_avg(plot_norm,3),color='#d8460b')
    ax[j][2].set_title(r"$\|\nabla\mathcal{L}(\mathcal{R}_{\mathbf{p}_k})\|_\infty$")

    ## distance to the target
    ax[j][3].plot(plot_dist, color='#007291')
    ax[j][3].set_title(r"$\|\mathbf{p}_{k}- \mathbf{p}_{Target}\|_2$")

# Lay the panels out once titles and labels exist so the spacing accounts for them.
fig.tight_layout(w_pad=2)
# plt.savefig('SGD_9_FL.pdf')
plt.show()

Synthetic Experiment 

In [None]:
#######################
### main experiment ###
#######################

def gD(net,training_set,target,initial,eps,N):
    """Run one sweep over ``training_set`` with one ``fc_step`` per sample.

    For every ``(word, label)`` pair the hidden state is reset, the word is
    fed to ``net`` letter by letter, the loss of the last output is
    back-propagated, ``fc_step`` is called with step size ``1/N`` and the
    gradients are cleared with ``reset_grad``.

    The loss is a binary cross-entropy with 1e-7 added inside each logarithm,
    shifted by ``log(1 + 1e-7)`` so that a perfect prediction gives exactly 0.

    Parameters
    ----------
    net : object
        Network exposing ``reset_h()`` and callable on a single letter.
    training_set : sized iterable of ``(word, label)``
        Each word is an iterable of letters supporting ``.to(dtype)``.
    target, initial
        Forwarded unchanged to ``fc_step``.
    eps : float
        Threshold compared with ``u_max``, the fourth value returned by ``fc_step``.
    N : number
        ``1/N`` is the step size handed to ``fc_step``.

    Returns
    -------
    tuple
        ``(plot_loss, plot_norm, plot_norm0, plot_dist, plot_grmin,
        plot_grmax, A)``. The six lists hold one entry per sample: the loss
        and the values ``norm``, ``norm_zero``, ``norm_st``, ``u_min``,
        ``u_max`` returned by ``fc_step``. ``A`` is ``False`` only when
        ``u_max < eps`` held for every sample of the sweep. That includes an
        empty ``training_set``, where ``A`` used to stay ``True`` and kept
        the calling loop running forever.

    Raises
    ------
    ValueError
        If a word contains no letter, since there is then no output to score.
    """
    plot_loss, plot_norm, plot_norm0 = [], [], []
    plot_dist, plot_grmin, plot_grmax = [], [], []

    clip = 10**(-7)
    # Loop-invariant offset log(1 + clip), built once instead of once per sample.
    offset = torch.log(torch.tensor([1+clip]))

    below_eps = 0  # number of samples whose u_max fell below eps
    for (word, label) in training_set:
        label = torch.tensor([label],dtype=torch.float32)
        net.reset_h()

        outputs = None
        for lettre in word:
            lettre = lettre.to(torch.float32)
            outputs = net(lettre).to(torch.float32)
        if outputs is None:
            # Without this guard an empty word would reuse the previous
            # sample's output (or fail with NameError on the first sample).
            raise ValueError("gD: cannot compute a loss for an empty word")

        loss = (-1)*( label*torch.log(outputs+clip) + (1-label)*torch.log((1-outputs)+clip) ) + offset
        loss_value = loss.item()
        loss.backward()
        (norm,norm_zero,u_min,u_max,norm_st) = fc_step(net,target,initial,(1/N),False)
        reset_grad(net)

        if u_max < eps:
            below_eps += 1

        plot_loss.append(loss_value)
        plot_norm.append(norm)
        plot_norm0.append(norm_zero)
        plot_dist.append(norm_st)
        plot_grmin.append(u_min)
        plot_grmax.append(u_max)

    # Evaluated once after the sweep; the value is the same as recomputing it
    # after every sample, except for the empty training set described above.
    A = below_eps != len(training_set)
    return (plot_loss,plot_norm,plot_norm0,plot_dist,plot_grmin,plot_grmax,A)

def gD_persute(net,training_set,target,initial,eps,Lis):
    """Repeat ``gD`` sweeps while walking through the schedule ``Lis``.

    Each sweep calls ``gD`` with the current ``N`` (initially ``Lis[0]``):

    * If the sweep reports ``A`` true, its statistics are appended to the
      running histories, ``fc_step(net, target, initial, 1/N, True)`` is
      called and another sweep follows with the same ``N``.
    * If ``A`` is false and the schedule has a further entry,
      ``cf_step(net, target, initial, 1/N, True)`` is called and ``N`` moves
      to the next entry of ``Lis``.
    * If ``A`` is false at the last entry, the histories are returned.

    Statistics of a sweep that reports ``A`` false are not recorded.

    The last entry is recognised by its position in ``Lis`` rather than by
    comparing ``N`` with ``Lis[-1]``, so a schedule that repeats a value
    (e.g. ``[10, 100, 10]``) is still walked through completely.

    NOTE(review): ``cf_step`` and ``fc_step`` are two different names, both
    expected to come from the helper modules; confirm that calling
    ``cf_step`` in the level-switch branch is intended.

    Parameters
    ----------
    net, training_set, target, initial, eps
        Forwarded to ``gD``; ``net``, ``target`` and ``initial`` are also
        passed to ``fc_step`` / ``cf_step``.
    Lis : sequence of numbers
        Non-empty schedule of ``N`` values; ``1/N`` is the step size.

    Returns
    -------
    dict
        Keys ``'loss'``, ``'norm'``, ``'norm0'``, ``'dist'``, ``'grmin'``,
        ``'grmax'``, each mapping to the concatenated per-sample history.
    """
    state = 0
    last_state = len(Lis) - 1
    N = Lis[0]

    plot_loss, plot_norm, plot_norm0 = [], [], []
    plot_dist, plot_grmin, plot_grmax = [], [], []

    while True:
        (loss_s, norm_s, norm0_s, dist_s, grmin_s, grmax_s, A) = gD(net,training_set,target,initial,eps,N)

        if A:
            print('A is true')
            plot_loss.extend(loss_s)
            plot_norm.extend(norm_s)
            plot_norm0.extend(norm0_s)
            plot_dist.extend(dist_s)
            plot_grmin.extend(grmin_s)
            plot_grmax.extend(grmax_s)
            fc_step(net,target,initial,(1/N),True)
        elif state < last_state:
            print('A is not true')
            print(str(N))
            cf_step(net,target,initial,(1/N),True)
            state += 1
            N = Lis[state]
        else:
            print('We reached the linit of the gradient')
            # Plain dict literal; the local is no longer called `dict`, which
            # shadowed the builtin.
            return {
                'loss': plot_loss,
                'norm': plot_norm,
                'norm0': plot_norm0,
                'dist': plot_dist,
                'grmin': plot_grmin,
                'grmax': plot_grmax,
            }
            

# Main experiment: train a fresh SRN with the schedule [10, 100, 1000] and
# expose the collected histories as plot_* variables for the plotting cell.
net = SRN(4,2,1,num_layers=1, activation= 'sig').to(dtype = torch.float32)

# Snapshot of the parameters before training. detach() alone returns a tensor
# sharing storage with the live parameter, so any in-place parameter update
# would silently change `initial` as well. clone() makes it a real copy.
initial = []
for params in net.parameters():
    initial.append(params.detach().clone())

criterion = nn.CrossEntropyLoss()
lr = 0.01
eps = 10**(-14)  # threshold compared with the u_max value inside gD

# initializing the target parameters
J = np.log(2)*128
T_U  = J*torch.tensor([[2,0],[0,2],[2,0],[0,2]])
T_Ub =  torch.tensor([0,0,0,0])
T_W  = -J*torch.tensor([[3,3,1,1],[1,1,3,3],[1,1,3,3],[3,3,1,1]])
T_Wb =  torch.tensor([0,0,0,0])
T_V  = J*torch.tensor([-1,-1,1,1])
T_Vb = torch.tensor([0])
target = [T_U,T_Ub,T_W,T_Wb,T_V,T_Vb]


# The result is bound to `main_stats` rather than `dict`; rebinding the
# builtin name `dict` at notebook level breaks every later `dict(...)` call.
main_stats = gD_persute(net,training_set,target,initial,eps,[10**1, 10**2, 10**3])
plot_loss = main_stats['loss']
plot_norm = main_stats['norm']
plot_norm0= main_stats['norm0']
plot_dist = main_stats['dist']
plot_grmin= main_stats['grmin']
plot_grmax= main_stats['grmax']


        
#########################
### backup experiment ###
#########################

def gD_part(net,training_set,target,initial,eps,B,N):
    """Run one sweep over ``training_set`` with one ``fc_partial_step`` per sample.

    For every ``(word, label)`` pair the hidden state is reset, the word is
    fed to ``net`` letter by letter, the loss of the last output is
    back-propagated, ``fc_partial_step(net, target, initial, 1/N, 4, B,
    False)`` is called and the gradients are cleared with ``reset_grad``.

    The loss is a binary cross-entropy with 1e-7 added inside each logarithm,
    shifted by ``log(1 + 1e-7)`` so that a perfect prediction gives exactly 0.

    Parameters
    ----------
    net : object
        Network exposing ``reset_h()`` and callable on a single letter.
    training_set : sized iterable of ``(word, label)``
        Each word is an iterable of letters supporting ``.to(dtype)``.
    target, initial
        Forwarded unchanged to ``fc_partial_step``.
    eps : float
        Threshold compared with ``u_max``, the fourth value returned by
        ``fc_partial_step``.
    B
        Forwarded as the sixth argument of ``fc_partial_step``.
    N : number
        ``1/N`` is the step size handed to ``fc_partial_step``.

    Returns
    -------
    tuple
        ``(plot_loss, plot_norm, plot_norm0, plot_dist, plot_grmin,
        plot_grmax, A)``. The six lists hold one entry per sample: the loss
        and the values ``norm``, ``norm_zero``, ``norm_st``, ``u_min``,
        ``u_max`` returned by ``fc_partial_step``. ``A`` is ``False`` only
        when ``u_max < eps`` held for every sample of the sweep. That
        includes an empty ``training_set``, where ``A`` used to stay ``True``
        and kept the calling loop running forever.

    Raises
    ------
    ValueError
        If a word contains no letter, since there is then no output to score.
    """
    plot_loss, plot_norm, plot_norm0 = [], [], []
    plot_dist, plot_grmin, plot_grmax = [], [], []

    clip = 10**(-7)
    # Loop-invariant offset log(1 + clip), built once instead of once per sample.
    offset = torch.log(torch.tensor([1+clip]))

    below_eps = 0  # number of samples whose u_max fell below eps
    for (word, label) in training_set:
        label = torch.tensor([label],dtype=torch.float32)
        net.reset_h()

        outputs = None
        for lettre in word:
            lettre = lettre.to(torch.float32)
            outputs = net(lettre).to(torch.float32)
        if outputs is None:
            # Without this guard an empty word would reuse the previous
            # sample's output (or fail with NameError on the first sample).
            raise ValueError("gD_part: cannot compute a loss for an empty word")

        loss = (-1)*( label*torch.log(outputs+clip) + (1-label)*torch.log((1-outputs)+clip) ) + offset
        loss_value = loss.item()
        loss.backward()
        (norm,norm_zero,u_min,u_max,norm_st) = fc_partial_step(net,target,initial,(1/N),4,B,False)
        reset_grad(net)

        if u_max < eps:
            below_eps += 1

        plot_loss.append(loss_value)
        plot_norm.append(norm)
        plot_norm0.append(norm_zero)
        plot_dist.append(norm_st)
        plot_grmin.append(u_min)
        plot_grmax.append(u_max)

    # Evaluated once after the sweep; the value is the same as recomputing it
    # after every sample, except for the empty training set described above.
    A = below_eps != len(training_set)
    return (plot_loss,plot_norm,plot_norm0,plot_dist,plot_grmin,plot_grmax,A)

def gD_persute_part(net,training_set,target,initial,eps,B,Lis):
    """Repeat ``gD_part`` sweeps while walking through the schedule ``Lis``.

    Each sweep calls ``gD_part`` with the current ``N`` (initially ``Lis[0]``):

    * If the sweep reports ``A`` true, its statistics are appended to the
      running histories and ``fc_partial_step(net, target, initial, 1/N, 4,
      b, True)`` is called. ``b`` equals ``B`` for the first ``B`` such
      sweeps and ``0`` afterwards. Another sweep follows with the same ``N``.
    * If ``A`` is false and the schedule has a further entry,
      ``cf_partial_step(net, target, initial, 1/N, 4, True)`` is called and
      ``N`` moves to the next entry of ``Lis``.
    * If ``A`` is false at the last entry, the histories are returned.

    Statistics of a sweep that reports ``A`` false are not recorded.

    The last entry is recognised by its position in ``Lis`` rather than by
    comparing ``N`` with ``Lis[-1]``, so a schedule that repeats a value is
    still walked through completely.

    NOTE(review): ``cf_partial_step`` and ``fc_partial_step`` are two
    different names, both expected to come from the helper modules; confirm
    that calling ``cf_partial_step`` in the level-switch branch is intended.

    Parameters
    ----------
    net, training_set, target, initial, eps
        Forwarded to ``gD_part``; ``net``, ``target`` and ``initial`` are
        also passed to the step helpers.
    B : int
        Forwarded to ``gD_part``; also the number of "A is true" sweeps for
        which ``B`` itself, rather than 0, is passed to ``fc_partial_step``.
    Lis : sequence of numbers
        Non-empty schedule of ``N`` values; ``1/N`` is the step size.

    Returns
    -------
    dict
        Keys ``'loss'``, ``'norm'``, ``'norm0'``, ``'dist'``, ``'grmin'``,
        ``'grmax'``, each mapping to the concatenated per-sample history.
    """
    state = 0
    last_state = len(Lis) - 1
    N = Lis[0]

    plot_loss, plot_norm, plot_norm0 = [], [], []
    plot_dist, plot_grmin, plot_grmax = [], [], []

    count = 0  # "A is true" sweeps already followed by a step that used B
    while True:
        (loss_s, norm_s, norm0_s, dist_s, grmin_s, grmax_s, A) = gD_part(net,training_set,target,initial,eps,B,N)

        if A:
            print('A is true')
            plot_loss.extend(loss_s)
            plot_norm.extend(norm_s)
            plot_norm0.extend(norm0_s)
            plot_dist.extend(dist_s)
            plot_grmin.extend(grmin_s)
            plot_grmax.extend(grmax_s)
            if count < B:
                fc_partial_step(net,target,initial,(1/N),4,B,True)
                count += 1
            else:
                fc_partial_step(net,target,initial,(1/N),4,0,True)
        elif state < last_state:
            print('A is not true')
            print(str(N))
            cf_partial_step(net,target,initial,(1/N),4,True)
            state += 1
            N = Lis[state]
        else:
            print('We reached the linit of the gradient')
            # Plain dict literal; the local is no longer called `dict`, which
            # shadowed the builtin.
            return {
                'loss': plot_loss,
                'norm': plot_norm,
                'norm0': plot_norm0,
                'dist': plot_dist,
                'grmin': plot_grmin,
                'grmax': plot_grmax,
            }

# Backup experiment: train a second fresh SRN with the partial-step routine
# (B = 1, schedule [10, 100, 1000]) and expose the collected histories as
# part_plot_* variables for the plotting cell.
b_net = SRN(4,2,1,num_layers=1, activation= 'sig').to(dtype = torch.float32)

# Snapshot of the parameters before training. detach() alone returns a tensor
# sharing storage with the live parameter, so any in-place parameter update
# would silently change `initial` as well. clone() makes it a real copy.
initial = []
for params in b_net.parameters():
    initial.append(params.detach().clone())

criterion = nn.CrossEntropyLoss()
lr = 0.01
eps = 10**(-14)  # threshold compared with the u_max value inside gD_part

# target parameters (same values as for the main experiment)
J = np.log(2)*128
T_U  = J*torch.tensor([[2,0],[0,2],[2,0],[0,2]])
T_Ub =  torch.tensor([0,0,0,0])
T_W  = -J*torch.tensor([[3,3,1,1],[1,1,3,3],[1,1,3,3],[3,3,1,1]])
T_Wb =  torch.tensor([0,0,0,0])
T_V  = J*torch.tensor([-1,-1,1,1])
T_Vb = torch.tensor([0])
target = [T_U,T_Ub,T_W,T_Wb,T_V,T_Vb]


dict_part = gD_persute_part(b_net,training_set,target,initial,eps,1,[10**1, 10**2, 10**3])
part_plot_loss = dict_part['loss']
part_plot_norm = dict_part['norm']
part_plot_norm0= dict_part['norm0']
part_plot_dist = dict_part['dist']
part_plot_grmin= dict_part['grmin']
part_plot_grmax= dict_part['grmax']

Plotting the experiment statistics

In [None]:
# Two rows of four panels: the main experiment (top, blue) and the backup
# experiment (bottom, orange). Columns: loss, distance to the target
# parameters, gradient infinity norm, and a pie chart of the values returned
# by affected_params for the corresponding network.

# Set the font size before the figure exists so the text created for this
# figure already uses it, on the first run of the cell as well as on re-runs.
plt.rcParams.update({'font.size': 17})
fig, ax = plt.subplots(2, 4,figsize=(22,10))
K = 26  # font size of the panel titles

# The LaTeX titles are raw strings so the backslashes reach matplotlib without
# relying on invalid escape sequences such as "\|", "\m" or "\i".

### main experiment

ax[0][0].set_yscale("log")
ax[0][0].plot(plot_loss, color='#007291')
ax[0][0].set_title("Loss", fontsize = K)

# ax[0][1].set_yscale("log")
ax[0][1].set_ylim(500,9*10**2)
ax[0][1].plot(plot_dist, color='#007291')
ax[0][1].set_title(r"$\|\mathbf{p}_k - \mathbf{p}_{Target}\|$", fontsize = K)

ax[0][2].set_yscale("log")
ax[0][2].plot(plot_grmax, color='#007291')
ax[0][2].set_title(r"$ \|\nabla \mathcal{L}(\mathcal{R}_{\mathbf{p}_k})\|_\infty $", fontsize = K)

labels, sizes = affected_params(net)

ax[0][3].pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90, colors=["#f5c600", "#9b4923"])

### backup experiment

ax[1][0].set_yscale("log")
ax[1][0].set_ylim(10**(-2),2)
ax[1][0].plot(part_plot_loss,color='#d8460b')
ax[1][0].set_title("Loss", fontsize = K)

# ax[1][1].set_yscale("log")
ax[1][1].set_ylim(500,9*10**2)
ax[1][1].plot(part_plot_dist,color='#d8460b')
ax[1][1].set_title(r"$\|\mathbf{p}_k - \mathbf{p}_{Target}\|$", fontsize = K)

ax[1][2].set_yscale("log")
ax[1][2].plot(part_plot_grmax,color='#d8460b')
ax[1][2].set_title(r"$ \|\nabla \mathcal{L}(\mathcal{R}_{\mathbf{p}_k})\|_\infty $", fontsize = K)

labels, sizes = affected_params(b_net)

ax[1][3].pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90, colors=["#f5c600", "#9b4923"])

# Lay the panels out once all curves, titles and pies exist so the spacing
# accounts for them.
fig.tight_layout(h_pad=4)

# plt.savefig('limits_of_gd_these.pdf')
plt.show()