In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
import copy
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import networkx as nx
from torch.nn.utils import parameters_to_vector, vector_to_parameters

In [2]:
# Graph implementation
def generate_graph(cluster_sizes=[100,100], pin=0.5, pout=0.01, seed=0):
    """Generate a random connected graph"""
    probs = np.array([[pin, pout],[pout, pin]])
    while True:
        g = nx.stochastic_block_model(cluster_sizes, probs)
        if nx.algorithms.components.is_connected(g):
            return g


cluster_sizes = [10, 10]
pin = 0.5
pout = 0.01
seed = 0
alpha = 1e-3
lamda = 1e-3
eta = 1e-3
no_users = sum(cluster_sizes)
batch_size = 20
epochs = 1
it = 1000
G = generate_graph(cluster_sizes, pin, pout, seed)

#nx.draw(G, with_labels=True, node_size=100, alpha=1, linewidths=10)
#plt.show()

In [3]:
# Metropolis weights 
number_nodes = G.number_of_nodes()
weights = np.zeros([number_nodes, number_nodes])
for edge in G.edges():
  i, j = edge[0], edge[1]
  weights[i - 1][j - 1] = 1 / (1 + np.max([G.degree(i), G.degree(j)]))
  weights[j - 1][i - 1] = weights[i - 1][j - 1]

print(weights)

weights = weights + np.diag(1 - np.sum(weights, axis=0))

metropolis_weights = weights
print(metropolis_weights)


[[0.         0.16666667 0.         0.         0.2        0.125
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.16666667]
 [0.16666667 0.         0.         0.         0.16666667 0.125
  0.16666667 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.16666667]
 [0.         0.         0.         0.         0.         0.125
  0.         0.         0.14285714 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.125
  0.2        0.         0.14285714 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.16666667]
 [0.2        0.16666667 0.         0.         0.         0.
  0.         0.2        0.14285714 0.         0.         0.
  0.         0.         0.         0.         0.

In [4]:
def load_dataset():
    transforms_mnist = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,),(0.3081,))])
    mnist_data_train = datasets.MNIST('./data/mnist', train=True, download=True, transform=transforms_mnist)
    mnist_data_test = datasets.MNIST('./data/mnist', train=False, download=True, transform=transforms_mnist)

    return mnist_data_train, mnist_data_test

In [5]:
def degrees(A):
    """Return the degrees of each node of a graph from its adjacency matrix"""
    return np.sum(A, axis=0).reshape(A.shape[0], 1)

def node_degree(n, G):
    cnt = 0
    for i in G.neighbors(n):
        cnt += 1
    return cnt

def get_neighbors(n, G):
    neighbors_list = []
    for i in G.neighbors(n):
        neighbors_list.append(int(i))
    return neighbors_list

In [6]:
datapoints = {}
count = 0
W1 = np.array([2, 2])
W2 = np.array([-2, 2])
W = [W1, W2]
m = 200
n = 2
noise_sd = 0.001
for i, cluster_size in enumerate(cluster_sizes):
    for j in range(cluster_size):
        features = np.random.normal(loc=0.0, scale=1.0, size=(m, n))
        label = np.dot(features, W[i]) + np.random.normal(0,noise_sd)
        datapoints[count] = {
                'features': features,
                'degree': node_degree(count, G),
                'label': label,
                'neighbors': get_neighbors(count, G)
            }
        count += 1

In [7]:
class MyDataset(Dataset):
    def __init__(self, data, targets, transform=None):
        self.data = torch.FloatTensor(data)
        self.targets = torch.FloatTensor(targets).unsqueeze(-1)
        
    def __getitem__(self, index):
        x = self.data[index]
        y = self.targets[index]

        return x, y
    
    def __len__(self):
        return len(self.data)


In [8]:
class MLP_Net(nn.Module):
    def __init__(self, user_id):
        super(MLP_Net, self).__init__()
        self.fc1 = nn.Linear(2, 1, bias=False)
        #self.fc2 = nn.Linear(4, 1, bias=False)
        #self.fc3 = nn.Linear(200, 10)
        self.user_id = user_id

    def forward(self, x):
        x = torch.flatten(x, 1)
        #x = F.relu(self.fc1(x))
        output = self.fc1(x)
        #output = self.fc3(x)
        return output

In [9]:
from typing import Iterable, Optional

def grads_to_vector(parameters: Iterable[torch.Tensor]) -> torch.Tensor:
    r"""Convert parameters to one vector

    Args:
        parameters (Iterable[Tensor]): an iterator of Tensors that are the
            parameters of a model.

    Returns:
        The parameters represented by a single vector
    """
    # Flag for the device where the parameter is located
    param_device = None

    vec = []
    for param in parameters:
        # Ensure the parameters are located in the same device
        param_device = param.grad

        vec.append(param_device.view(-1))
    return torch.cat(vec)

In [10]:
model = MLP_Net(user_id=0)

lr = 0.01

dataloader = DataLoader(MyDataset(datapoints[19]["features"], datapoints[19]["label"]), batch_size=50, shuffle=False)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
for i in range(100):
    for (x, y) in dataloader:
        criterion = nn.MSELoss()
        optimizer.zero_grad()
        yhat = model(x)
        print(y.size())
        print(yhat.size())
        loss = criterion(yhat, y)
        
        loss.backward()
        print(i, loss, grads_to_vector(model.parameters()), parameters_to_vector(model.parameters()))
        #optimizer.step()
        new_model = parameters_to_vector(model.parameters()) - lr * grads_to_vector(model.parameters())
        vector_to_parameters(parameters=model.parameters(), vec=new_model)
        #if i % 50 ==0:
            #lr *= 0.9
            

#parameters_to_vector(model.parameters())

torch.Size([50, 1])
torch.Size([50, 1])
0 tensor(2.1245, grad_fn=<MseLossBackward0>) tensor([ 1.4364, -1.5385]) tensor([-0.6163,  0.5302], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
0 tensor(3.6712, grad_fn=<MseLossBackward0>) tensor([ 2.8876, -2.3296]) tensor([-0.6306,  0.5456], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
0 tensor(5.6275, grad_fn=<MseLossBackward0>) tensor([ 3.5589, -4.5308]) tensor([-0.6595,  0.5689], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
0 tensor(2.9868, grad_fn=<MseLossBackward0>) tensor([ 2.1828, -2.2554]) tensor([-0.6951,  0.6142], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
1 tensor(1.8272, grad_fn=<MseLossBackward0>) tensor([ 1.3317, -1.4271]) tensor([-0.7169,  0.6368], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
1 tensor(3.1573, grad_fn=<MseLossBackward0>) tensor([ 2.6775, -2.1608]) tensor([-0.7303,  0.6510], grad_fn=<CatBackward0>)
torch.Size([50, 1])
to

torch.Size([50, 1])
torch.Size([50, 1])
13 tensor(0.5169, grad_fn=<MseLossBackward0>) tensor([ 1.0816, -0.8760]) tensor([-1.4869,  1.4532], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
13 tensor(0.7931, grad_fn=<MseLossBackward0>) tensor([ 1.3342, -1.7026]) tensor([-1.4978,  1.4620], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
13 tensor(0.4207, grad_fn=<MseLossBackward0>) tensor([ 0.8172, -0.8483]) tensor([-1.5111,  1.4790], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
14 tensor(0.2574, grad_fn=<MseLossBackward0>) tensor([ 0.4984, -0.5371]) tensor([-1.5193,  1.4875], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
14 tensor(0.4446, grad_fn=<MseLossBackward0>) tensor([ 1.0029, -0.8126]) tensor([-1.5243,  1.4929], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
14 tensor(0.6821, grad_fn=<MseLossBackward0>) tensor([ 1.2372, -1.5792]) tensor([-1.5343,  1.5010], grad_fn=<CatBackward0>)
torch.Size([50, 

28 tensor(0.0438, grad_fn=<MseLossBackward0>) tensor([ 0.2632, -0.2743]) tensor([-1.8425,  1.8316], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
29 tensor(0.0268, grad_fn=<MseLossBackward0>) tensor([ 0.1605, -0.1738]) tensor([-1.8451,  1.8343], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
29 tensor(0.0463, grad_fn=<MseLossBackward0>) tensor([ 0.3230, -0.2629]) tensor([-1.8467,  1.8360], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
29 tensor(0.0711, grad_fn=<MseLossBackward0>) tensor([ 0.3989, -0.5105]) tensor([-1.8499,  1.8387], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
29 tensor(0.0377, grad_fn=<MseLossBackward0>) tensor([ 0.2440, -0.2544]) tensor([-1.8539,  1.8438], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
30 tensor(0.0231, grad_fn=<MseLossBackward0>) tensor([ 0.1488, -0.1612]) tensor([-1.8564,  1.8463], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
30 tensor(0.0398

49 tensor(0.0013, grad_fn=<MseLossBackward0>) tensor([ 0.0355, -0.0386]) tensor([-1.9658,  1.9632], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
49 tensor(0.0023, grad_fn=<MseLossBackward0>) tensor([ 0.0712, -0.0584]) tensor([-1.9661,  1.9636], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
49 tensor(0.0035, grad_fn=<MseLossBackward0>) tensor([ 0.0882, -0.1134]) tensor([-1.9668,  1.9642], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
49 tensor(0.0018, grad_fn=<MseLossBackward0>) tensor([ 0.0539, -0.0563]) tensor([-1.9677,  1.9653], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
50 tensor(0.0011, grad_fn=<MseLossBackward0>) tensor([ 0.0329, -0.0358]) tensor([-1.9682,  1.9659], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
50 tensor(0.0020, grad_fn=<MseLossBackward0>) tensor([ 0.0660, -0.0542]) tensor([-1.9686,  1.9663], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
50 tensor(0.0030

66 tensor(0.0001, grad_fn=<MseLossBackward0>) tensor([ 0.0099, -0.0107]) tensor([-1.9905,  1.9898], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
66 tensor(0.0002, grad_fn=<MseLossBackward0>) tensor([ 0.0196, -0.0164]) tensor([-1.9906,  1.9899], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
66 tensor(0.0003, grad_fn=<MseLossBackward0>) tensor([ 0.0244, -0.0316]) tensor([-1.9908,  1.9901], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
66 tensor(0.0001, grad_fn=<MseLossBackward0>) tensor([ 0.0150, -0.0154]) tensor([-1.9910,  1.9904], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
67 tensor(8.8728e-05, grad_fn=<MseLossBackward0>) tensor([ 0.0092, -0.0099]) tensor([-1.9912,  1.9905], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
67 tensor(0.0002, grad_fn=<MseLossBackward0>) tensor([ 0.0182, -0.0152]) tensor([-1.9913,  1.9906], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
67 tensor(0.

84 tensor(7.6455e-06, grad_fn=<MseLossBackward0>) tensor([ 0.0027, -0.0027]) tensor([-1.9975,  1.9974], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
84 tensor(1.2200e-05, grad_fn=<MseLossBackward0>) tensor([ 0.0049, -0.0043]) tensor([-1.9976,  1.9974], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
84 tensor(1.8910e-05, grad_fn=<MseLossBackward0>) tensor([ 0.0062, -0.0083]) tensor([-1.9976,  1.9974], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
84 tensor(9.6200e-06, grad_fn=<MseLossBackward0>) tensor([ 0.0039, -0.0038]) tensor([-1.9977,  1.9975], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
85 tensor(6.6884e-06, grad_fn=<MseLossBackward0>) tensor([ 0.0025, -0.0025]) tensor([-1.9977,  1.9976], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([50, 1])
85 tensor(1.0583e-05, grad_fn=<MseLossBackward0>) tensor([ 0.0045, -0.0040]) tensor([-1.9977,  1.9976], grad_fn=<CatBackward0>)
torch.Size([50, 1])
torch.Size([

In [11]:
parameters_to_vector(model.parameters())

tensor([-1.9993,  1.9992], grad_fn=<CatBackward0>)

In [12]:
class CNN_Net(nn.Module):
    def __init__(self):
        super(CNN_Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        self.pool = nn.MaxPool2d(2,2)
        self.dropout = nn.Dropout(p=0.2)
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.dropout(x)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        output = self.fc2(x)
        return output

In [13]:
class ClientUpdate(object):
    def __init__(self, dataset, batchSize, alpha, lamda, epochs, projection_list, projected_weights):
        self.train_loader = DataLoader(MyDataset(dataset["features"], dataset["label"]), batch_size=batchSize, shuffle=True)
        #self.learning_rate = learning_rate
        self.epochs = epochs
        self.batchSize = batchSize

    def train(self, model):
        criterion = nn.MSELoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.5)

        e_loss = []
        for epoch in range(1, self.epochs+1):
            train_loss = 0
            model.train()
            for i, (data, labels) in zip(range(1), self.train_loader):
                data, labels = data, labels
                optimizer.zero_grad() 
                output = model(data)  
                loss = criterion(output, labels)
                #loss += mu/2 * torch.norm(client_param.data - server_param.data)**2
                loss.backward()
                grads = grads_to_vector(model.parameters())
                #optimizer.step()
                train_loss += loss.item()*data.size(0)
                weights = parameters_to_vector(model.parameters())
                mat_vec_sum = torch.zeros_like(weights)
                for j in G.neighbors(model.user_id):
                    mat_vec_sum = torch.add(mat_vec_sum, torch.matmul(torch.transpose(projection_list[model.user_id][j], 0, 1), 
                                                         projected_weights[j][model.user_id] - projected_weights[model.user_id][j]))
                
                model_update = parameters_to_vector(model.parameters()) - alpha * (grads + lamda * mat_vec_sum)
                
            vector_to_parameters(parameters=model.parameters(), vec=model_update)
                

            train_loss = train_loss/self.batchSize#len(self.train_loader.dataset) 
            e_loss.append(train_loss)

        total_loss = e_loss#sum(e_loss)/len(e_loss)

        return model.state_dict(), total_loss

In [14]:
# Preparing projection matrices
models = [MLP_Net(user_id=i) for i in range(no_users)]
#temp = MLP_Net()
projection_list = []
projected_weights = []

def update_ProjWeight(projection_list, projected_weights, first_run=True):
    #projected_weights = []
    for i in range(no_users):
        neighbors_mat = []
        neighbors_weights = []
        for j in range(no_users):
            if j in G.neighbors(i):
                with torch.no_grad():
                    if first_run == True:
                        row, column = parameters_to_vector(models[j].parameters()).size()[0], parameters_to_vector(models[i].parameters()).size()[0]
                        mat = torch.zeros((row, column))
                        mat.fill_diagonal_(1.0)
                        neighbors_mat.append(mat)
                        neighbors_weights.append(torch.matmul(mat, parameters_to_vector(models[j].parameters())))
                    else:
                        neighbors_weights.append(torch.matmul(projection_list[j][i], parameters_to_vector(models[j].parameters())))
            else:
                neighbors_mat.append(0)
                neighbors_weights.append(0)
        if first_run == True:
            projection_list.append(neighbors_mat)
        projected_weights.append(neighbors_weights)

update_ProjWeight(projection_list, projected_weights)



In [15]:
def testing(model, dataset, bs, criterion): 
    test_loss = 0
    correct = 0
    test_loader = DataLoader(MyDataset(dataset["features"], dataset["label"]), batch_size=bs)
    l = len(test_loader)
    model.eval()
    for data, labels in test_loader:
        data, labels = data, labels
        output = model(data)
        loss = criterion(output, labels)
        test_loss += loss.item()*data.size(0)
        #_, pred = torch.max(output, 1)
        #correct += pred.eq(labels.data.view_as(pred)).sum().item()
    
    test_loss /= len(test_loader.dataset)
    
    return test_loss

In [16]:
model = MLP_Net(user_id=0)

from torch.nn.utils import parameters_to_vector, vector_to_parameters

with torch.no_grad():    
    params = parameters_to_vector(model.parameters())

    print(params)

params *= 2.

vector_to_parameters(parameters=model.parameters(), vec=params)

parameters_to_vector(model.parameters())





tensor([0.3239, 0.5274])


tensor([0.6478, 1.0547], grad_fn=<CatBackward0>)

In [17]:
#global_model = CNN_Net().cuda()
models = [MLP_Net(user_id=i) for i in range(no_users)]
dummy_models = [MLP_Net(user_id=i) for i in range(no_users)]

#model.load_state_dict(global_model.state_dict())

criterion = nn.MSELoss()


train_loss = []
test_loss = []
test_accuracy = []


for curr_round in tqdm(range(1, it+1)):
    w, local_loss = [], []

    
    for i in range(no_users):
        dummy_models[i].load_state_dict(models[i].state_dict())
        local_update = ClientUpdate(dataset=datapoints[i], batchSize=batch_size, alpha=alpha, lamda=lamda, epochs=1, projection_list=projection_list, projected_weights=projected_weights)
        weights, loss = local_update.train(dummy_models[i])
        w.append(weights)
        local_loss.append(loss)
        models[i].load_state_dict(w[i])
        
    
    
    # Update prjection matrix
    
    #print(projection_list[0], projected_weights[0])
    
    for i in range(no_users):
        weights = parameters_to_vector(models[i].parameters())
        for j in G.neighbors(i):
            weights = parameters_to_vector(model.parameters())
            mat_vec_sum = torch.zeros_like(weights)
            for k in G.neighbors(i):
                 mat_vec_sum = torch.add(mat_vec_sum, torch.matmul(projected_weights[k][i] - projected_weights[i][k],
                                                                  torch.transpose(weights, -1, 0)))
            projection_list[i][j] = torch.add(projection_list[i][j], -1 * eta * lamda * mat_vec_sum)
                                         
    projected_weights = []                                          
    update_ProjWeight(projection_list, projected_weights, first_run=False)
        
        
        
    
    




          
            

    local_test_acc = []
    local_test_loss = []
    for k in range(no_users):
      
      g_loss = testing(models[i], datapoints[i], 50, criterion)
      local_test_loss.append(g_loss)
    
        

    g_loss = sum(local_test_loss) / len(local_test_loss)
    #g_accuracy = sum(local_test_acc) / len(local_test_acc)
    
    

    test_loss.append(g_loss)
    #test_accuracy.append(g_accuracy)
    print("Training_loss %2.5f"% (test_loss[-1]))

  0%|          | 1/1000 [00:00<01:50,  9.03it/s]

Training_loss 7.66938


  0%|          | 2/1000 [00:00<02:04,  7.99it/s]

Training_loss 7.63382


  0%|          | 4/1000 [00:00<01:35, 10.44it/s]

Training_loss 7.58794
Training_loss 7.54415
Training_loss 7.51700


  1%|          | 6/1000 [00:00<01:37, 10.17it/s]

Training_loss 7.48543
Training_loss 7.44289


  1%|          | 8/1000 [00:00<01:32, 10.73it/s]

Training_loss 7.41001
Training_loss 7.39108


  1%|          | 10/1000 [00:00<01:39, 10.00it/s]

Training_loss 7.35850
Training_loss 7.32449


  1%|          | 12/1000 [00:01<01:34, 10.45it/s]

Training_loss 7.28578


  1%|▏         | 14/1000 [00:01<01:31, 10.74it/s]

Training_loss 7.27153
Training_loss 7.25119
Training_loss 7.22573


  2%|▏         | 16/1000 [00:01<01:37, 10.11it/s]

Training_loss 7.19036
Training_loss 7.15395


  2%|▏         | 18/1000 [00:01<01:35, 10.33it/s]

Training_loss 7.10490


  2%|▏         | 20/1000 [00:01<01:30, 10.79it/s]

Training_loss 7.07572
Training_loss 7.04940


  2%|▏         | 22/1000 [00:02<01:46,  9.21it/s]

Training_loss 7.03318
Training_loss 7.00368


  2%|▏         | 24/1000 [00:02<02:12,  7.38it/s]

Training_loss 6.97759
Training_loss 6.95628


  2%|▎         | 25/1000 [00:02<02:27,  6.63it/s]

Training_loss 6.93860


  3%|▎         | 27/1000 [00:03<02:45,  5.89it/s]

Training_loss 6.92376
Training_loss 6.88579


  3%|▎         | 29/1000 [00:03<02:14,  7.21it/s]

Training_loss 6.86636
Training_loss 6.82787


  3%|▎         | 31/1000 [00:03<02:17,  7.06it/s]

Training_loss 6.79167
Training_loss 6.76155
Training_loss 6.74735


  4%|▎         | 35/1000 [00:04<01:40,  9.62it/s]

Training_loss 6.71490
Training_loss 6.69961
Training_loss 6.67529


  4%|▎         | 37/1000 [00:04<01:48,  8.89it/s]

Training_loss 6.63101
Training_loss 6.61465


  4%|▍         | 40/1000 [00:04<01:46,  9.00it/s]

Training_loss 6.59375
Training_loss 6.57700
Training_loss 6.55431


  4%|▍         | 42/1000 [00:04<01:59,  8.01it/s]

Training_loss 6.53114
Training_loss 6.51237


  4%|▍         | 45/1000 [00:05<01:45,  9.05it/s]

Training_loss 6.49960
Training_loss 6.47463
Training_loss 6.45845


  5%|▍         | 47/1000 [00:05<02:04,  7.63it/s]

Training_loss 6.43327
Training_loss 6.41988


  5%|▌         | 50/1000 [00:05<01:46,  8.95it/s]

Training_loss 6.38324
Training_loss 6.34597
Training_loss 6.33319


  5%|▌         | 52/1000 [00:05<01:31, 10.31it/s]

Training_loss 6.31291
Training_loss 6.30274
Training_loss 6.27738


  6%|▌         | 56/1000 [00:06<01:22, 11.43it/s]

Training_loss 6.25458
Training_loss 6.23749
Training_loss 6.20079


  6%|▌         | 58/1000 [00:06<01:21, 11.61it/s]

Training_loss 6.18184
Training_loss 6.15995
Training_loss 6.14081


  6%|▌         | 62/1000 [00:06<01:19, 11.81it/s]

Training_loss 6.11848
Training_loss 6.09975
Training_loss 6.08842


  6%|▋         | 64/1000 [00:06<01:17, 12.02it/s]

Training_loss 6.05299
Training_loss 6.03839
Training_loss 6.01895


  7%|▋         | 68/1000 [00:07<01:12, 12.81it/s]

Training_loss 5.99520
Training_loss 5.96377
Training_loss 5.92494


  7%|▋         | 70/1000 [00:07<01:23, 11.20it/s]

Training_loss 5.90830
Training_loss 5.88502


  7%|▋         | 72/1000 [00:07<01:19, 11.66it/s]

Training_loss 5.86818
Training_loss 5.86060
Training_loss 5.84697


  8%|▊         | 76/1000 [00:08<01:26, 10.64it/s]

Training_loss 5.81102
Training_loss 5.79529
Training_loss 5.77081


  8%|▊         | 78/1000 [00:08<01:22, 11.19it/s]

Training_loss 5.75656
Training_loss 5.72931
Training_loss 5.72049


  8%|▊         | 82/1000 [00:08<01:18, 11.63it/s]

Training_loss 5.69106
Training_loss 5.67608
Training_loss 5.65470


  8%|▊         | 84/1000 [00:08<01:14, 12.34it/s]

Training_loss 5.63342
Training_loss 5.60888
Training_loss 5.58936


  9%|▉         | 88/1000 [00:08<01:12, 12.52it/s]

Training_loss 5.55627
Training_loss 5.54420
Training_loss 5.52328


  9%|▉         | 90/1000 [00:09<01:13, 12.36it/s]

Training_loss 5.50934
Training_loss 5.48127
Training_loss 5.44762


  9%|▉         | 94/1000 [00:09<01:09, 13.06it/s]

Training_loss 5.42876
Training_loss 5.39388
Training_loss 5.36778


 10%|▉         | 96/1000 [00:09<01:15, 12.01it/s]

Training_loss 5.35043
Training_loss 5.32617
Training_loss 5.30910


 10%|█         | 100/1000 [00:09<01:14, 12.14it/s]

Training_loss 5.27313
Training_loss 5.25621
Training_loss 5.24240


 10%|█         | 102/1000 [00:10<01:13, 12.21it/s]

Training_loss 5.22229
Training_loss 5.19939
Training_loss 5.17631


 11%|█         | 106/1000 [00:10<01:09, 12.81it/s]

Training_loss 5.14540
Training_loss 5.11177
Training_loss 5.10116


 11%|█         | 108/1000 [00:10<01:11, 12.42it/s]

Training_loss 5.08887
Training_loss 5.06665
Training_loss 5.05402


 11%|█         | 112/1000 [00:10<01:14, 11.88it/s]

Training_loss 5.04481
Training_loss 5.01658
Training_loss 5.00831


 11%|█▏        | 114/1000 [00:11<01:10, 12.59it/s]

Training_loss 4.99054
Training_loss 4.97097
Training_loss 4.96004


 12%|█▏        | 118/1000 [00:11<01:09, 12.65it/s]

Training_loss 4.94527
Training_loss 4.93555
Training_loss 4.91050


 12%|█▏        | 120/1000 [00:11<01:10, 12.41it/s]

Training_loss 4.88593
Training_loss 4.87183


 12%|█▏        | 122/1000 [00:11<01:22, 10.70it/s]

Training_loss 4.86356
Training_loss 4.83620
Training_loss 4.81506


 12%|█▏        | 124/1000 [00:11<01:19, 11.00it/s]

Training_loss 4.80220
Training_loss 4.78663
Training_loss 4.77673


 13%|█▎        | 128/1000 [00:12<01:31,  9.51it/s]

Training_loss 4.75759
Training_loss 4.74285


 13%|█▎        | 130/1000 [00:12<01:38,  8.83it/s]

Training_loss 4.73096
Training_loss 4.71262


 13%|█▎        | 132/1000 [00:12<01:38,  8.83it/s]

Training_loss 4.69023
Training_loss 4.67720


 13%|█▎        | 134/1000 [00:13<01:39,  8.71it/s]

Training_loss 4.66408
Training_loss 4.63458


 14%|█▎        | 136/1000 [00:13<02:10,  6.64it/s]

Training_loss 4.61796
Training_loss 4.59683


 14%|█▍        | 138/1000 [00:13<01:54,  7.51it/s]

Training_loss 4.57515
Training_loss 4.55787


 14%|█▍        | 140/1000 [00:14<01:44,  8.20it/s]

Training_loss 4.53684
Training_loss 4.52114
Training_loss 4.50612


 14%|█▍        | 144/1000 [00:14<01:25, 10.01it/s]

Training_loss 4.48499
Training_loss 4.46352
Training_loss 4.43714


 15%|█▍        | 146/1000 [00:14<01:18, 10.85it/s]

Training_loss 4.41981
Training_loss 4.40316
Training_loss 4.37533


 15%|█▌        | 150/1000 [00:14<01:14, 11.49it/s]

Training_loss 4.35664
Training_loss 4.33814
Training_loss 4.32648


 15%|█▌        | 152/1000 [00:15<01:13, 11.58it/s]

Training_loss 4.30788
Training_loss 4.28676
Training_loss 4.27367


 16%|█▌        | 156/1000 [00:15<01:12, 11.56it/s]

Training_loss 4.25098
Training_loss 4.24071
Training_loss 4.22964


 16%|█▌        | 158/1000 [00:15<01:10, 11.96it/s]

Training_loss 4.21681
Training_loss 4.20414
Training_loss 4.18714


 16%|█▌        | 162/1000 [00:15<01:06, 12.65it/s]

Training_loss 4.16645
Training_loss 4.15028
Training_loss 4.13474
Training_loss 4.12154


 17%|█▋        | 166/1000 [00:16<01:05, 12.66it/s]

Training_loss 4.10904
Training_loss 4.09824
Training_loss 4.08547


 17%|█▋        | 168/1000 [00:16<01:07, 12.39it/s]

Training_loss 4.07086
Training_loss 4.05886
Training_loss 4.04720


 17%|█▋        | 170/1000 [00:16<01:16, 10.91it/s]

Training_loss 4.03238
Training_loss 4.01539


 17%|█▋        | 172/1000 [00:16<01:24,  9.77it/s]

Training_loss 3.99536
Training_loss 3.97440


 18%|█▊        | 175/1000 [00:17<01:30,  9.07it/s]

Training_loss 3.95770
Training_loss 3.93635


 18%|█▊        | 177/1000 [00:17<01:27,  9.43it/s]

Training_loss 3.92177
Training_loss 3.91010
Training_loss 3.88750


 18%|█▊        | 181/1000 [00:17<01:15, 10.84it/s]

Training_loss 3.86505
Training_loss 3.85583
Training_loss 3.84256


 18%|█▊        | 183/1000 [00:17<01:15, 10.78it/s]

Training_loss 3.82641
Training_loss 3.82158
Training_loss 3.80880


 19%|█▊        | 187/1000 [00:18<01:11, 11.39it/s]

Training_loss 3.79915
Training_loss 3.78885
Training_loss 3.77756


 19%|█▉        | 189/1000 [00:18<01:09, 11.66it/s]

Training_loss 3.76092
Training_loss 3.74438
Training_loss 3.72641


 19%|█▉        | 193/1000 [00:18<01:05, 12.31it/s]

Training_loss 3.71424
Training_loss 3.69866
Training_loss 3.68808


 20%|█▉        | 195/1000 [00:18<01:04, 12.45it/s]

Training_loss 3.67606
Training_loss 3.65650
Training_loss 3.65189


 20%|█▉        | 199/1000 [00:19<01:02, 12.87it/s]

Training_loss 3.63818
Training_loss 3.63249
Training_loss 3.62254


 20%|██        | 201/1000 [00:19<01:06, 11.96it/s]

Training_loss 3.61113
Training_loss 3.59112
Training_loss 3.57676


 20%|██        | 205/1000 [00:19<01:12, 10.92it/s]

Training_loss 3.56336
Training_loss 3.55792
Training_loss 3.54122


 21%|██        | 207/1000 [00:19<01:18, 10.11it/s]

Training_loss 3.53361
Training_loss 3.51646


 21%|██        | 209/1000 [00:20<01:18, 10.03it/s]

Training_loss 3.50720
Training_loss 3.49791
Training_loss 3.48268


 21%|██▏       | 213/1000 [00:20<01:13, 10.67it/s]

Training_loss 3.47607
Training_loss 3.47013
Training_loss 3.45952


 22%|██▏       | 215/1000 [00:20<01:17, 10.15it/s]

Training_loss 3.44835
Training_loss 3.43226
Training_loss 3.41945


 22%|██▏       | 218/1000 [00:21<01:20,  9.70it/s]

Training_loss 3.39657
Training_loss 3.38300


 22%|██▏       | 220/1000 [00:21<01:22,  9.46it/s]

Training_loss 3.36250
Training_loss 3.34240


 22%|██▏       | 222/1000 [00:21<01:32,  8.41it/s]

Training_loss 3.33057
Training_loss 3.31619


 22%|██▏       | 224/1000 [00:21<01:31,  8.44it/s]

Training_loss 3.29098
Training_loss 3.27809


 23%|██▎       | 227/1000 [00:22<01:22,  9.40it/s]

Training_loss 3.26271
Training_loss 3.25484
Training_loss 3.22771


 23%|██▎       | 229/1000 [00:22<01:13, 10.44it/s]

Training_loss 3.21084
Training_loss 3.19650
Training_loss 3.18246


 23%|██▎       | 231/1000 [00:22<01:13, 10.52it/s]

Training_loss 3.17197
Training_loss 3.15198


 23%|██▎       | 234/1000 [00:22<01:21,  9.39it/s]

Training_loss 3.14099
Training_loss 3.13206


 24%|██▎       | 236/1000 [00:23<01:22,  9.23it/s]

Training_loss 3.12641
Training_loss 3.11834


 24%|██▍       | 238/1000 [00:23<01:24,  9.05it/s]

Training_loss 3.10728
Training_loss 3.09099


 24%|██▍       | 240/1000 [00:23<01:28,  8.63it/s]

Training_loss 3.07769
Training_loss 3.06821


 24%|██▍       | 242/1000 [00:23<01:27,  8.68it/s]

Training_loss 3.05714
Training_loss 3.04636


 24%|██▍       | 244/1000 [00:23<01:32,  8.21it/s]

Training_loss 3.03169
Training_loss 3.02164


 25%|██▍       | 246/1000 [00:24<01:21,  9.28it/s]

Training_loss 3.01513
Training_loss 3.01107


 25%|██▍       | 248/1000 [00:24<01:22,  9.11it/s]

Training_loss 3.00141
Training_loss 2.99333
Training_loss 2.98335


 25%|██▌       | 252/1000 [00:24<01:07, 11.08it/s]

Training_loss 2.97527
Training_loss 2.95707
Training_loss 2.94873


 25%|██▌       | 254/1000 [00:24<01:03, 11.69it/s]

Training_loss 2.93607
Training_loss 2.92476
Training_loss 2.90634


 26%|██▌       | 258/1000 [00:25<01:01, 12.13it/s]

Training_loss 2.89547
Training_loss 2.88008
Training_loss 2.87087


 26%|██▌       | 260/1000 [00:25<01:00, 12.17it/s]

Training_loss 2.85891
Training_loss 2.85052
Training_loss 2.83475


 26%|██▋       | 264/1000 [00:25<01:04, 11.38it/s]

Training_loss 2.83008
Training_loss 2.82046
Training_loss 2.81078


 27%|██▋       | 266/1000 [00:25<01:08, 10.77it/s]

Training_loss 2.80116
Training_loss 2.79517


 27%|██▋       | 268/1000 [00:26<01:07, 10.91it/s]

Training_loss 2.78923
Training_loss 2.77781
Training_loss 2.76956


 27%|██▋       | 272/1000 [00:26<01:03, 11.44it/s]

Training_loss 2.75753
Training_loss 2.75053
Training_loss 2.73787


 27%|██▋       | 274/1000 [00:26<01:02, 11.70it/s]

Training_loss 2.72248
Training_loss 2.71355
Training_loss 2.69260


 28%|██▊       | 278/1000 [00:26<01:01, 11.84it/s]

Training_loss 2.68225
Training_loss 2.67643
Training_loss 2.66979


 28%|██▊       | 280/1000 [00:27<01:03, 11.35it/s]

Training_loss 2.66196
Training_loss 2.65410
Training_loss 2.64426


 28%|██▊       | 282/1000 [00:27<01:20,  8.95it/s]

Training_loss 2.63399


 28%|██▊       | 284/1000 [00:27<01:33,  7.65it/s]

Training_loss 2.62316
Training_loss 2.61589


 29%|██▊       | 286/1000 [00:28<01:33,  7.66it/s]

Training_loss 2.60538
Training_loss 2.59149


 29%|██▉       | 288/1000 [00:28<01:30,  7.89it/s]

Training_loss 2.58023
Training_loss 2.57541


 29%|██▉       | 291/1000 [00:28<01:15,  9.36it/s]

Training_loss 2.56280
Training_loss 2.54453
Training_loss 2.53252


 29%|██▉       | 293/1000 [00:28<01:31,  7.74it/s]

Training_loss 2.52093
Training_loss 2.51105


 30%|██▉       | 296/1000 [00:29<01:18,  8.91it/s]

Training_loss 2.50014
Training_loss 2.49014
Training_loss 2.48258


 30%|██▉       | 298/1000 [00:29<01:21,  8.57it/s]

Training_loss 2.47039
Training_loss 2.45793


 30%|███       | 300/1000 [00:29<01:22,  8.44it/s]

Training_loss 2.44527
Training_loss 2.43964
Training_loss 2.42595


 30%|███       | 303/1000 [00:30<01:22,  8.46it/s]

Training_loss 2.42211
Training_loss 2.41707


 31%|███       | 306/1000 [00:30<01:09, 10.00it/s]

Training_loss 2.40942
Training_loss 2.40263
Training_loss 2.39337


 31%|███       | 308/1000 [00:30<01:03, 10.93it/s]

Training_loss 2.38424
Training_loss 2.37675
Training_loss 2.36727


 31%|███       | 310/1000 [00:30<01:05, 10.57it/s]

Training_loss 2.36093
Training_loss 2.35098


 31%|███▏      | 314/1000 [00:31<01:07, 10.12it/s]

Training_loss 2.33639
Training_loss 2.32690
Training_loss 2.32194


 32%|███▏      | 318/1000 [00:31<00:58, 11.63it/s]

Training_loss 2.31622
Training_loss 2.30691
Training_loss 2.30037
Training_loss 2.29547


 32%|███▏      | 320/1000 [00:31<00:59, 11.36it/s]

Training_loss 2.28384
Training_loss 2.27795


 32%|███▏      | 322/1000 [00:31<01:03, 10.61it/s]

Training_loss 2.27129
Training_loss 2.26496
Training_loss 2.25769


 33%|███▎      | 326/1000 [00:32<00:59, 11.24it/s]

Training_loss 2.25004
Training_loss 2.23576
Training_loss 2.22798


 33%|███▎      | 328/1000 [00:32<01:01, 10.84it/s]

Training_loss 2.22396
Training_loss 2.21475


 33%|███▎      | 330/1000 [00:32<01:05, 10.24it/s]

Training_loss 2.20142
Training_loss 2.19654
Training_loss 2.18889


 33%|███▎      | 334/1000 [00:32<00:59, 11.14it/s]

Training_loss 2.18079
Training_loss 2.16906
Training_loss 2.16267
Training_loss 2.15716


 34%|███▎      | 337/1000 [00:33<01:14,  8.93it/s]

Training_loss 2.15123
Training_loss 2.14498


 34%|███▍      | 339/1000 [00:33<01:08,  9.67it/s]

Training_loss 2.14209
Training_loss 2.13630
Training_loss 2.13022


 34%|███▍      | 343/1000 [00:33<00:59, 11.05it/s]

Training_loss 2.12440
Training_loss 2.11831
Training_loss 2.11120


 34%|███▍      | 345/1000 [00:34<00:57, 11.46it/s]

Training_loss 2.09956
Training_loss 2.09132
Training_loss 2.08569


 35%|███▍      | 349/1000 [00:34<00:59, 11.03it/s]

Training_loss 2.07957
Training_loss 2.07263
Training_loss 2.06808


 35%|███▌      | 351/1000 [00:34<01:02, 10.33it/s]

Training_loss 2.05878
Training_loss 2.05355


 35%|███▌      | 353/1000 [00:34<01:03, 10.18it/s]

Training_loss 2.04413
Training_loss 2.03827
Training_loss 2.03184


 36%|███▌      | 357/1000 [00:35<01:00, 10.60it/s]

Training_loss 2.02165
Training_loss 2.01782
Training_loss 2.00447


 36%|███▌      | 359/1000 [00:35<00:58, 10.89it/s]

Training_loss 1.99367
Training_loss 1.98783
Training_loss 1.97739


 36%|███▋      | 363/1000 [00:35<00:57, 11.00it/s]

Training_loss 1.96984
Training_loss 1.95883
Training_loss 1.95388


 36%|███▋      | 365/1000 [00:35<00:55, 11.36it/s]

Training_loss 1.94325
Training_loss 1.93767
Training_loss 1.93143


 37%|███▋      | 369/1000 [00:36<00:56, 11.24it/s]

Training_loss 1.92752
Training_loss 1.92334
Training_loss 1.91447


 37%|███▋      | 371/1000 [00:36<00:54, 11.53it/s]

Training_loss 1.90701
Training_loss 1.89359
Training_loss 1.88433


 38%|███▊      | 375/1000 [00:36<00:53, 11.74it/s]

Training_loss 1.87447
Training_loss 1.86373
Training_loss 1.85759


 38%|███▊      | 377/1000 [00:36<00:52, 11.77it/s]

Training_loss 1.84824
Training_loss 1.84296
Training_loss 1.83520


 38%|███▊      | 381/1000 [00:37<00:52, 11.68it/s]

Training_loss 1.82549
Training_loss 1.81726
Training_loss 1.80877


 38%|███▊      | 383/1000 [00:37<00:52, 11.86it/s]

Training_loss 1.79908
Training_loss 1.79462
Training_loss 1.78950


 39%|███▊      | 387/1000 [00:37<00:50, 12.19it/s]

Training_loss 1.78564
Training_loss 1.77870
Training_loss 1.77133


 39%|███▉      | 389/1000 [00:37<00:49, 12.32it/s]

Training_loss 1.76475
Training_loss 1.75852
Training_loss 1.75226


 39%|███▉      | 393/1000 [00:38<00:49, 12.33it/s]

Training_loss 1.74502
Training_loss 1.73775
Training_loss 1.73412


 40%|███▉      | 395/1000 [00:38<00:52, 11.43it/s]

Training_loss 1.72972
Training_loss 1.72419
Training_loss 1.72152


 40%|███▉      | 399/1000 [00:38<00:49, 12.10it/s]

Training_loss 1.71000
Training_loss 1.70749
Training_loss 1.70349


 40%|████      | 401/1000 [00:38<00:48, 12.28it/s]

Training_loss 1.69757
Training_loss 1.68832
Training_loss 1.68286


 40%|████      | 405/1000 [00:39<00:51, 11.58it/s]

Training_loss 1.67442
Training_loss 1.66939
Training_loss 1.65989


 41%|████      | 407/1000 [00:39<00:52, 11.25it/s]

Training_loss 1.65321
Training_loss 1.64411
Training_loss 1.63986


 41%|████      | 411/1000 [00:39<00:51, 11.47it/s]

Training_loss 1.63543
Training_loss 1.63156
Training_loss 1.62800


 41%|████▏     | 413/1000 [00:39<00:51, 11.49it/s]

Training_loss 1.61762
Training_loss 1.61352
Training_loss 1.61001


 42%|████▏     | 417/1000 [00:40<00:49, 11.78it/s]

Training_loss 1.60249
Training_loss 1.59771
Training_loss 1.59299


 42%|████▏     | 419/1000 [00:40<00:48, 11.86it/s]

Training_loss 1.58731
Training_loss 1.58125
Training_loss 1.57546


 42%|████▏     | 423/1000 [00:40<00:47, 12.25it/s]

Training_loss 1.56951
Training_loss 1.56196
Training_loss 1.55868


 42%|████▎     | 425/1000 [00:40<00:45, 12.51it/s]

Training_loss 1.55286
Training_loss 1.54930
Training_loss 1.54323


 43%|████▎     | 429/1000 [00:41<00:46, 12.38it/s]

Training_loss 1.53674
Training_loss 1.52758
Training_loss 1.52233


 43%|████▎     | 431/1000 [00:41<00:50, 11.34it/s]

Training_loss 1.51751
Training_loss 1.51225


 43%|████▎     | 433/1000 [00:41<00:50, 11.23it/s]

Training_loss 1.50636
Training_loss 1.50058
Training_loss 1.49858


 44%|████▎     | 437/1000 [00:41<00:47, 11.75it/s]

Training_loss 1.49311
Training_loss 1.48935
Training_loss 1.48404


 44%|████▍     | 439/1000 [00:42<00:46, 11.94it/s]

Training_loss 1.48004
Training_loss 1.47227
Training_loss 1.46561


 44%|████▍     | 443/1000 [00:42<00:48, 11.47it/s]

Training_loss 1.45904
Training_loss 1.45410
Training_loss 1.44962


 44%|████▍     | 445/1000 [00:42<00:47, 11.61it/s]

Training_loss 1.44118
Training_loss 1.43690
Training_loss 1.43235
Training_loss 1.42843


 45%|████▍     | 449/1000 [00:42<00:45, 12.11it/s]

Training_loss 1.42256
Training_loss 1.41922
Training_loss 1.41328


 45%|████▌     | 453/1000 [00:43<00:45, 11.95it/s]

Training_loss 1.40922
Training_loss 1.40205
Training_loss 1.39836


 46%|████▌     | 455/1000 [00:43<00:46, 11.79it/s]

Training_loss 1.39376
Training_loss 1.38840
Training_loss 1.38484


 46%|████▌     | 459/1000 [00:43<00:47, 11.35it/s]

Training_loss 1.37981
Training_loss 1.37426
Training_loss 1.37152


 46%|████▌     | 461/1000 [00:44<00:47, 11.29it/s]

Training_loss 1.36854
Training_loss 1.36101
Training_loss 1.35453


 46%|████▋     | 465/1000 [00:44<00:49, 10.86it/s]

Training_loss 1.34937
Training_loss 1.34716
Training_loss 1.34507


 47%|████▋     | 467/1000 [00:44<00:52, 10.19it/s]

Training_loss 1.33974
Training_loss 1.33631
Training_loss 1.33056


 47%|████▋     | 471/1000 [00:45<00:53,  9.89it/s]

Training_loss 1.32379
Training_loss 1.31985
Training_loss 1.31524


 47%|████▋     | 473/1000 [00:45<00:50, 10.43it/s]

Training_loss 1.31124
Training_loss 1.30517
Training_loss 1.30075


 48%|████▊     | 475/1000 [00:45<00:51, 10.27it/s]

Training_loss 1.29604
Training_loss 1.28892
Training_loss 1.28252


 48%|████▊     | 479/1000 [00:45<00:50, 10.32it/s]

Training_loss 1.27796
Training_loss 1.27509
Training_loss 1.26945


 48%|████▊     | 483/1000 [00:46<00:47, 10.93it/s]

Training_loss 1.26566
Training_loss 1.26232
Training_loss 1.25656


 48%|████▊     | 485/1000 [00:46<00:45, 11.26it/s]

Training_loss 1.25242
Training_loss 1.24842
Training_loss 1.24357


 49%|████▉     | 489/1000 [00:46<00:45, 11.27it/s]

Training_loss 1.24002
Training_loss 1.23610
Training_loss 1.23447


 49%|████▉     | 491/1000 [00:46<00:43, 11.71it/s]

Training_loss 1.23146
Training_loss 1.22802
Training_loss 1.22330


 50%|████▉     | 495/1000 [00:47<00:46, 10.87it/s]

Training_loss 1.21955
Training_loss 1.21463
Training_loss 1.20922


 50%|████▉     | 497/1000 [00:47<00:45, 11.01it/s]

Training_loss 1.20437
Training_loss 1.19977
Training_loss 1.19682


 50%|█████     | 501/1000 [00:47<00:44, 11.14it/s]

Training_loss 1.18984
Training_loss 1.18616
Training_loss 1.18161


 50%|█████     | 503/1000 [00:47<00:43, 11.34it/s]

Training_loss 1.17851
Training_loss 1.17325
Training_loss 1.17109


 51%|█████     | 507/1000 [00:48<00:41, 11.86it/s]

Training_loss 1.16660
Training_loss 1.16226
Training_loss 1.15840


 51%|█████     | 509/1000 [00:48<00:40, 12.12it/s]

Training_loss 1.15318
Training_loss 1.14794
Training_loss 1.14250


 51%|█████▏    | 513/1000 [00:48<00:39, 12.23it/s]

Training_loss 1.13880
Training_loss 1.13547
Training_loss 1.13064


 52%|█████▏    | 515/1000 [00:48<00:39, 12.26it/s]

Training_loss 1.12562
Training_loss 1.11964
Training_loss 1.11321


 52%|█████▏    | 517/1000 [00:49<00:40, 12.04it/s]

Training_loss 1.10990
Training_loss 1.10607


 52%|█████▏    | 519/1000 [00:49<00:43, 11.07it/s]

Training_loss 1.10109
Training_loss 1.09592


 52%|█████▏    | 523/1000 [00:49<00:43, 10.96it/s]

Training_loss 1.09293
Training_loss 1.08965
Training_loss 1.08580


 52%|█████▎    | 525/1000 [00:49<00:44, 10.72it/s]

Training_loss 1.08212
Training_loss 1.07953
Training_loss 1.07609


 53%|█████▎    | 529/1000 [00:50<00:40, 11.71it/s]

Training_loss 1.07315
Training_loss 1.06746
Training_loss 1.06450


 53%|█████▎    | 531/1000 [00:50<00:40, 11.66it/s]

Training_loss 1.06201
Training_loss 1.05741
Training_loss 1.05449


 54%|█████▎    | 535/1000 [00:50<00:41, 11.10it/s]

Training_loss 1.05054
Training_loss 1.04399
Training_loss 1.03770


 54%|█████▎    | 537/1000 [00:50<00:40, 11.29it/s]

Training_loss 1.03493
Training_loss 1.03091
Training_loss 1.02796


 54%|█████▍    | 541/1000 [00:51<00:39, 11.59it/s]

Training_loss 1.02476
Training_loss 1.02080
Training_loss 1.01681


 54%|█████▍    | 543/1000 [00:51<00:39, 11.68it/s]

Training_loss 1.01381
Training_loss 1.00990
Training_loss 1.00561


 55%|█████▍    | 547/1000 [00:51<00:39, 11.53it/s]

Training_loss 1.00179
Training_loss 0.99941
Training_loss 0.99435


 55%|█████▍    | 549/1000 [00:51<00:40, 11.09it/s]

Training_loss 0.98998
Training_loss 0.98712
Training_loss 0.98304


 55%|█████▌    | 553/1000 [00:52<00:38, 11.50it/s]

Training_loss 0.98012
Training_loss 0.97682
Training_loss 0.97126


 56%|█████▌    | 555/1000 [00:52<00:38, 11.55it/s]

Training_loss 0.96799
Training_loss 0.96526
Training_loss 0.96067


 56%|█████▌    | 559/1000 [00:52<00:37, 11.87it/s]

Training_loss 0.95949
Training_loss 0.95625
Training_loss 0.95238


 56%|█████▌    | 561/1000 [00:52<00:36, 12.18it/s]

Training_loss 0.94661
Training_loss 0.94285
Training_loss 0.93825


 56%|█████▋    | 565/1000 [00:53<00:35, 12.12it/s]

Training_loss 0.93653
Training_loss 0.93253
Training_loss 0.92441


 57%|█████▋    | 567/1000 [00:53<00:35, 12.15it/s]

Training_loss 0.91914
Training_loss 0.91615
Training_loss 0.91050


 57%|█████▋    | 571/1000 [00:53<00:34, 12.28it/s]

Training_loss 0.90440
Training_loss 0.90006
Training_loss 0.89594


 57%|█████▋    | 573/1000 [00:53<00:34, 12.22it/s]

Training_loss 0.89319
Training_loss 0.88786
Training_loss 0.88610


 58%|█████▊    | 577/1000 [00:54<00:34, 12.10it/s]

Training_loss 0.88385
Training_loss 0.88233
Training_loss 0.87946


 58%|█████▊    | 579/1000 [00:54<00:34, 12.20it/s]

Training_loss 0.87679
Training_loss 0.87335
Training_loss 0.86917


 58%|█████▊    | 583/1000 [00:54<00:36, 11.29it/s]

Training_loss 0.86698
Training_loss 0.86534
Training_loss 0.86286


 58%|█████▊    | 585/1000 [00:54<00:35, 11.55it/s]

Training_loss 0.85543
Training_loss 0.85170
Training_loss 0.84993


 59%|█████▉    | 589/1000 [00:55<00:35, 11.54it/s]

Training_loss 0.84900
Training_loss 0.84571
Training_loss 0.84222


 59%|█████▉    | 591/1000 [00:55<00:34, 11.72it/s]

Training_loss 0.84075
Training_loss 0.83514
Training_loss 0.83096


 60%|█████▉    | 595/1000 [00:55<00:33, 12.23it/s]

Training_loss 0.82806
Training_loss 0.82591
Training_loss 0.82274


 60%|█████▉    | 597/1000 [00:55<00:32, 12.43it/s]

Training_loss 0.81864
Training_loss 0.81523
Training_loss 0.80931


 60%|██████    | 601/1000 [00:56<00:33, 11.94it/s]

Training_loss 0.80660
Training_loss 0.80314
Training_loss 0.80020


 60%|██████    | 603/1000 [00:56<00:32, 12.31it/s]

Training_loss 0.79804
Training_loss 0.79656
Training_loss 0.79475


 61%|██████    | 607/1000 [00:56<00:32, 12.22it/s]

Training_loss 0.79234
Training_loss 0.78979
Training_loss 0.78837


 61%|██████    | 609/1000 [00:56<00:31, 12.28it/s]

Training_loss 0.78568
Training_loss 0.78263
Training_loss 0.77988


 61%|██████▏   | 613/1000 [00:57<00:31, 12.47it/s]

Training_loss 0.77665
Training_loss 0.77554
Training_loss 0.77174


 62%|██████▏   | 615/1000 [00:57<00:31, 12.35it/s]

Training_loss 0.76977
Training_loss 0.76713
Training_loss 0.76439


 62%|██████▏   | 619/1000 [00:57<00:32, 11.55it/s]

Training_loss 0.76016
Training_loss 0.75901
Training_loss 0.75728


 62%|██████▏   | 621/1000 [00:57<00:31, 12.06it/s]

Training_loss 0.75515
Training_loss 0.75230
Training_loss 0.74950


 62%|██████▎   | 625/1000 [00:58<00:33, 11.32it/s]

Training_loss 0.74711
Training_loss 0.74248
Training_loss 0.74110


 63%|██████▎   | 627/1000 [00:58<00:32, 11.56it/s]

Training_loss 0.73794
Training_loss 0.73421
Training_loss 0.73012


 63%|██████▎   | 631/1000 [00:58<00:31, 11.58it/s]

Training_loss 0.72849
Training_loss 0.72609
Training_loss 0.72299


 63%|██████▎   | 633/1000 [00:58<00:31, 11.50it/s]

Training_loss 0.71784
Training_loss 0.71598
Training_loss 0.71464


 64%|██████▎   | 637/1000 [00:59<00:30, 11.76it/s]

Training_loss 0.71349
Training_loss 0.71024
Training_loss 0.70745


 64%|██████▍   | 639/1000 [00:59<00:30, 11.87it/s]

Training_loss 0.70552
Training_loss 0.70317
Training_loss 0.70080


 64%|██████▍   | 643/1000 [00:59<00:31, 11.25it/s]

Training_loss 0.69935
Training_loss 0.69593
Training_loss 0.69207


 64%|██████▍   | 645/1000 [01:00<00:31, 11.28it/s]

Training_loss 0.68932
Training_loss 0.68654
Training_loss 0.68367


 65%|██████▍   | 647/1000 [01:00<00:31, 11.38it/s]

Training_loss 0.68176
Training_loss 0.67851


 65%|██████▌   | 651/1000 [01:00<00:32, 10.88it/s]

Training_loss 0.67495
Training_loss 0.67270
Training_loss 0.67003


 65%|██████▌   | 653/1000 [01:00<00:31, 10.87it/s]

Training_loss 0.66803
Training_loss 0.66567
Training_loss 0.66220


 66%|██████▌   | 657/1000 [01:01<00:31, 11.06it/s]

Training_loss 0.66047
Training_loss 0.65804
Training_loss 0.65485


 66%|██████▌   | 659/1000 [01:01<00:31, 10.97it/s]

Training_loss 0.65151
Training_loss 0.64975
Training_loss 0.64695


 66%|██████▋   | 663/1000 [01:01<00:29, 11.37it/s]

Training_loss 0.64242
Training_loss 0.63925
Training_loss 0.63622


 66%|██████▋   | 665/1000 [01:01<00:28, 11.59it/s]

Training_loss 0.63416
Training_loss 0.63238
Training_loss 0.63030


 67%|██████▋   | 669/1000 [01:02<00:27, 12.07it/s]

Training_loss 0.62935
Training_loss 0.62545
Training_loss 0.62377


 67%|██████▋   | 671/1000 [01:02<00:26, 12.44it/s]

Training_loss 0.62214
Training_loss 0.61986
Training_loss 0.61566


 68%|██████▊   | 675/1000 [01:02<00:28, 11.25it/s]

Training_loss 0.61488
Training_loss 0.61182
Training_loss 0.60856


 68%|██████▊   | 677/1000 [01:02<00:30, 10.48it/s]

Training_loss 0.60711
Training_loss 0.60406


 68%|██████▊   | 679/1000 [01:03<00:33,  9.65it/s]

Training_loss 0.60166
Training_loss 0.60027


 68%|██████▊   | 681/1000 [01:03<00:37,  8.44it/s]

Training_loss 0.59871
Training_loss 0.59779


 68%|██████▊   | 683/1000 [01:03<00:41,  7.73it/s]

Training_loss 0.59583
Training_loss 0.59292


 68%|██████▊   | 685/1000 [01:03<00:40,  7.81it/s]

Training_loss 0.59006
Training_loss 0.58804


 69%|██████▉   | 688/1000 [01:04<00:36,  8.45it/s]

Training_loss 0.58654
Training_loss 0.58361
Training_loss 0.58133


 69%|██████▉   | 690/1000 [01:04<00:39,  7.93it/s]

Training_loss 0.57959
Training_loss 0.57751


 69%|██████▉   | 692/1000 [01:04<00:33,  9.32it/s]

Training_loss 0.57590
Training_loss 0.57331
Training_loss 0.57020


 70%|██████▉   | 695/1000 [01:05<00:31,  9.70it/s]

Training_loss 0.56860
Training_loss 0.56537


 70%|██████▉   | 698/1000 [01:05<00:28, 10.54it/s]

Training_loss 0.56230
Training_loss 0.55872
Training_loss 0.55489


 70%|███████   | 700/1000 [01:05<00:28, 10.37it/s]

Training_loss 0.55254
Training_loss 0.55143
Training_loss 0.54974


 70%|███████   | 704/1000 [01:05<00:27, 10.96it/s]

Training_loss 0.54828
Training_loss 0.54553
Training_loss 0.54320


 71%|███████   | 706/1000 [01:06<00:26, 11.10it/s]

Training_loss 0.54124
Training_loss 0.53948
Training_loss 0.53751


 71%|███████   | 710/1000 [01:06<00:26, 10.91it/s]

Training_loss 0.53515
Training_loss 0.53328
Training_loss 0.53086


 71%|███████   | 712/1000 [01:06<00:25, 11.29it/s]

Training_loss 0.52909
Training_loss 0.52710


 71%|███████▏  | 714/1000 [01:06<00:28, 10.06it/s]

Training_loss 0.52619
Training_loss 0.52409


 72%|███████▏  | 716/1000 [01:07<00:28,  9.88it/s]

Training_loss 0.52216
Training_loss 0.52035
Training_loss 0.51831


 72%|███████▏  | 718/1000 [01:07<00:27, 10.09it/s]

Training_loss 0.51595
Training_loss 0.51408


 72%|███████▏  | 722/1000 [01:07<00:27, 10.29it/s]

Training_loss 0.51281
Training_loss 0.51093
Training_loss 0.50834


 72%|███████▏  | 724/1000 [01:07<00:25, 10.93it/s]

Training_loss 0.50719
Training_loss 0.50455
Training_loss 0.50337


 73%|███████▎  | 728/1000 [01:08<00:25, 10.51it/s]

Training_loss 0.50135
Training_loss 0.49967
Training_loss 0.49873


 73%|███████▎  | 730/1000 [01:08<00:24, 11.17it/s]

Training_loss 0.49605
Training_loss 0.49504
Training_loss 0.49253


 73%|███████▎  | 734/1000 [01:08<00:22, 11.86it/s]

Training_loss 0.48967
Training_loss 0.48802
Training_loss 0.48707


 74%|███████▎  | 736/1000 [01:08<00:23, 11.39it/s]

Training_loss 0.48527
Training_loss 0.48400
Training_loss 0.48206


 74%|███████▍  | 740/1000 [01:09<00:22, 11.75it/s]

Training_loss 0.47961
Training_loss 0.47879
Training_loss 0.47701


 74%|███████▍  | 742/1000 [01:09<00:21, 11.80it/s]

Training_loss 0.47495
Training_loss 0.47380
Training_loss 0.47238


 75%|███████▍  | 746/1000 [01:09<00:21, 11.89it/s]

Training_loss 0.47087
Training_loss 0.47019
Training_loss 0.46905


 75%|███████▍  | 748/1000 [01:09<00:21, 11.88it/s]

Training_loss 0.46850
Training_loss 0.46672
Training_loss 0.46496


 75%|███████▌  | 752/1000 [01:10<00:20, 12.04it/s]

Training_loss 0.46362
Training_loss 0.46221
Training_loss 0.46088


 75%|███████▌  | 754/1000 [01:10<00:20, 11.95it/s]

Training_loss 0.45891
Training_loss 0.45628
Training_loss 0.45448


 76%|███████▌  | 758/1000 [01:10<00:19, 12.14it/s]

Training_loss 0.45265
Training_loss 0.45098
Training_loss 0.44972


 76%|███████▌  | 760/1000 [01:10<00:19, 12.13it/s]

Training_loss 0.44666
Training_loss 0.44504
Training_loss 0.44361


 76%|███████▋  | 764/1000 [01:11<00:21, 11.17it/s]

Training_loss 0.44227
Training_loss 0.44123
Training_loss 0.43931


 77%|███████▋  | 766/1000 [01:11<00:20, 11.42it/s]

Training_loss 0.43779
Training_loss 0.43600
Training_loss 0.43383


 77%|███████▋  | 770/1000 [01:11<00:20, 11.42it/s]

Training_loss 0.43234
Training_loss 0.42929
Training_loss 0.42712


 77%|███████▋  | 772/1000 [01:11<00:20, 11.18it/s]

Training_loss 0.42586
Training_loss 0.42427
Training_loss 0.42257


 78%|███████▊  | 776/1000 [01:12<00:19, 11.59it/s]

Training_loss 0.42188
Training_loss 0.42095
Training_loss 0.41907


 78%|███████▊  | 778/1000 [01:12<00:20, 10.59it/s]

Training_loss 0.41727
Training_loss 0.41615


 78%|███████▊  | 780/1000 [01:12<00:22,  9.72it/s]

Training_loss 0.41486
Training_loss 0.41313


 78%|███████▊  | 782/1000 [01:12<00:23,  9.39it/s]

Training_loss 0.41215
Training_loss 0.41051


 78%|███████▊  | 784/1000 [01:13<00:24,  8.93it/s]

Training_loss 0.40899
Training_loss 0.40725


 79%|███████▊  | 786/1000 [01:13<00:24,  8.63it/s]

Training_loss 0.40463
Training_loss 0.40352


 79%|███████▉  | 789/1000 [01:13<00:21,  9.90it/s]

Training_loss 0.40095
Training_loss 0.39941
Training_loss 0.39805


 79%|███████▉  | 791/1000 [01:13<00:20, 10.43it/s]

Training_loss 0.39630
Training_loss 0.39523
Training_loss 0.39404


 80%|███████▉  | 795/1000 [01:14<00:18, 11.13it/s]

Training_loss 0.39185
Training_loss 0.38972
Training_loss 0.38758


 80%|███████▉  | 797/1000 [01:14<00:18, 11.15it/s]

Training_loss 0.38623
Training_loss 0.38497
Training_loss 0.38380


 80%|████████  | 801/1000 [01:14<00:16, 11.83it/s]

Training_loss 0.38248
Training_loss 0.38159
Training_loss 0.38083


 80%|████████  | 803/1000 [01:14<00:16, 11.93it/s]

Training_loss 0.37949
Training_loss 0.37826
Training_loss 0.37746


 81%|████████  | 807/1000 [01:15<00:17, 11.32it/s]

Training_loss 0.37660
Training_loss 0.37595
Training_loss 0.37483


 81%|████████  | 809/1000 [01:15<00:16, 11.56it/s]

Training_loss 0.37339
Training_loss 0.37170
Training_loss 0.37085


 81%|████████▏ | 813/1000 [01:15<00:16, 11.22it/s]

Training_loss 0.37029
Training_loss 0.36900
Training_loss 0.36765


 82%|████████▏ | 815/1000 [01:16<00:16, 11.04it/s]

Training_loss 0.36671
Training_loss 0.36531
Training_loss 0.36383


 82%|████████▏ | 819/1000 [01:16<00:16, 11.06it/s]

Training_loss 0.36329
Training_loss 0.36146
Training_loss 0.36039


 82%|████████▏ | 821/1000 [01:16<00:16, 10.84it/s]

Training_loss 0.35911
Training_loss 0.35755
Training_loss 0.35675


 82%|████████▏ | 823/1000 [01:16<00:16, 10.56it/s]

Training_loss 0.35534
Training_loss 0.35363


 82%|████████▎ | 825/1000 [01:16<00:16, 10.41it/s]

Training_loss 0.35182
Training_loss 0.35073


 83%|████████▎ | 827/1000 [01:17<00:16, 10.29it/s]

Training_loss 0.34962
Training_loss 0.34843


 83%|████████▎ | 831/1000 [01:17<00:16, 10.25it/s]

Training_loss 0.34603
Training_loss 0.34438
Training_loss 0.34271


 83%|████████▎ | 833/1000 [01:17<00:15, 10.75it/s]

Training_loss 0.34154
Training_loss 0.34101
Training_loss 0.33853


 84%|████████▎ | 837/1000 [01:18<00:14, 11.43it/s]

Training_loss 0.33783
Training_loss 0.33700
Training_loss 0.33581


 84%|████████▍ | 839/1000 [01:18<00:13, 11.57it/s]

Training_loss 0.33507
Training_loss 0.33402
Training_loss 0.33309


 84%|████████▍ | 843/1000 [01:18<00:13, 11.23it/s]

Training_loss 0.33182
Training_loss 0.33008
Training_loss 0.32894


 84%|████████▍ | 845/1000 [01:18<00:13, 11.91it/s]

Training_loss 0.32750
Training_loss 0.32659
Training_loss 0.32577


 85%|████████▍ | 849/1000 [01:19<00:12, 11.70it/s]

Training_loss 0.32494
Training_loss 0.32406
Training_loss 0.32216


 85%|████████▌ | 851/1000 [01:19<00:13, 10.86it/s]

Training_loss 0.32157
Training_loss 0.32022


 85%|████████▌ | 853/1000 [01:19<00:13, 10.77it/s]

Training_loss 0.31940
Training_loss 0.31854
Training_loss 0.31751


 86%|████████▌ | 857/1000 [01:19<00:12, 11.38it/s]

Training_loss 0.31692
Training_loss 0.31560
Training_loss 0.31451


 86%|████████▌ | 859/1000 [01:20<00:12, 11.28it/s]

Training_loss 0.31321
Training_loss 0.31229
Training_loss 0.31169


 86%|████████▋ | 863/1000 [01:20<00:12, 11.35it/s]

Training_loss 0.31101
Training_loss 0.30999
Training_loss 0.30842


 86%|████████▋ | 865/1000 [01:20<00:11, 11.39it/s]

Training_loss 0.30760
Training_loss 0.30618
Training_loss 0.30509


 87%|████████▋ | 869/1000 [01:20<00:11, 11.75it/s]

Training_loss 0.30443
Training_loss 0.30357
Training_loss 0.30286


 87%|████████▋ | 871/1000 [01:21<00:10, 11.86it/s]

Training_loss 0.30153
Training_loss 0.29932
Training_loss 0.29829


 88%|████████▊ | 875/1000 [01:21<00:10, 11.73it/s]

Training_loss 0.29752
Training_loss 0.29691
Training_loss 0.29542


 88%|████████▊ | 877/1000 [01:21<00:10, 11.27it/s]

Training_loss 0.29428
Training_loss 0.29342
Training_loss 0.29267


 88%|████████▊ | 881/1000 [01:21<00:10, 11.50it/s]

Training_loss 0.29206
Training_loss 0.29114
Training_loss 0.28930


 88%|████████▊ | 883/1000 [01:22<00:10, 11.59it/s]

Training_loss 0.28818
Training_loss 0.28712
Training_loss 0.28611


 89%|████████▊ | 887/1000 [01:22<00:09, 11.57it/s]

Training_loss 0.28569
Training_loss 0.28465
Training_loss 0.28347


 89%|████████▉ | 889/1000 [01:22<00:09, 11.43it/s]

Training_loss 0.28211
Training_loss 0.28094
Training_loss 0.27967


 89%|████████▉ | 893/1000 [01:22<00:09, 11.65it/s]

Training_loss 0.27862
Training_loss 0.27796
Training_loss 0.27691


 90%|████████▉ | 895/1000 [01:23<00:09, 11.66it/s]

Training_loss 0.27587
Training_loss 0.27495
Training_loss 0.27423


 90%|████████▉ | 899/1000 [01:23<00:09, 11.06it/s]

Training_loss 0.27347
Training_loss 0.27235
Training_loss 0.27186


 90%|█████████ | 901/1000 [01:23<00:09, 10.99it/s]

Training_loss 0.27047
Training_loss 0.26963
Training_loss 0.26870


 90%|█████████ | 905/1000 [01:24<00:08, 11.70it/s]

Training_loss 0.26751
Training_loss 0.26624
Training_loss 0.26505


 91%|█████████ | 907/1000 [01:24<00:07, 11.91it/s]

Training_loss 0.26432
Training_loss 0.26359
Training_loss 0.26270


 91%|█████████ | 911/1000 [01:24<00:07, 12.01it/s]

Training_loss 0.26210
Training_loss 0.26117
Training_loss 0.26014


 91%|█████████▏| 913/1000 [01:24<00:07, 11.95it/s]

Training_loss 0.25912
Training_loss 0.25787
Training_loss 0.25696


 92%|█████████▏| 917/1000 [01:25<00:07, 11.78it/s]

Training_loss 0.25556
Training_loss 0.25387
Training_loss 0.25301


 92%|█████████▏| 919/1000 [01:25<00:07, 11.45it/s]

Training_loss 0.25236
Training_loss 0.25165
Training_loss 0.25050


 92%|█████████▏| 923/1000 [01:25<00:06, 11.80it/s]

Training_loss 0.24979
Training_loss 0.24887
Training_loss 0.24805


 92%|█████████▎| 925/1000 [01:25<00:06, 11.11it/s]

Training_loss 0.24646
Training_loss 0.24551
Training_loss 0.24495


 93%|█████████▎| 928/1000 [01:26<00:08,  8.97it/s]

Training_loss 0.24430
Training_loss 0.24349


 93%|█████████▎| 930/1000 [01:26<00:08,  8.57it/s]

Training_loss 0.24252
Training_loss 0.24164


 93%|█████████▎| 933/1000 [01:26<00:07,  9.20it/s]

Training_loss 0.24103
Training_loss 0.24046
Training_loss 0.24013


 93%|█████████▎| 934/1000 [01:26<00:07,  8.27it/s]

Training_loss 0.23897


 94%|█████████▎| 936/1000 [01:27<00:10,  6.28it/s]

Training_loss 0.23788
Training_loss 0.23652


 94%|█████████▍| 938/1000 [01:27<00:08,  7.52it/s]

Training_loss 0.23568
Training_loss 0.23497


 94%|█████████▍| 940/1000 [01:27<00:07,  7.87it/s]

Training_loss 0.23400
Training_loss 0.23311


 94%|█████████▍| 941/1000 [01:27<00:07,  8.01it/s]

Training_loss 0.23220
Training_loss 0.23071


 94%|█████████▍| 944/1000 [01:28<00:06,  8.50it/s]

Training_loss 0.22917
Training_loss 0.22873
Training_loss 0.22774


 95%|█████████▍| 948/1000 [01:28<00:05, 10.04it/s]

Training_loss 0.22688
Training_loss 0.22585
Training_loss 0.22472


 95%|█████████▌| 950/1000 [01:28<00:04, 10.64it/s]

Training_loss 0.22414
Training_loss 0.22320
Training_loss 0.22286


 95%|█████████▌| 954/1000 [01:29<00:03, 11.64it/s]

Training_loss 0.22180
Training_loss 0.22109
Training_loss 0.22071


 96%|█████████▌| 956/1000 [01:29<00:03, 11.78it/s]

Training_loss 0.22012
Training_loss 0.21969
Training_loss 0.21921


 96%|█████████▌| 960/1000 [01:29<00:03, 12.78it/s]

Training_loss 0.21835
Training_loss 0.21737
Training_loss 0.21687


 96%|█████████▌| 962/1000 [01:29<00:03, 12.51it/s]

Training_loss 0.21605
Training_loss 0.21589
Training_loss 0.21530


 97%|█████████▋| 966/1000 [01:30<00:02, 11.34it/s]

Training_loss 0.21471
Training_loss 0.21409
Training_loss 0.21343


 97%|█████████▋| 968/1000 [01:30<00:02, 11.02it/s]

Training_loss 0.21270
Training_loss 0.21186
Training_loss 0.21131


 97%|█████████▋| 972/1000 [01:30<00:02, 11.58it/s]

Training_loss 0.21056
Training_loss 0.20985
Training_loss 0.20922


 97%|█████████▋| 974/1000 [01:30<00:02, 11.75it/s]

Training_loss 0.20797
Training_loss 0.20686
Training_loss 0.20579


 98%|█████████▊| 978/1000 [01:31<00:01, 12.12it/s]

Training_loss 0.20505
Training_loss 0.20347
Training_loss 0.20289


 98%|█████████▊| 980/1000 [01:31<00:01, 12.16it/s]

Training_loss 0.20262
Training_loss 0.20146
Training_loss 0.20064


 98%|█████████▊| 984/1000 [01:31<00:01, 11.06it/s]

Training_loss 0.20009
Training_loss 0.19968
Training_loss 0.19896


 99%|█████████▊| 986/1000 [01:31<00:01, 10.90it/s]

Training_loss 0.19808
Training_loss 0.19756
Training_loss 0.19640


 99%|█████████▉| 990/1000 [01:32<00:00, 11.17it/s]

Training_loss 0.19522
Training_loss 0.19468
Training_loss 0.19400


 99%|█████████▉| 992/1000 [01:32<00:00, 10.11it/s]

Training_loss 0.19310
Training_loss 0.19243
Training_loss 0.19166


100%|█████████▉| 996/1000 [01:32<00:00, 10.66it/s]

Training_loss 0.19093
Training_loss 0.19022
Training_loss 0.18882


100%|█████████▉| 998/1000 [01:33<00:00, 11.05it/s]

Training_loss 0.18774
Training_loss 0.18706
Training_loss 0.18640


100%|██████████| 1000/1000 [01:33<00:00, 10.73it/s]

Training_loss 0.18583





In [18]:
plot.plot(test_loss)

NameError: name 'plot' is not defined