# Imports

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import torch.utils.data as torch_data
import scipy
from sklearn.metrics import confusion_matrix

from nltk.stem import WordNetLemmatizer 

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# from Utils.pytorch_utils import torch_net
from Utils.pytorch_utils import sparse_to_matrix #, accuracy_test

from Utils.NLP_utils import accuracy, find_senteces_with_lemma, get_wordnet_pos, load_and_lemmatize_data, load_processed_data

# pickle file, data set as readable json file, since original data set is a 'pseudo json', written in text file.
DATA_SET_FILE = r"C:\Users\גורים\PycharmProjects\NLP_training\datasets\News_Category_Dataset_v2_mod.pkl"
PROCESSED_DATA_SET = r"C:\Users\גורים\PycharmProjects\NLP_training\datasets\News_Category_Dataset_v2_mod_processed.pkl"

ALL_CATEGORIES = ['POLITICS', 'WELLNESS', 'ENTERTAINMENT', 'TRAVEL', 'STYLE & BEAUTY',
       'PARENTING', 'HEALTHY LIVING', 'QUEER VOICES', 'FOOD & DRINK',
       'BUSINESS', 'COMEDY', 'SPORTS', 'BLACK VOICES', 'HOME & LIVING',
       'PARENTS', 'THE WORLDPOST', 'WEDDINGS', 'WOMEN', 'IMPACT', 'DIVORCE',
       'CRIME', 'MEDIA', 'WEIRD NEWS', 'GREEN', 'WORLDPOST', 'RELIGION',
       'STYLE', 'SCIENCE', 'WORLD NEWS', 'TASTE', 'TECH', 'MONEY', 'ARTS',
       'FIFTY', 'GOOD NEWS', 'ARTS & CULTURE', 'ENVIRONMENT', 'COLLEGE',
       'LATINO VOICES', 'CULTURE & ARTS', 'EDUCATION']

# REQUIRED_CATEGORIES = ['RELIGION','SCIENCE', 'TASTE','PARENTING' , 'COLLEGE' ,'POLITICS' ]
REQUIRED_CATEGORIES = (np.array(ALL_CATEGORIES)[::3]).tolist()
print (REQUIRED_CATEGORIES)
NUM_CATEGORIES = len(REQUIRED_CATEGORIES)
    
    
CrossEntropyLoss = "CrossEntropyLoss"
MSELoss = "MSELoss"

['POLITICS', 'TRAVEL', 'HEALTHY LIVING', 'BUSINESS', 'BLACK VOICES', 'THE WORLDPOST', 'IMPACT', 'MEDIA', 'WORLDPOST', 'SCIENCE', 'TECH', 'FIFTY', 'ENVIRONMENT', 'CULTURE & ARTS']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# set loss type

In [2]:
loss_name = CrossEntropyLoss
# loss_name = MSELoss

# helper functions

In [3]:
def sparse_to_matrix(A):
    if type(A) == scipy.sparse.csr.csr_matrix:
        return np.array(A.todense())
    return A


In [4]:
def confusion_matrix_disp(y_pred,y_true,label = False):
    cm = confusion_matrix(y_pred,y_true)
    if not label:
        label = range(NUM_CATEGORIES)
    cm_pd = pd.DataFrame(cm,index = ["{}_P".format(i)  for i in label],columns = ["{}_T".format(i)  for i in label])
    return cm_pd

# def accuracy_test_dummies(model, x, y, data_set_name = 'test',print_sample = False):
#     predicted = torch.argmax(model(torch.tensor(x, dtype=torch.float)), dim=-1).numpy()
#     truth = np.argmax(y, axis=-1)
#     # print(np.array((predicted, truth)))
#     print (f"Accuracy {data_set_name} = ",
#            round( np.array(predicted == truth).mean()* 100, 3 ),
#            "%")
#     if print_sample:
#         ps = print_sample
#         sample = pd.DataFrame((predicted[:ps],truth[:ps]),index=['predicted','truth'])
#         print (sample)
#         print (pd.value_counts(predicted))
#         print ("\tConfusion Matrix:\n",confusion_matrix_disp (predicted,truth))
        
# def accuracy_test_classes(model, x, y, data_set_name = 'test',print_sample = False):
#     predicted = torch.argmax(model(torch.tensor(x, dtype=torch.float)), dim=-1).numpy()
#     truth = y.ravel()
#     # print(np.array((predicted, truth)))
#     print (f"Accuracy {data_set_name} = ",
#            round( np.array(predicted == truth).mean()* 100, 3 ),
#            "%")
#     if print_sample:
#         ps = print_sample
#         sample = pd.DataFrame((predicted[:ps],truth[:ps]),index=['predicted','truth'])
#         print (sample)
#         print (pd.value_counts(predicted))
#         print ("\tConfusion Matrix:\n",confusion_matrix_disp (predicted,truth))

def accuracy_test(model, x, y, data_set_name = 'test',print_sample = False,top_n_guess = 1):
          
    if loss_name == MSELoss:
        truth = np.argmax(y, axis=-1)
    elif loss_name == CrossEntropyLoss:
        truth = y.ravel()
    
    model_result = model(torch.tensor(x, dtype=torch.float))
    top_guesses = torch.argsort( model_result , dim = -1, descending=True ).numpy() [:,:top_n_guess]
    # predicted   = torch.argmax ( model_result, dim=-1).numpy()
    true_predicted = (top_guesses == truth[:,None]).any(1)
    
#     print ("model_result = ", model_result[:10])
#     print ("top_guesses =", top_guesses[:10])
#     print ("predicted = ", predicted[:10])

    print (f"Accuracy (top {top_n_guess} guesses) - {data_set_name} = ",
           round( np.array(true_predicted).mean()* 100, 3 ),
           "%")
    if print_sample:
        ps = print_sample
        sample = pd.DataFrame((top_guesses[:ps],truth[:ps]),index=['predicted','truth'])
        print (sample)
        # print (pd.value_counts(predicted))
    print (f"\tConfusion Matrix {data_set_name}:\n",confusion_matrix_disp (top_guesses[:,0],truth))
    
    
        
# if loss_name == MSELoss:
#     accuracy_test = accuracy_test_dummies
# elif loss_name == CrossEntropyLoss:
#     accuracy_test = accuracy_test_classes


## load data

In [5]:
# loading data

# dataset, headlines, headlines_orig = load_and_lemmatize_data(DATA_SET_FILE)
dataset, headlines, headlines_orig = load_processed_data(PROCESSED_DATA_SET)

# reduce dataset to n categories:

In [6]:
categories = dataset['category']
pd.value_counts(categories)

# filter data for desired categories, to make problem easier
filter_categories = True
if filter_categories:
    filter_index =  categories.isin(REQUIRED_CATEGORIES)
    dataset   = dataset[filter_index]
    headlines = np.array(headlines)[filter_index]
    headlines_orig = np.array(headlines_orig)[filter_index]
    
else:
    NUM_CATEGORIES = len(set(categories))
    


In [8]:
def categories_to_index(categories):
    d = {}
    for i, cat in enumerate(REQUIRED_CATEGORIES):
        d[cat] = i
        
    r = np.array(range(len(categories)))

    for cat,i in d.items():
        # print (cat,i)
        r[categories == cat ] = i
    return r

categories = dataset['category']
if loss_name == CrossEntropyLoss:  
    Y = categories_to_index(categories)[:,np.newaxis]
else:
    Y  = np.array(pd.get_dummies(categories)) 

In [9]:
# split data and lables to train/test

headlines_train, headlines_test,\
headlines_train_orig, headlines_test_orig,\
Y_train, Y_test,\
    = sklearn.model_selection.train_test_split(
    headlines,headlines_orig, Y, test_size = 0.3)

In [32]:
# extract features (Bag Of Words) using Vectorizer

max_features=2000

vectorizer = CountVectorizer
# vectorizer = TfidfVectorizer
matrix = vectorizer(max_features=max_features, ngram_range=(1, 2), max_df=0.1 ,min_df = 5)
matrix.fit(headlines_train)
X_train = matrix.transform(headlines_train)# .todense()
X_test = matrix.transform(headlines_test)# .todense()

# --- convert to data frame for display and debug ---
# tokens = matrix.get_feature_names()
# X_train= pd.DataFrame(X_train,columns=tokens)
# X_test= pd.DataFrame(X_test,columns=tokens)

assert X_train.shape[1]==max_features, X_train.shape[1]

# build Model

In [33]:
def torch_net(X_train, Y_train, X_test, Y_test,
              hidden_layers=[10], device=torch.device('cpu'), epoch=30, batch_size=17):
    
    def set_learning_rate(optimizer,lr):
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
            
    def print_learning_rate(optimizer):
        for param_group in optimizer.param_groups:
            print('lr', param_group['lr'])
    
    # hiden_layers = [size1,size2...]

        
    dtype = torch.float
    # device = torch.device("cuda:0") # Uncomment this to run on GPU

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in = X_train.shape
    
    if loss_name == MSELoss: 
        D_out = Y_train.shape[-1] 
    elif loss_name == CrossEntropyLoss: 
        D_out = NUM_CATEGORIES

    # Create random input and output data

    [X_train, Y_train, X_test, Y_test] = \
        [sparse_to_matrix(A) for A in[X_train, Y_train, X_test, Y_test]]

    X = torch.tensor(X_train, device=device, dtype=dtype)
    if loss_name == CrossEntropyLoss:
        y_dtype = torch.int64
    else:
        y_dtype = dtype
    Y = torch.tensor(Y_train, device=device, dtype=y_dtype)
#     print (Y)

    #create neural network net with multiple hidden layers with H dimetions:
    dims = [D_in, *hidden_layers, D_out]
    layers = []
    for dim_ind in range(len(dims)-2):
        layers.append(torch.nn.Linear(dims[dim_ind], dims[dim_ind+1]))
        layers.append(torch.nn.ReLU())
    layers.append(torch.nn.Dropout(0.5))
    layers.append(torch.nn.Linear(dims[-2], D_out))   
    
    model = torch.nn.Sequential(*layers)
    if loss_name == CrossEntropyLoss:
        weights = (np.power(1/pd.value_counts(Y.tolist(),normalize=True),1.2)).to_list()
        weights = torch.tensor(weights)
        loss_fn = torch.nn.CrossEntropyLoss(reduction='mean', weight=weights)     
    elif loss_name == MSELoss:
        loss_fn = torch.nn.MSELoss(reduction='mean') 

    # Use the optim package to define an Optimizer that will update the weights of
    # the model for us. Here we will use Adam; the optim package contains many other
    # optimization algoriths. The first argument to the Adam constructor tells the
    # optimizer which Tensors it should update.
    learning_rate = 0.005
    weight_decay = 0.001
    lr_decay = 0.9

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=10000, verbose=True, threshold=0.0001,
                                  threshold_mode='rel', cooldown=2000, min_lr=1e-10, eps=1e-08)
    
    dataloader = torch_data.DataLoader(
        torch_data.TensorDataset(X, Y), batch_size=batch_size,
        shuffle=True, num_workers=4)
    print ("Start training:")
    print ("loss name is: ",loss_name, " Using loss function: ", loss_fn)
    accuracy_test(model, X_test, Y_test, data_set_name='test',print_sample = 10, top_n_guess= 1 )
    # accuracy_test(model, X_train, Y_train, data_set_name='train',print_sample = 0, top_n_guess= 3)
    
    epoch_lr = learning_rate
    min_loss = 100.0
    i_iter =-1
    for e in range(epoch):
        for t,(x_batch, y_batch) in enumerate(dataloader):
            i_iter +=1
            # Forward pass: compute predicted y by passing x to the model.
            y_pred = model(x_batch)
#             print ("x_batch.shape", x_batch.shape)
#             print ("y_batch.shape", y_batch.shape)
#             print ("y_pred.shape", y_pred.shape)
#             print ("y_batch[0,0]", y_batch[0,0])
#             print ("y_pred[0,0]", y_pred[0,0])
            y_pred_soft = torch.nn.functional.softmax(y_pred, dim = -1)
            # Compute and print loss.
#             batch_class_weights = torch.ones((y_batch.dim()))
#             loss = loss_fn(y_pred_soft, y_batch, weight = batch_class_weights)
#             loss = loss_fn(y_pred_soft, y_batch)
#             print (y_batch)
            loss = loss_fn(y_pred, y_batch)
            if loss < min_loss:
                print (f"loss < min_loss, updating min_loss to {loss}, i_iter {i_iter}")
#                 accuracy_test(model, X_train, Y_train, data_set_name= 'train', print_sample = 0, top_n_guess= 1)
#                 accuracy_test(model, X_test, Y_test, data_set_name= 'test', print_sample=0, top_n_guess= 1)
                min_loss = loss
            if not ( (t +1) % 2000 ) :
                print(f"iter-{t+1}, loss {round(loss.item(),3)}")

            # Before the backward pass, use the optimizer object to zero all of the
            # gradients for the variables it will update (which are the learnable
            # weights of the model). This is because by default, gradients are
            # accumulated in buffers( i.e, not overwritten) whenever .backward()
            # is called. Checkout docs of torch.autograd.backward for more details.
            optimizer.zero_grad()

            # Backward pass: compute gradient of the loss with respect to model
            # parameters

            #  $$$ this command destroy exit() command $$$
            loss.backward()

            # Calling the step function on an Optimizer makes an update to its
            # parameters
            optimizer.step()
            
            scheduler.step(loss)
            
            
        if not ((e+1) %10): 
            accuracy_test(model, X_train, Y_train, data_set_name= 'train', print_sample = 0, top_n_guess= 1)
            accuracy_test(model, X_test, Y_test, data_set_name= 'test', print_sample=0, top_n_guess= 1)
        
#         epoch_lr = epoch_lr * lr_decay
        print_learning_rate(optimizer)
        print(f"epoch-{e+1}, loss {round(loss.item(),3)}")
        print("------------------------------------")
        # set_learning_rate(optimizer,epoch_lr)
        
    print ("DONE, returning model")
    return model

print ("updated torch nn")

updated torch nn


# Train the model

In [34]:
# train model using pytorch
# model = torch_net(X_train, Y_train[:,0],X_test,Y_test,[100,50,50],epoch=1000)
model = torch_net(X_train, Y_train[:,0],X_test,Y_test,[100,70,50],epoch=100)


Start training:
loss name is:  CrossEntropyLoss  Using loss function:  CrossEntropyLoss()
Accuracy (top 1 guesses) - test =  3.727 %
                                                           0
predicted  [[2], [8], [8], [2], [8], [7], [7], [7], [8], ...
truth                        [0, 10, 8, 0, 0, 8, 7, 1, 0, 0]
	Confusion Matrix test:
        0_T   1_T   2_T  3_T  4_T  5_T  6_T  7_T  8_T  9_T  10_T  11_T  12_T  \
0_P      3     1     1    0    0    0    0    0    0    1     0     0     0   
1_P      0     0     0    0    0    0    0    0    0    0     0     0     0   
2_P   1325   407   283  227  217  159  152  115  103   89    83    64    53   
3_P      0     0     0    0    0    0    0    0    0    0     0     0     0   
4_P      0     0     0    0    0    0    0    0    0    0     0     0     0   
5_P      0     0     0    0    0    0    0    0    0    0     0     0     0   
6_P      0     0     0    0    0    0    0    0    0    0     0     0     0   
7_P   2873   866   587  518

lr 0.0045000000000000005
epoch-14, loss 2.218
------------------------------------
iter-2000, loss 2.375
Epoch 48465: reducing learning rate of group 0 to 4.0500e-03.
lr 0.004050000000000001
epoch-15, loss 1.192
------------------------------------
iter-2000, loss 1.321
lr 0.004050000000000001
epoch-16, loss 1.421
------------------------------------
iter-2000, loss 1.371
lr 0.004050000000000001
epoch-17, loss 3.455
------------------------------------
iter-2000, loss 1.298
lr 0.004050000000000001
epoch-18, loss 0.368
------------------------------------
Epoch 60466: reducing learning rate of group 0 to 3.6450e-03.
iter-2000, loss 1.557
lr 0.0036450000000000007
epoch-19, loss 0.286
------------------------------------
iter-2000, loss 1.328
Accuracy (top 1 guesses) - train =  46.762 %
	Confusion Matrix train:
        0_T   1_T   2_T   3_T   4_T   5_T   6_T   7_T  8_T  9_T  10_T  11_T  \
0_P   9613     5    34   168   100    58    21   149   16    5    13     1   
1_P    294  4418   150 

Epoch 105932: reducing learning rate of group 0 to 2.6572e-03.
iter-2000, loss 1.71
lr 0.002657205000000001
epoch-33, loss 0.16
------------------------------------
iter-2000, loss 1.699
lr 0.002657205000000001
epoch-34, loss 1.967
------------------------------------
iter-2000, loss 1.568
lr 0.002657205000000001
epoch-35, loss 1.553
------------------------------------
iter-2000, loss 1.282
Epoch 117933: reducing learning rate of group 0 to 2.3915e-03.
lr 0.002391484500000001
epoch-36, loss 1.224
------------------------------------
iter-2000, loss 1.468
lr 0.002391484500000001
epoch-37, loss 0.546
------------------------------------
iter-2000, loss 1.336
lr 0.002391484500000001
epoch-38, loss 1.728
------------------------------------
iter-2000, loss 0.974
lr 0.002391484500000001
epoch-39, loss 0.274
------------------------------------
Epoch 129934: reducing learning rate of group 0 to 2.1523e-03.
iter-2000, loss 1.042
Accuracy (top 1 guesses) - train =  45.961 %
	Confusion Matrix 

lr 0.001569052980450001
epoch-51, loss 2.045
------------------------------------
iter-2000, loss 0.929
lr 0.001569052980450001
epoch-52, loss 0.658
------------------------------------
iter-2000, loss 1.294
lr 0.001569052980450001
epoch-53, loss 3.155
------------------------------------
iter-2000, loss 0.705
Epoch 177938: reducing learning rate of group 0 to 1.4121e-03.
lr 0.0014121476824050009
epoch-54, loss 1.722
------------------------------------
iter-2000, loss 1.463
lr 0.0014121476824050009
epoch-55, loss 1.093
------------------------------------
iter-2000, loss 1.603
lr 0.0014121476824050009
epoch-56, loss 1.296
------------------------------------
iter-2000, loss 1.805
lr 0.0014121476824050009
epoch-57, loss 1.139
------------------------------------
Epoch 189939: reducing learning rate of group 0 to 1.2709e-03.
iter-2000, loss 2.254
lr 0.0012709329141645008
epoch-58, loss 1.713
------------------------------------
iter-2000, loss 0.69
lr 0.0012709329141645008
epoch-59, los

iter-2000, loss 1.692
lr 0.0009265100944259213
epoch-71, loss 2.073
------------------------------------
iter-2000, loss 1.732
Epoch 237943: reducing learning rate of group 0 to 8.3386e-04.
lr 0.0008338590849833291
epoch-72, loss 3.395
------------------------------------
iter-2000, loss 0.866
lr 0.0008338590849833291
epoch-73, loss 3.385
------------------------------------
iter-2000, loss 1.187
lr 0.0008338590849833291
epoch-74, loss 1.269
------------------------------------
iter-2000, loss 1.179
lr 0.0008338590849833291
epoch-75, loss 0.817
------------------------------------
Epoch 249944: reducing learning rate of group 0 to 7.5047e-04.
iter-2000, loss 1.507
lr 0.0007504731764849962
epoch-76, loss 1.985
------------------------------------
iter-2000, loss 0.44
lr 0.0007504731764849962
epoch-77, loss 1.272
------------------------------------
iter-2000, loss 1.381
lr 0.0007504731764849962
epoch-78, loss 0.924
------------------------------------
iter-2000, loss 1.678
lr 0.00075047

Epoch 297948: reducing learning rate of group 0 to 4.9239e-04.
iter-2000, loss 1.269
lr 0.0004923854510918061
epoch-91, loss 0.271
------------------------------------
iter-2000, loss 1.492
lr 0.0004923854510918061
epoch-92, loss 0.794
------------------------------------
iter-2000, loss 0.746
lr 0.0004923854510918061
epoch-93, loss 1.248
------------------------------------
iter-2000, loss 0.894
Epoch 309949: reducing learning rate of group 0 to 4.4315e-04.
lr 0.00044314690598262546
epoch-94, loss 0.711
------------------------------------
iter-2000, loss 0.841
lr 0.00044314690598262546
epoch-95, loss 1.737
------------------------------------
iter-2000, loss 0.855
lr 0.00044314690598262546
epoch-96, loss 0.618
------------------------------------
iter-2000, loss 0.741
loss < min_loss, updating min_loss to 0.0610586442053318, i_iter 320875
lr 0.00044314690598262546
epoch-97, loss 0.061
------------------------------------
iter-2000, loss 1.531
lr 0.00044314690598262546
epoch-98, loss 

iter-2000, loss 0.314
lr 0.00032305409446133396
epoch-111, loss 0.236
------------------------------------
Epoch 368110: reducing learning rate of group 0 to 2.9075e-04.
iter-2000, loss 0.422
lr 0.00029074868501520056
epoch-112, loss 0.5
------------------------------------
iter-2000, loss 1.338
lr 0.00029074868501520056
epoch-113, loss 0.704
------------------------------------
iter-2000, loss 0.715
lr 0.00029074868501520056
epoch-114, loss 0.92
------------------------------------
iter-2000, loss 0.878
Epoch 380111: reducing learning rate of group 0 to 2.6167e-04.
lr 0.00026167381651368053
epoch-115, loss 0.166
------------------------------------
iter-2000, loss 0.893
lr 0.00026167381651368053
epoch-116, loss 1.099
------------------------------------
iter-2000, loss 0.812
lr 0.00026167381651368053
epoch-117, loss 0.167
------------------------------------
iter-2000, loss 0.704
lr 0.00026167381651368053
epoch-118, loss 0.26
------------------------------------
Epoch 392112: reducing

iter-2000, loss 1.114
lr 0.00017168419101462582
epoch-131, loss 0.179
------------------------------------
iter-2000, loss 0.444
lr 0.00017168419101462582
epoch-132, loss 0.183
------------------------------------
iter-2000, loss 0.224
lr 0.00017168419101462582
epoch-133, loss 1.274
------------------------------------
Epoch 440116: reducing learning rate of group 0 to 1.5452e-04.
iter-2000, loss 0.306
lr 0.00015451577191316325
epoch-134, loss 1.362
------------------------------------
iter-2000, loss 0.631
lr 0.00015451577191316325
epoch-135, loss 1.998
------------------------------------
iter-2000, loss 0.501
lr 0.00015451577191316325
epoch-136, loss 1.656
------------------------------------
iter-2000, loss 0.342
Epoch 452117: reducing learning rate of group 0 to 1.3906e-04.
lr 0.00013906419472184693
epoch-137, loss 0.912
------------------------------------
iter-2000, loss 0.456
lr 0.00013906419472184693
epoch-138, loss 1.323
------------------------------------
iter-2000, loss 0.

iter-2000, loss 1.058
lr 0.00010137779795222643
epoch-151, loss 0.423
------------------------------------
Epoch 500121: reducing learning rate of group 0 to 9.1240e-05.
iter-2000, loss 0.394
lr 9.124001815700379e-05
epoch-152, loss 0.149
------------------------------------
iter-2000, loss 0.477
lr 9.124001815700379e-05
epoch-153, loss 0.821
------------------------------------
iter-2000, loss 0.69
lr 9.124001815700379e-05
epoch-154, loss 0.439
------------------------------------
iter-2000, loss 0.49
Epoch 512122: reducing learning rate of group 0 to 8.2116e-05.
lr 8.211601634130342e-05
epoch-155, loss 0.896
------------------------------------
iter-2000, loss 0.618
lr 8.211601634130342e-05
epoch-156, loss 0.045
------------------------------------
iter-2000, loss 0.314
lr 8.211601634130342e-05
epoch-157, loss 0.282
------------------------------------
iter-2000, loss 0.214
lr 8.211601634130342e-05
epoch-158, loss 0.303
------------------------------------
Epoch 524123: reducing lear

iter-2000, loss 0.49
lr 5.9862575912810195e-05
epoch-171, loss 1.498
------------------------------------
iter-2000, loss 0.43
Epoch 567821: reducing learning rate of group 0 to 5.3876e-05.
lr 5.3876318321529174e-05
epoch-172, loss 0.232
------------------------------------
iter-2000, loss 0.245
lr 5.3876318321529174e-05
epoch-173, loss 0.017
------------------------------------
iter-2000, loss 0.998
lr 5.3876318321529174e-05
epoch-174, loss 0.158
------------------------------------
iter-2000, loss 0.75
lr 5.3876318321529174e-05
epoch-175, loss 1.214
------------------------------------
Epoch 579822: reducing learning rate of group 0 to 4.8489e-05.
iter-2000, loss 0.601
lr 4.848868648937626e-05
epoch-176, loss 0.142
------------------------------------
iter-2000, loss 0.503
lr 4.848868648937626e-05
epoch-177, loss 0.051
------------------------------------
iter-2000, loss 0.946
lr 4.848868648937626e-05
epoch-178, loss 0.009
------------------------------------
iter-2000, loss 0.263
Ep

iter-2000, loss 0.186
lr 3.1813427205679776e-05
epoch-191, loss 0.762
------------------------------------
iter-2000, loss 0.462
lr 3.1813427205679776e-05
epoch-192, loss 0.006
------------------------------------
iter-2000, loss 0.432
lr 3.1813427205679776e-05
epoch-193, loss 0.14
------------------------------------
Epoch 639827: reducing learning rate of group 0 to 2.8632e-05.
iter-2000, loss 0.356
lr 2.8632084485111798e-05
epoch-194, loss 0.105
------------------------------------
iter-2000, loss 1.306
lr 2.8632084485111798e-05
epoch-195, loss 2.229
------------------------------------
iter-2000, loss 0.434
lr 2.8632084485111798e-05
epoch-196, loss 0.06
------------------------------------
iter-2000, loss 0.21
lr 2.8632084485111798e-05
epoch-197, loss 1.513
------------------------------------
Epoch 651828: reducing learning rate of group 0 to 2.5769e-05.
iter-2000, loss 0.288
lr 2.576887603660062e-05
epoch-198, loss 2.139
------------------------------------
iter-2000, loss 0.715


iter-2000, loss 0.146
lr 1.878551063068185e-05
epoch-211, loss 0.094
------------------------------------
Epoch 699832: reducing learning rate of group 0 to 1.6907e-05.
iter-2000, loss 0.9
lr 1.6906959567613665e-05
epoch-212, loss 0.099
------------------------------------
iter-2000, loss 0.541
lr 1.6906959567613665e-05
epoch-213, loss 0.092
------------------------------------
iter-2000, loss 1.161
lr 1.6906959567613665e-05
epoch-214, loss 0.043
------------------------------------
iter-2000, loss 0.119
lr 1.6906959567613665e-05
epoch-215, loss 0.983
------------------------------------
Epoch 711833: reducing learning rate of group 0 to 1.5216e-05.
iter-2000, loss 0.322
lr 1.5216263610852298e-05
epoch-216, loss 0.468
------------------------------------
iter-2000, loss 0.357
lr 1.5216263610852298e-05
epoch-217, loss 0.284
------------------------------------
iter-2000, loss 0.51
lr 1.5216263610852298e-05
epoch-218, loss 2.076
------------------------------------
iter-2000, loss 0.258


iter-2000, loss 0.166
lr 9.983390555080193e-06
epoch-231, loss 0.12
------------------------------------
iter-2000, loss 0.364
lr 9.983390555080193e-06
epoch-232, loss 0.192
------------------------------------
iter-2000, loss 0.284
lr 9.983390555080193e-06
epoch-233, loss 1.385
------------------------------------
Epoch 771838: reducing learning rate of group 0 to 8.9851e-06.
iter-2000, loss 0.234
lr 8.985051499572174e-06
epoch-234, loss 0.211
------------------------------------
iter-2000, loss 0.135
lr 8.985051499572174e-06
epoch-235, loss 0.022
------------------------------------
iter-2000, loss 0.457
lr 8.985051499572174e-06
epoch-236, loss 0.34
------------------------------------
iter-2000, loss 0.36
Epoch 783839: reducing learning rate of group 0 to 8.0865e-06.
lr 8.086546349614957e-06
epoch-237, loss 0.242
------------------------------------
iter-2000, loss 0.135
lr 8.086546349614957e-06
epoch-238, loss 0.257
------------------------------------
iter-2000, loss 0.753
lr 8.08

iter-2000, loss 0.314
lr 5.895092288869304e-06
epoch-251, loss 0.339
------------------------------------
Epoch 831843: reducing learning rate of group 0 to 5.3056e-06.
iter-2000, loss 1.304
lr 5.305583059982374e-06
epoch-252, loss 3.543
------------------------------------
iter-2000, loss 0.384
lr 5.305583059982374e-06
epoch-253, loss 0.245
------------------------------------
iter-2000, loss 0.476
lr 5.305583059982374e-06
epoch-254, loss 0.719
------------------------------------
iter-2000, loss 0.201
lr 5.305583059982374e-06
epoch-255, loss 2.357
------------------------------------
Epoch 843844: reducing learning rate of group 0 to 4.7750e-06.
iter-2000, loss 0.473
lr 4.775024753984137e-06
epoch-256, loss 0.105
------------------------------------
iter-2000, loss 0.193
lr 4.775024753984137e-06
epoch-257, loss 1.043
------------------------------------
iter-2000, loss 0.112
lr 4.775024753984137e-06
epoch-258, loss 0.012
------------------------------------
iter-2000, loss 0.121
Epoc

iter-2000, loss 0.631
lr 3.1328937410889924e-06
epoch-271, loss 0.624
------------------------------------
iter-2000, loss 0.413
lr 3.1328937410889924e-06
epoch-272, loss 3.074
------------------------------------
iter-2000, loss 2.566
lr 3.1328937410889924e-06
epoch-273, loss 0.219
------------------------------------
Epoch 903849: reducing learning rate of group 0 to 2.8196e-06.
iter-2000, loss 0.182
lr 2.8196043669800934e-06
epoch-274, loss 0.203
------------------------------------
iter-2000, loss 0.667
lr 2.8196043669800934e-06
epoch-275, loss 0.295
------------------------------------
iter-2000, loss 0.39
lr 2.8196043669800934e-06
epoch-276, loss 0.049
------------------------------------
iter-2000, loss 0.26
Epoch 915850: reducing learning rate of group 0 to 2.5376e-06.
lr 2.537643930282084e-06
epoch-277, loss 0.161
------------------------------------
iter-2000, loss 0.372
lr 2.537643930282084e-06
epoch-278, loss 0.112
------------------------------------
iter-2000, loss 0.575


iter-2000, loss 0.332
lr 1.8499424251756394e-06
epoch-291, loss 0.011
------------------------------------
Epoch 963854: reducing learning rate of group 0 to 1.6649e-06.
iter-2000, loss 0.125
lr 1.6649481826580756e-06
epoch-292, loss 0.088
------------------------------------
iter-2000, loss 0.778
lr 1.6649481826580756e-06
epoch-293, loss 0.543
------------------------------------
iter-2000, loss 0.564
lr 1.6649481826580756e-06
epoch-294, loss 0.228
------------------------------------
iter-2000, loss 0.151
Epoch 975855: reducing learning rate of group 0 to 1.4985e-06.
lr 1.4984533643922681e-06
epoch-295, loss 0.393
------------------------------------
iter-2000, loss 0.744
lr 1.4984533643922681e-06
epoch-296, loss 0.014
------------------------------------
iter-2000, loss 0.222
lr 1.4984533643922681e-06
epoch-297, loss 0.259
------------------------------------
iter-2000, loss 0.094
lr 1.4984533643922681e-06
epoch-298, loss 1.153
------------------------------------
iter-2000, loss 0.

iter-2000, loss 0.236
lr 9.831352523777673e-07
epoch-311, loss 0.6
------------------------------------
iter-2000, loss 0.482
lr 9.831352523777673e-07
epoch-312, loss 0.334
------------------------------------
iter-2000, loss 0.184
lr 9.831352523777673e-07
epoch-313, loss 1.231
------------------------------------
Epoch 1035860: reducing learning rate of group 0 to 8.8482e-07.
iter-2000, loss 0.192
lr 8.848217271399906e-07
epoch-314, loss 0.022
------------------------------------
iter-2000, loss 0.772
lr 8.848217271399906e-07
epoch-315, loss 0.002
------------------------------------
iter-2000, loss 0.178
lr 8.848217271399906e-07
epoch-316, loss 0.008
------------------------------------
iter-2000, loss 0.347
Epoch 1047861: reducing learning rate of group 0 to 7.9634e-07.
lr 7.963395544259916e-07
epoch-317, loss 0.693
------------------------------------
iter-2000, loss 0.345
lr 7.963395544259916e-07
epoch-318, loss 0.339
------------------------------------
iter-2000, loss 0.707
lr 7

iter-2000, loss 0.703
lr 5.80531535176548e-07
epoch-331, loss 0.016
------------------------------------
Epoch 1095865: reducing learning rate of group 0 to 5.2248e-07.
iter-2000, loss 0.377
lr 5.224783816588932e-07
epoch-332, loss 0.422
------------------------------------
iter-2000, loss 0.336
lr 5.224783816588932e-07
epoch-333, loss 0.444
------------------------------------
iter-2000, loss 0.132
lr 5.224783816588932e-07
epoch-334, loss 0.093
------------------------------------
iter-2000, loss 0.404
Epoch 1107866: reducing learning rate of group 0 to 4.7023e-07.
lr 4.702305434930039e-07
epoch-335, loss 0.299
------------------------------------
iter-2000, loss 0.501
lr 4.702305434930039e-07
epoch-336, loss 0.259
------------------------------------
iter-2000, loss 0.519
lr 4.702305434930039e-07
epoch-337, loss 0.098
------------------------------------
iter-2000, loss 0.348
lr 4.702305434930039e-07
epoch-338, loss 0.1
------------------------------------
Epoch 1119867: reducing lea

iter-2000, loss 0.157
lr 3.0851825958575986e-07
epoch-351, loss 0.803
------------------------------------
iter-2000, loss 0.232
lr 3.0851825958575986e-07
epoch-352, loss 2.271
------------------------------------
iter-2000, loss 0.746
lr 3.0851825958575986e-07
epoch-353, loss 4.375
------------------------------------
Epoch 1167871: reducing learning rate of group 0 to 2.7767e-07.
iter-2000, loss 0.152
lr 2.776664336271839e-07
epoch-354, loss 0.399
------------------------------------
iter-2000, loss 0.331
lr 2.776664336271839e-07
epoch-355, loss 0.664
------------------------------------
iter-2000, loss 0.154
lr 2.776664336271839e-07
epoch-356, loss 1.823
------------------------------------
iter-2000, loss 0.374
Epoch 1179872: reducing learning rate of group 0 to 2.4990e-07.
lr 2.498997902644655e-07
epoch-357, loss 0.477
------------------------------------
iter-2000, loss 0.328
lr 2.498997902644655e-07
epoch-358, loss 0.082
------------------------------------
iter-2000, loss 0.496

KeyboardInterrupt: 