In [1]:
pip install torchtext

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
import math
import torchtext
import sys
import datasets
import numpy as np
from tqdm import tqdm
from datetime import datetime

In [4]:
dataset = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1')
print(dataset)

def has_at_least_n_words(example):
    return len(example['text'].split()) >= 10

dataset = dataset.filter(has_at_least_n_words)

print(dataset)
print(dataset['train'][8])
print(dataset['train'][8]['text'])

Found cached dataset wikitext (/home/dkang/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/dkang/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-183251567eb1813b.arrow
Loading cached processed dataset at /home/dkang/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-e110dbe66c3faf3b.arrow
Loading cached processed dataset at /home/dkang/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-abee8172eaf0a427.arrow


DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})
DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 2080
    })
    train: Dataset({
        features: ['text'],
        num_rows: 17034
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1841
    })
})
{'text': " Partly due to these events , and partly due to the major losses in manpower Gallia suffers towards the end of the war with the Empire , the Nameless are offered a formal position as a squad in the Gallian Army rather than serve as an anonymous shadow force . This is short @-@ lived , however , as following Maximilian 's defeat , Dahau and Calamity Raven move to activate an ancient Valkyrian super weapon within the Empire , kept secret by their benefactor . Without th

In [5]:
'''Tokenizing the Dataset'''
#tokenize -- means split the sentence into words
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}  
#this is the build-in map function of dataset
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], fn_kwargs={'tokenizer': tokenizer})
print(tokenized_dataset['train'][8]['tokens'])

Loading cached processed dataset at /home/dkang/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-b2d3db9cfdb7ca0a.arrow
Loading cached processed dataset at /home/dkang/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-72a4ce18ab49920b.arrow
Loading cached processed dataset at /home/dkang/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-a25052d434f149c5.arrow


['partly', 'due', 'to', 'these', 'events', ',', 'and', 'partly', 'due', 'to', 'the', 'major', 'losses', 'in', 'manpower', 'gallia', 'suffers', 'towards', 'the', 'end', 'of', 'the', 'war', 'with', 'the', 'empire', ',', 'the', 'nameless', 'are', 'offered', 'a', 'formal', 'position', 'as', 'a', 'squad', 'in', 'the', 'gallian', 'army', 'rather', 'than', 'serve', 'as', 'an', 'anonymous', 'shadow', 'force', '.', 'this', 'is', 'short', '@-@', 'lived', ',', 'however', ',', 'as', 'following', 'maximilian', "'", 's', 'defeat', ',', 'dahau', 'and', 'calamity', 'raven', 'move', 'to', 'activate', 'an', 'ancient', 'valkyrian', 'super', 'weapon', 'within', 'the', 'empire', ',', 'kept', 'secret', 'by', 'their', 'benefactor', '.', 'without', 'the', 'support', 'of', 'maximilian', 'or', 'the', 'chance', 'to', 'prove', 'themselves', 'in', 'the', 'war', 'with', 'gallia', ',', 'it', 'is', 'dahau', "'", 's', 'last', 'trump', 'card', 'in', 'creating', 'a', 'new', 'darcsen', 'nation', '.', 'as', 'an', 'armed',

In [6]:
'''Constructing the Vocabulary'''
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=20) 
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)
#This means that when a token is not found in the vocabulary, it will be mapped to the <unk> token.            
vocab.set_default_index(vocab['<unk>'])   
print(len(vocab))                         
print(vocab.get_itos()[:10])       

8148
['<unk>', '<eos>', 'the', ',', '.', 'of', 'and', 'in', 'to', 'a']


In [7]:
#Implementing the Dataloaders
def get_data(dataset, vocab):
    data = [] 
    sumLength = 0   
    count = 0                                               
    for example in dataset:
        if example['tokens']:                                     
            example['tokens'].append('<eos>')             
            tokens = [vocab[token] for token in example['tokens']] 
            data.append(tokens)
            count += 1
            sumLength += len(tokens)
            if count >= 8000:
              break 

    print(sumLength/count)

    #for x_training, we include every words except the last one which is <eos>
    X_train = np.asarray([[token_index for token_index in sent[:-1]] for sent in data])
    #for y_training, we include every words except the first one since we don't predict the first one
    Y_train = np.asarray([[token_index for token_index in sent[1:]] for sent in data])
    
    print("X_train shape: " + str(X_train.shape))
    print("y_train shape: " + str(Y_train.shape))

    # Print an training data example
    x_example, y_example = X_train[17], Y_train[17]
    print("x:\n%s\n%s" % (" ".join([vocab.get_itos()[x] for x in x_example]), x_example))
    print("\ny:\n%s\n%s" % (" ".join([vocab.get_itos()[x] for x in y_example]), y_example))

    return X_train, Y_train                                       

In [8]:
'''mean of length of sentence is 120'''
X_train, Y_train = get_data(tokenized_dataset['train'], vocab)
# X_validation, Y_validation = get_data(tokenized_dataset['validation'], vocab)
'''mean of length of sentence is 115'''
X_test, Y_test = get_data(tokenized_dataset['test'], vocab)

# X_test = np.concatenate((X_validation, X_test_pre), axis=0)
# Y_test = np.concatenate((Y_validation, Y_test_pre), axis=0)

print(X_train.shape)
print(Y_train.shape)
# print(X_validation.shape)
# print(Y_validation.shape)
# print(X_test_pre.shape)
# print(Y_test_pre.shape)
print(X_test.shape)
print(Y_test.shape)

120.776625
X_train shape: (8000,)
y_train shape: (8000,)
x:
in a <unk> of the <unk> demo , ryan <unk> of ign was left excited as to where the game would go after completing the demo , along with <unk> the improved <unk> over valkyria chronicles ii . <unk> ' s richard <unk> was highly positive about the game , citing is story as a return to form after valkyria chronicles ii and its gameplay being the best in the series . his main <unk> were its length and gameplay <unk> , along with expressing <unk> that it would not be <unk> .
[7, 9, 0, 5, 2, 0, 7417, 3, 2945, 0, 5, 2512, 10, 250, 7240, 14, 8, 92, 2, 68, 63, 637, 45, 4150, 2, 7417, 3, 162, 18, 0, 2, 2060, 0, 66, 3877, 3894, 311, 4, 0, 11, 15, 1117, 0, 10, 1279, 923, 73, 2, 68, 3, 3217, 23, 329, 14, 9, 546, 8, 282, 45, 3877, 3894, 311, 6, 43, 2536, 96, 2, 175, 7, 2, 94, 4, 27, 255, 0, 29, 43, 850, 6, 2536, 0, 3, 162, 18, 7947, 0, 16, 24, 63, 40, 34, 0, 4]

y:
a <unk> of the <unk> demo , ryan <unk> of ign was left excited as to where the

  X_train = np.asarray([[token_index for token_index in sent[:-1]] for sent in data])
  Y_train = np.asarray([[token_index for token_index in sent[1:]] for sent in data])


115.16153846153846
X_train shape: (2080,)
y_train shape: (2080,)
x:
the an <unk> rebellion began in december <unk> , and was not completely <unk> for almost eight years . it caused enormous <unk> to chinese society the census of <unk> recorded 52 @ . @ 9 million people , but ten years later , the census <unk> just 16 @ . @ 9 million , the remainder having been displaced or killed . during this time , du <unk> led a largely <unk> life <unk> by wars , associated <unk> and imperial <unk> . this period of <unk> was the making of du <unk> as a poet even <unk> <unk> has written that , what he saw around him — the lives of his family , <unk> , and <unk> – what he heard , and what he hoped for or feared from the progress of various campaigns — these became the enduring themes of his poetry . even when he learned of the death of his youngest child , he turned to the suffering of others in his poetry instead of <unk> upon his own <unk> . du <unk> wrote
[2, 30, 0, 5056, 135, 7, 276, 0, 3, 6, 10, 

In [9]:
x_example, y_example = X_test[1086], Y_test[1086]
print("x:\n%s\n%s" % (" ".join([vocab.get_itos()[x] for x in x_example]), x_example))
print("\ny:\n%s\n%s" % (" ".join([vocab.get_itos()[x] for x in y_example]), y_example))

x:
after defeating <unk> , donald receives a more complete map . in india , donald enters the palace of the <unk> , where she challenges him to defeat the tiger in her garden in exchange for a <unk> <unk> . donald <unk> and receives the <unk> <unk> , which is the key to open a temple in egypt . donald is able to solve the <unk> of the <unk> using the note <unk> had given him , and <unk> the <unk> of ra before <unk> in a mine <unk> . from there , he <unk> to the south pole , where he finds a key <unk> in ice , and uses the <unk> of ra to <unk> the ice and <unk> the key . the key <unk> the hold of a <unk> ship , which contains an ancient diary with the secret to <unk> the treasure . the ship is <unk> by <unk> , and the <unk> captain sends donald below <unk> to get <unk> of them . after defeating a <unk> <unk> warrior , donald returns to the deck , where the captain <unk> him that the diary is hidden in ice near the south pole , and gives him an ancient <unk> <unk> that <unk> to flying <u

In [10]:
class Tanh:
    def forward(self, x):
        return torch.tanh(x)

    def backwardWithTanhValue(self, tanh, top_diff):
        ## at this activation function layer, we should use * --- which is element-wise multiplication
        return (1.0 - torch.square(tanh)) * top_diff

class Softmax:
    def predict(self, mulv, b_y):
        x = mulv + b_y
        #softmax(x, dim=-1) The dim argument is required unless your input tensor is a vector
        return torch.softmax(x, dim = 0)

    def lossWithSoftmaxProb(self, probs, y):
        return -torch.log(probs[y])

    def diffWithSoftmaxProb(self, probs, y):
        probs[y] -= 1.0
        #return y^ - y
        return probs

class MultiplyGate:
    def forward(self, W, x):
        return torch.matmul(W, x)
    def backward(self, W, x, dz):
        #x is state
        #so we don't need to transpose it anymore, which is equavilent to dz * transpose(x)
        dW = torch.matmul(torch.transpose(dz.unsqueeze(0), 0, 1), x.unsqueeze(0))
        dx = torch.matmul(torch.transpose(W, 0, 1), dz)        
        return dW, dx

class AddGate:
    def forward(self, x1, x2):
        return x1 + x2

In [11]:
mulGate = MultiplyGate()
addGate = AddGate()
activation = Tanh()

class RNNLayer:
    #x is input -- word vector; s is hidden state vector, U, W, V is matrix
    def forward(self, x, prev_s, U, W, V, b_h):
        self.input = x
        self.mulu = mulGate.forward(U, x)
        self.mulw = mulGate.forward(W, prev_s)
        self.add = addGate.forward(self.mulw, self.mulu)
        self.add = addGate.forward(self.add, b_h)
        self.s = activation.forward(self.add)
        self.mulv = mulGate.forward(V, self.s)
    
    #all parameters are tensor
    def backward(self, prev_s, U, W, V, diff_s, dy_pred):
        d_by = dy_pred
        dV, dy_predV = mulGate.backward(V, self.s, dy_pred)
        # diff_s is not always a vector of 0 --- for back trancate, it is not. value gets acculumated
        ds = dy_predV + diff_s
        #optimization: replace self.add with self.s directly
        dadd = activation.backwardWithTanhValue(self.s, ds)
        d_bh = dadd
        '''no need of this add.backward step since the gradient of addition is 1'''
        # the usage of dprev_s???? -- used as ds in the back truncate (its value will be assigned to diff_s and the new dsv will be 0 since dmulv is 0)
        dW, dprev_s = mulGate.backward(W, prev_s, dadd)
        dU, dx = mulGate.backward(U, self.input, dadd)
        return (dprev_s, dU, dW, dV, d_by, d_bh)

In [12]:
class Model:
    def __init__(self, f, word_dim, hidden_dim=100, bptt_truncate=4):
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        self.f = f
        
        self.dtype = torch.float64
        self.device = torch.device("cuda:0")
        self.tensor = torch.tensor((), dtype=self.dtype, device = self.device)

        self.U = torch.from_numpy(np.random.uniform(-np.sqrt(1. / word_dim), np.sqrt(1. / word_dim), (hidden_dim, word_dim))).to(self.device)
        self.W = torch.from_numpy(np.random.uniform(-np.sqrt(1. / hidden_dim), np.sqrt(1. / hidden_dim), (hidden_dim, hidden_dim))).to(self.device)
        self.V = torch.from_numpy(np.random.uniform(-np.sqrt(1. / hidden_dim), np.sqrt(1. / hidden_dim), (word_dim, hidden_dim))).to(self.device)
        self.b_h = self.tensor.new_zeros(hidden_dim)
        #this b_y in fact represents the frequency of words in training dataset
        self.b_y = self.tensor.new_zeros(word_dim)
      
        #initial orthogonal matrices
        u, s, vh = torch.linalg.svd(self.W, full_matrices=False)
        self.W = u @ vh
        #full_matrices(bool, optional) : If True (default), u and vh have the shapes (…, M, M) and (…, N, N), respectively. 
        #Otherwise, the shapes are (…, M, K) and (…, K, N), respectively, where K = min(M, N).
    '''
    forward propagation (predicting word probabilities and calculate the loss/accuracy)
    for example x = [0, 179, 341, 416], then its y = [179, 341, 416, 1]
    x is a single sentence which is an array of indexes of words
    x[i] is the index of the word in the words vocabulary
    '''
    def forward_propagation(self, x, y):
        assert len(x) == len(y)
        # The total number of time steps
        T = len(x)
        layers = []
        prev_s = self.tensor.new_zeros(self.hidden_dim)
        output = Softmax()
        hit = 0
        loss = 0.0
        ys_pred_probs = []
        # For each time step in the sentence
        for t in range(T):
            layer = RNNLayer()
            #input still represents the input word as a probality vector in the vocabulary
            # change this input vector from np to tensor
            input = self.tensor.new_zeros(self.word_dim)
            '''no need of not using teacher forceing - or it will be tough for begining words'''
            input[x[t]] = 1
            layer.forward(input, prev_s, self.U, self.W, self.V, self.b_h)
            y_pred_prob = output.predict(layer.mulv, self.b_y)
            loss += output.lossWithSoftmaxProb(y_pred_prob, y[t])
            #output.predict -- softmax(mulv)
            if torch.argmax(y_pred_prob) == y[t]:
              hit += 1
            prev_s = layer.s

            ys_pred_probs.append(y_pred_prob)
            layers.append(layer)
        lss = loss / float(len(y))
        acc = hit / float(len(y))
        return lss, acc, layers, ys_pred_probs

    def bptt(self, y, layers, ys_pred_probs):
        output = Softmax()

        dU = self.tensor.new_zeros(self.U.shape)
        dV = self.tensor.new_zeros(self.V.shape)
        dW = self.tensor.new_zeros(self.W.shape)
        db_h = self.tensor.new_zeros(self.b_h.shape)
        db_y = self.tensor.new_zeros(self.b_y.shape)

        prev_s_t = self.tensor.new_zeros(self.hidden_dim)
        diff_s = self.tensor.new_zeros(self.hidden_dim)

        for t in range(len(layers)):
            #y^ - y -- dy_pred
            dy_pred = output.diffWithSoftmaxProb(ys_pred_probs[t], y[t])

            dprev_s, dU_t, dW_t, dV_t, d_by_t, d_bh_t = layers[t].backward(prev_s_t, self.U, self.W, self.V, diff_s, dy_pred)
            prev_s_t = layers[t].s

            # the reson of using this inner loop? -- sum up the impacts of s_t, s_t-1, s_t-2, s_t-3, S_t-4 on loss functions
            # the reason of initializing dmulv be 0 vector --- to make ds == dprev_s; since ds = ds + diff
            dy_hat = self.tensor.new_zeros(self.word_dim)
            #for i in range(t-1, max(-1, t-self.bptt_truncate-1), -1):
            for i in range(t-1, -1, -1):
                prev_s_i = self.tensor.new_zeros(self.hidden_dim) if i == 0 else layers[i-1].s
                dprev_s, dU_i, dW_i, dV_i, d_by_i, d_bh_i = layers[i].backward(prev_s_i, self.U, self.W, self.V, dprev_s, dy_hat)
                #sum up the impacts of s_t, s_t-1, s_t-2, s_t-3, S_t-4 on loss functions
                dU_t += dU_i
                dW_t += dW_i
                d_bh_t += d_bh_i

            dV += dV_t
            db_y += d_by_t
            dU += dU_t
            dW += dW_t
            db_h += d_bh_t

        return (dU, dW, dV, db_h, db_y)

    def sgd_step(self, x, y, learning_rate):
        #The dtype for an array of arrays will always be object. This is unavoidable because with NumPy only non-jagged n-dimensional arrays can be held in a contiguous memory block.
        #Notice your constituent arrays are already of int dtype:
        x = torch.tensor(x, dtype=torch.int32, device=self.device)
        y = torch.tensor(y, dtype=torch.int32, device=self.device)
        #x is a sentence (aka one example) composed of many words
        lss, acc, layers, ys_pred_probs = self.forward_propagation(x, y)
        dU, dW, dV, db_h, db_y = self.bptt(y, layers, ys_pred_probs)
        self.U -= learning_rate * dU
        self.V -= learning_rate * dV
        self.b_h -= learning_rate * db_h
        self.b_y -= learning_rate * db_y
        self.W -= learning_rate * dW
        #orthogonize W at every sgd
        u, s, vh = torch.linalg.svd(self.W, full_matrices=False)
        self.W = u @ vh
        
        return lss, acc
        #This is just for testing in this code
        # print("W's learning rate is %f"%(np.linalg.norm(learning_rate * dW)))
        # eigvals = np.linalg.eigvals(self.W)
        # largestValueIndex = np.argmax(eigvals)
        # print("W's largest eigenValue is %f"%(eigvals[largestValueIndex]))
        
    def train(self, X, Y, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):

        num_examples_seen = 0
        losses = []
        for epoch in range(nepoch):
            loss = 0
            accuracy = 0

            # For each training example...
            for i in range(len(Y)):
                '''sgd return the loss and accureacy of this sentence/example'''
                lss, acc = self.sgd_step(X[i], Y[i], learning_rate)
                loss += lss
                accuracy += acc
                num_examples_seen += 1

            if (epoch % evaluate_loss_after == 0):
                #1 place
                #loss, accuracy = self.calculate_total_loss_and_predict_accuracy(X, Y)
                loss /= len(Y)
                accuracy /= len(Y)
                losses.append(str(float(loss)))
                time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss))
                f.write("%s: Loss after num_examples_seen=%d epoch=%d: %f\n" % (time, num_examples_seen, epoch, loss))
                #accuracy = self.calculate_total_pred_accuray(X, Y)
                print("Prediction accuracy=%f" % (accuracy))
                f.write("Prediction accuracy=%f\n" % (accuracy))
                f.flush()
                # optional: Adjust the learning rate if loss increases
                # if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                #     learning_rate = learning_rate * 0.5
                #     print("Setting learning rate to %f" % learning_rate)
                sys.stdout.flush()
        return losses
    
    def justForwardPropagation(self, x, y):
        x = torch.tensor(x, dtype=torch.int32, device=self.device)
        y = torch.tensor(y, dtype=torch.int32, device=self.device)
        lss, acc, _, _ = self.forward_propagation(x, y)
        return lss, acc

    def calculate_total_pred_accuray(self, X, Y):
        # For each testing example...
        loss = 0
        accuracy = 0
        for i in range(len(Y)):
            lss, acc = self.justForwardPropagation(X[i], Y[i])
            loss += lss
            accuracy += acc
        loss /= len(Y)
        accuracy /= len(Y)
        print("Testing loss value=%f" % (loss))
        f.write("Testing loss value=%f\n" % (loss))
        print("Testing prediction accuracy=%f" % (accuracy))
        f.write("Testing prediction accuracy=%f\n" % (accuracy))
        f.flush()

In [13]:
hidden_dim = 100
word_dim = len(vocab)
print(word_dim)
random_seed = 10 # or any of your favorite number 
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

8148


In [None]:
f = open("RNN_orthg_everybp_NLP_log.txt", "w")
rnn = Model(f, word_dim, hidden_dim)
losses = rnn.train(X_train, Y_train, learning_rate=0.005, nepoch=25, evaluate_loss_after=1)

In [None]:
with open("RNN_orthg_everybp_NLP_loss_array.txt", "w") as txt_file:
  txt_file.write(", ".join(losses))

In [None]:
rnn.calculate_total_pred_accuray(X_test, Y_test)
f.close()

In [None]:
# Convert the PyTorch tensor to a NumPy array
numpy_matrix = rnn.U.cpu().numpy()
# Save the NumPy array to a CSV file
file_path = 'RNN_everybp_NLP_timeSeries_backThrough_U.csv'
np.savetxt(file_path, numpy_matrix, delimiter=',')

numpy_matrix = rnn.W.cpu().numpy()
file_path = 'RNN_everybp_NLP_timeSeries_backThrough_W.csv'
np.savetxt(file_path, numpy_matrix, delimiter=',')

numpy_matrix = rnn.V.cpu().numpy()
file_path = 'RNN_everybp_NLP_timeSeries_backThrough_V.csv'
np.savetxt(file_path, numpy_matrix, delimiter=',')

numpy_matrix = rnn.b_y.cpu().numpy()
file_path = 'RNN_everybp_NLP_timeSeries_backThrough_by.csv'
np.savetxt(file_path, numpy_matrix, delimiter=',')

numpy_matrix = rnn.b_h.cpu().numpy()
file_path = 'RNN_everybp_NLP_timeSeries_backThrough_bh.csv'
np.savetxt(file_path, numpy_matrix, delimiter=',')

In [None]:
#continue training
learning_rate=0.005
nepoch=25
f = open("RNN_orthg_everybp_NLP_log_part2.txt", "w")
X = X_train
Y = Y_train

num_examples_seen = 200000
losses = []
for epoch in range(nepoch):
    loss = 0
    accuracy = 0

    # For each training example...
    for i in range(len(Y)):
        '''sgd return the loss and accureacy of this sentence/example'''
        lss, acc = rnn.sgd_step(X[i], Y[i], learning_rate)
        loss += lss
        accuracy += acc
        num_examples_seen += 1
        
    #1 place
    #loss, accuracy = self.calculate_total_loss_and_predict_accuracy(X, Y)
    loss /= len(Y)
    accuracy /= len(Y)
    losses.append(str(float(loss)))
    time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch+25, loss))
    f.write("%s: Loss after num_examples_seen=%d epoch=%d: %f\n" % (time, num_examples_seen, epoch+25, loss))
    #accuracy = self.calculate_total_pred_accuray(X, Y)
    print("Prediction accuracy=%f" % (accuracy))
    f.write("Prediction accuracy=%f\n" % (accuracy))
    f.flush()
    # optional: Adjust the learning rate if loss increases
    # if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
    #     learning_rate = learning_rate * 0.5
    #     print("Setting learning rate to %f" % learning_rate)
    sys.stdout.flush()
    
#write the loss result into file
with open("RNN_orthg_everybp_NLP_loss_array_part2.txt", "w") as txt_file:
  txt_file.write(", ".join(losses))
    
#calculate in test dataset  
rnn.calculate_total_pred_accuray(X_test, Y_test)
f.close()

In [None]:
#continue training
learning_rate=0.005
nepoch=25
f = open("RNN_orthg_everybp_NLP_log_part3.txt", "w")
X = X_train
Y = Y_train

num_examples_seen = 400000
losses = []
for epoch in range(nepoch):
    loss = 0
    accuracy = 0

    # For each training example...
    for i in range(len(Y)):
        '''sgd return the loss and accureacy of this sentence/example'''
        lss, acc = rnn.sgd_step(X[i], Y[i], learning_rate)
        loss += lss
        accuracy += acc
        num_examples_seen += 1
        
    #1 place
    #loss, accuracy = self.calculate_total_loss_and_predict_accuracy(X, Y)
    loss /= len(Y)
    accuracy /= len(Y)
    losses.append(str(float(loss)))
    time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch+50, loss))
    f.write("%s: Loss after num_examples_seen=%d epoch=%d: %f\n" % (time, num_examples_seen, epoch+50, loss))
    #accuracy = self.calculate_total_pred_accuray(X, Y)
    print("Prediction accuracy=%f" % (accuracy))
    f.write("Prediction accuracy=%f\n" % (accuracy))
    f.flush()
    # optional: Adjust the learning rate if loss increases
    # if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
    #     learning_rate = learning_rate * 0.5
    #     print("Setting learning rate to %f" % learning_rate)
    sys.stdout.flush()
    
#write the loss result into file
with open("RNN_orthg_everybp_NLP_loss_array_part2.txt", "w") as txt_file:
  txt_file.write(", ".join(losses))
    
#calculate in test dataset  
rnn.calculate_total_pred_accuray(X_test, Y_test)
f.close()