### Import packages and get access to the training text file

In [1]:
### For colab usage
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Change the working directory to the homework folder on Drive so that the
# relative file names used later in the notebook resolve there.
%cd drive/My Drive/Colab Notebooks/6334/HW4

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/6334/HW4


In [2]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[?25l[K     |█▍                              | 10 kB 19.6 MB/s eta 0:00:01[K     |██▉                             | 20 kB 23.3 MB/s eta 0:00:01[K     |████▏                           | 30 kB 12.2 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 9.6 MB/s eta 0:00:01[K     |███████                         | 51 kB 5.4 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 5.9 MB/s eta 0:00:01[K     |█████████▊                      | 71 kB 5.7 MB/s eta 0:00:01[K     |███████████▏                    | 81 kB 6.4 MB/s eta 0:00:01[K     |████████████▌                   | 92 kB 4.8 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 5.2 MB/s eta 0:00:01[K     |███████████████▎                | 112 kB 5.2 MB/s eta 0:00:01[K     |████████████████▊               | 122 kB 5.2 MB/s eta 0:00:01[K     |██████████████████              | 133 kB 5.2 MB/s eta 0:00:01

In [None]:
import unidecode
import string
import random
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
import argparse
import os

from tqdm import tqdm

In [None]:
### helpers.py

def read_file(filename, encoding=None):
    """Read a text file and pass its contents through ``unidecode``.

    Args:
        filename: Path of the text file to load.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A ``(text, length)`` tuple: the result of ``unidecode.unidecode`` on
        the file contents, and the number of characters in that result.
    """
    # The context manager closes the handle even if reading or decoding fails.
    with open(filename, encoding=encoding) as handle:
        text = unidecode.unidecode(handle.read())
    return text, len(text)

# Name of the training corpus; save() also derives the checkpoint name from it.
filename = 'medline.0.txt'
# Load the whole corpus into memory once. `file` and `file_len` are read as
# module-level globals by random_training_set().
file, file_len = read_file(filename)

### Functions to create the model

In [None]:
### train.py
### Fix 1: corrected how the hidden state is moved to the GPU in train().
### Fix 2: the last line of train() previously used loss.data[0], which raises
### an error, so it no longer indexes loss.data.

def random_training_set(chunk_len, batch_size):
    """Sample a random batch of input/target character sequences from the corpus.

    Each row is built from a window of ``chunk_len + 1`` consecutive characters
    of the module-level ``file``. The input is the window without its last
    character and the target is the window without its first, so
    ``target[b, t]`` is the character that follows ``inp[b, t]``.

    Args:
        chunk_len: Number of characters per training sequence.
        batch_size: Number of sequences in the batch.

    Returns:
        ``(inp, target)``: two long tensors of shape ``(batch_size, chunk_len)``,
        moved to the GPU when the module-level ``cuda`` flag is true.

    Raises:
        ValueError: If the corpus has fewer than ``chunk_len + 1`` characters.
    """
    # A window needs chunk_len + 1 characters, so the last valid start index is
    # file_len - chunk_len - 1. random.randint includes its upper bound; using
    # file_len - chunk_len could produce a window that is one character short,
    # which cannot be assigned into a row of length chunk_len.
    last_start = file_len - chunk_len - 1
    if last_start < 0:
        raise ValueError(
            'corpus has %d characters but chunk_len=%d needs at least %d'
            % (file_len, chunk_len, chunk_len + 1))

    inp = torch.zeros(batch_size, chunk_len, dtype=torch.long)
    target = torch.zeros(batch_size, chunk_len, dtype=torch.long)
    for bi in range(batch_size):
        start_index = random.randint(0, last_start)
        chunk = file[start_index:start_index + chunk_len + 1]
        inp[bi] = char_tensor(chunk[:-1])
        target[bi] = char_tensor(chunk[1:])

    if cuda:
        inp = inp.cuda()
        target = target.cuda()
    return inp, target

def train(inp, target):
    """Run one optimisation step on a batch and return its mean per-step loss.

    Uses the module-level ``decoder``, ``decoder_optimizer`` and ``criterion``.

    Args:
        inp: Long tensor indexed as ``inp[:, t]``, i.e. ``(batch, steps)``,
            holding the input character indices.
        target: Long tensor of the same layout holding the indices of the
            characters that follow each input character.

    Returns:
        A 0-dim tensor: the loss summed over all time steps divided by the
        number of steps. It is detached, so keeping or accumulating it does not
        keep the autograd history of the step alive.
    """
    # Take the sizes from the batch itself rather than from notebook globals so
    # the function cannot disagree with the data it is given.
    n_rows, n_steps = inp.size(0), inp.size(1)

    # init_hidden returns a single tensor for a GRU and an (h, c) tuple for an
    # LSTM; put either form on the same device as the batch.
    hidden = decoder.init_hidden(n_rows)
    if isinstance(hidden, tuple):
        hidden = tuple(state.to(inp.device) for state in hidden)
    else:
        hidden = hidden.to(inp.device)

    decoder.zero_grad()
    loss = 0

    # Feed one character position at a time, carrying the hidden state forward.
    for c in range(n_steps):
        output, hidden = decoder(inp[:, c], hidden)
        loss += criterion(output.view(n_rows, -1), target[:, c])

    loss.backward()
    decoder_optimizer.step()

    return loss.detach() / n_steps

def save():
    """Write the module-level ``decoder`` to ``<corpus base name>.pt``.

    The checkpoint name is the base name of the module-level ``filename`` with
    its extension replaced by ``.pt``; the whole model object is serialised
    with ``torch.save`` and the chosen name is printed.
    """
    stem, _extension = os.path.splitext(os.path.basename(filename))
    checkpoint_name = stem + '.pt'
    torch.save(decoder, checkpoint_name)
    print('Saved as %s' % checkpoint_name)

In [None]:
### model.py

class CharRNN(nn.Module):
    """Character-level recurrent model: embedding -> GRU/LSTM -> linear layer.

    Args:
        input_size: Number of rows of the embedding table (distinct inputs).
        hidden_size: Width of the embedding and of the recurrent hidden state.
        output_size: Number of values produced per input by the final layer.
        model: Recurrent cell type, ``"gru"`` or ``"lstm"`` (case-insensitive).
        n_layers: Number of stacked recurrent layers.

    Raises:
        ValueError: If ``model`` is neither ``"gru"`` nor ``"lstm"``.
    """

    def __init__(self, input_size, hidden_size, output_size, model="gru", n_layers=1):
        super(CharRNN, self).__init__()
        self.model = model.lower()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        if self.model == "gru":
            self.rnn = nn.GRU(hidden_size, hidden_size, n_layers)
        elif self.model == "lstm":
            self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers)
        else:
            # Fail here rather than later in forward() with a missing `rnn`.
            raise ValueError(
                "model must be 'gru' or 'lstm', got %r" % (model,))
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Process a single time step for a whole batch.

        Args:
            input: 1-D long tensor with one character index per batch row.
            hidden: Recurrent state as returned by ``init_hidden`` or by a
                previous call.

        Returns:
            ``(output, hidden)``: ``output`` has shape
            ``(batch, output_size)``; ``hidden`` is the updated state.
        """
        batch_size = input.size(0)
        encoded = self.encoder(input)
        # The recurrent layer expects (seq_len, batch, features); seq_len is 1.
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        return output, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero initial state for ``batch_size`` sequences.

        The state is created with the dtype and device of the embedding
        weights, so it already matches a module that was moved with
        ``.cuda()`` or ``.to(...)``.

        Returns:
            A ``(n_layers, batch_size, hidden_size)`` tensor for a GRU, or a
            tuple of two such tensors for an LSTM.
        """
        weight = self.encoder.weight

        def zeros():
            return weight.new_zeros(self.n_layers, batch_size, self.hidden_size)

        if self.model == "lstm":
            return (zeros(), zeros())
        return zeros()

In [None]:
### generate.py
### Fixed the same CUDA hidden-state handling mistake here as in train()

def generate(decoder, prime_str='A', predict_len=100, temperature=0.8, cuda=False):
    """Sample text from ``decoder``, starting from a priming string.

    Args:
        decoder: Model exposing ``init_hidden(batch_size)`` and a call
            ``decoder(inp, hidden) -> (output, hidden)``.
        prime_str: Non-empty string fed to the model before sampling; it is
            also the beginning of the returned text.
        predict_len: Number of characters to sample after ``prime_str``.
        temperature: Divisor applied to the model output before
            exponentiation; lower values concentrate the sampling distribution.
        cuda: When true, the hidden state and inputs are moved to the GPU.

    Returns:
        ``prime_str`` followed by ``predict_len`` sampled characters.

    Raises:
        ValueError: If ``prime_str`` is empty.
    """
    if not prime_str:
        raise ValueError('prime_str must contain at least one character')

    hidden = decoder.init_hidden(1)
    prime_input = char_tensor(prime_str).unsqueeze(0)

    if cuda:
        # The state is a single tensor for a GRU and an (h, c) tuple for an
        # LSTM; handle both instead of assuming a tuple.
        if isinstance(hidden, tuple):
            hidden = tuple(state.cuda() for state in hidden)
        else:
            hidden = hidden.cuda()
        prime_input = prime_input.cuda()
    predicted = prime_str

    # Sampling needs no gradients; disabling autograd stops the history of the
    # hidden state from growing with every generated character.
    with torch.no_grad():
        # Use priming string to "build up" hidden state
        for p in range(len(prime_str) - 1):
            _, hidden = decoder(prime_input[:, p], hidden)

        inp = prime_input[:, -1]

        for _ in range(predict_len):
            output, hidden = decoder(inp, hidden)

            # Sample from the network as a multinomial distribution
            output_dist = output.view(-1).div(temperature).exp()
            top_i = int(torch.multinomial(output_dist, 1)[0])

            # Add predicted character to string and use as next input
            predicted_char = all_characters[top_i]
            predicted += predicted_char
            inp = char_tensor(predicted_char).unsqueeze(0)
            if cuda:
                inp = inp.cuda()

    return predicted

In [None]:
### helpers.py

def time_since(since):
    """Format the time elapsed since ``since`` as ``'<minutes>m <seconds>s'``.

    Args:
        since: A timestamp previously obtained from ``time.time()``.

    Returns:
        The elapsed wall-clock time with whole minutes and the remaining
        seconds truncated to an integer.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def char_tensor(string):
    """Encode a string as a 1-D long tensor of alphabet indices.

    Each character is replaced by the position of its first occurrence in the
    module-level ``all_characters``. Characters that are not in the alphabet
    are encoded as 0, which is also the index of the first alphabet character.

    Args:
        string: Text to encode.

    Returns:
        A long tensor of length ``len(string)``.
    """
    # Build the lookup once per call instead of scanning the alphabet for every
    # character; setdefault keeps the first position, as str.index would.
    lookup = {}
    for position, symbol in enumerate(all_characters):
        lookup.setdefault(symbol, position)

    # Only "character not in alphabet" falls back to 0. Any other problem, such
    # as all_characters not being defined yet, is no longer silently swallowed.
    indices = [lookup.get(symbol, 0) for symbol in string]
    return torch.tensor(indices, dtype=torch.long)

### Training and evaluation

In [None]:
# Alphabet: every character of string.printable. A character's position in
# this string is the index used for it by char_tensor() and generate().
all_characters = string.printable
n_characters = len(all_characters)

# Model architecture.
n_layers = 2
hidden_size = 100
model = "lstm"

# Optimisation. Note that one "epoch" in the training loop is a single random
# batch, not a full pass over the corpus.
n_epochs = 2000
batch_size = 100
learning_rate = 0.01
chunk_len = 500

# Print the loss and a generated sample every `print_every` iterations.
print_every = 100

# Use the GPU when the runtime has one; a hard-coded True fails on CPU-only
# runtimes as soon as .cuda() is called.
cuda = torch.cuda.is_available()

In [None]:
# Build the network; the character set is both the input and output vocabulary.
decoder = CharRNN(n_characters, hidden_size, n_characters, model=model, n_layers=n_layers)
if cuda:
    decoder.cuda()

# Loss function and optimiser used by train().
criterion = nn.CrossEntropyLoss()
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

# Bookkeeping for the training loop.
loss_avg = 0
all_losses = []
start = time.time()

In [None]:
# Each iteration trains on one freshly sampled random batch.
for epoch in tqdm(range(1, n_epochs + 1)):
    loss = train(*random_training_set(chunk_len, batch_size))
    # Accumulate a plain Python float: adding the tensor itself would turn
    # loss_avg into a tensor and keep autograd history from every iteration.
    loss_avg += float(loss)

    # Periodically report elapsed time, progress, the current loss and a
    # short generated sample.
    if epoch % print_every == 0:
        print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / n_epochs * 100, loss))
        print(generate(decoder, 'Wh', 100, cuda=cuda), '\n')

print("Saving...")
save()

  5%|▌         | 100/2000 [02:54<57:06,  1.80s/it] 

[2m 54s (100 5%) 1.7931]
Whanas ang the and in tumury weras in thor. Herkerk (TM) apsine recermes (
SB  - BACF (TQA The in scum 



 10%|█         | 200/2000 [05:46<51:53,  1.73s/it]

[5m 46s (200 10%) 1.4480]
Whytics
MH  - Adult
MH  - Ordags
MH  - Contite Skuakizations D
AU  - Brainei A
FAU - Recang, F
AU  - L 



 15%|█▌        | 300/2000 [08:43<49:31,  1.75s/it]  

[8m 43s (300 15%) 1.2067]
Whuan purtaliate benics and anoticle
      study in the poprotective 
      immunomogressy group prost 



 20%|██        | 400/2000 [11:37<47:50,  1.79s/it]

[11m 37s (400 20%) 1.1709]
Whit, Son KC
FAU - Valgerferraha, Krano
AU  - El M
FAU - Traki, Micha
AU  - Sentotwiel J
FAU - Mida Ba 



 25%|██▌       | 500/2000 [14:29<42:35,  1.70s/it]

[14m 29s (500 25%) 1.1298]
Wheliation compared to the been strement and and PGC/Ried to
      patients with by and the supported  



 30%|███       | 600/2000 [17:24<39:35,  1.70s/it]

[17m 24s (600 30%) 1.1121]
Whin (c) 2012 Elsevier Sequence. In HCD4-IIF-2.07%, and 40 was multiver decreased in the progrative/me 



 35%|███▌      | 700/2000 [20:16<37:50,  1.75s/it]

[20m 16s (700 35%) 1.0367]
Whed Resign, Thow-Flts a symphological such Society OMCA protein, human cancer is evaluate the
      t 



 40%|████      | 800/2000 [23:08<35:25,  1.77s/it]

[23m 8s (800 40%) 1.0687]
Whreary, Young Neoplasms/chemistry/*metabolism
MH  - *Sensity, Department of Urology
MH  - Netherlands 



 45%|████▌     | 900/2000 [26:06<36:13,  1.98s/it]

[26m 6s (900 45%) 1.0298]
Whuie and
      and localized system
      of the NIPC chemitherapy
OT  - ended to complecteria of the 



 50%|█████     | 1000/2000 [29:01<30:03,  1.80s/it]

[29m 1s (1000 50%) 1.0538]
Whes sembard cancer and breast carcinomase Dadds (191; Phy 3). The patients
      provides assessed an 



 55%|█████▌    | 1100/2000 [32:05<27:12,  1.81s/it]

[32m 5s (1100 55%) 1.0394]
Whard Fundestic Acids/*therapeutic use
MH  - Enzymial Protein/analysis/*adverse effects/metabolism/pat 



 60%|██████    | 1200/2000 [34:59<23:21,  1.75s/it]

[34m 59s (1200 60%) 0.9627]
Wh protein, hulm a lings. In
      will four surgical of more levels, in bion and regulated in the eas 



 65%|██████▌   | 1300/2000 [37:53<20:11,  1.73s/it]

[37m 53s (1300 65%) 0.9973]
Wh/TOR JUTOMET- NODS: Taignal Pharmaceutical Cell
      Yenosis, Patterning
MH  - Polyphrom Histone Re 



 70%|███████   | 1400/2000 [40:49<17:07,  1.71s/it]

[40m 49s (1400 70%) 1.0118]
Wh prochemistancy. UL/United States and inactiductic cancer. RESULTS: Aptory of metastases. Patients w 



 75%|███████▌  | 1500/2000 [43:44<14:26,  1.73s/it]

[43m 44s (1500 75%) 1.0189]
Wh
PS  - 1R428-1425 (Print)
IS  - 0022-4514 (Linking)
VI  - 33
IP  - 6
DP  - 2012 Dact 10.88-2662-6000 



 80%|████████  | 1600/2000 [46:39<11:43,  1.76s/it]

[46m 39s (1600 80%) 0.9519]
Whi and cisplatin and intervention of orkers. Herein cells of the decreased in 27 were beas ablable at 



 85%|████████▌ | 1700/2000 [49:43<08:44,  1.75s/it]

[49m 43s (1700 85%) 0.9318]
Wheaus Cancer-Cells, Colon(2)(5) to rectal on the HSVs in woming
      participate also information an 



 90%|█████████ | 1800/2000 [52:38<05:54,  1.77s/it]

[52m 38s (1800 90%) 0.9610]
Whe directs analysis in vistem. METHOD: Bacterial oxidative in some Kapea 
      OS or self-care in ga 



 95%|█████████▌| 1900/2000 [55:33<02:56,  1.76s/it]

[55m 33s (1900 95%) 0.9823]
Wha indicates for negative statistically sample. The improve approaches of the however and medical 
   



100%|██████████| 2000/2000 [58:25<00:00,  1.75s/it]

[58m 25s (2000 100%) 1.0069]
Whar C
FIR - Cattogo, Andrie
IR  - Pertad G
FIR - Roserenst, Rean
IR  - Park A
FIR - Delberg, Taka
AU  

Saving...
Saved as medline.0.pt





In [None]:
# Reload the checkpoint written by save() and sample 2000 characters that
# continue the prompt "PMID".
# NOTE: torch.load unpickles a whole model object here, and unpickling can run
# arbitrary code, so only load checkpoint files you created yourself.
# NOTE(review): newer PyTorch releases restrict torch.load to weights-only
# loading by default, which may reject this file. Confirm against the installed
# version.
generate(decoder=torch.load("medline.0.pt"), prime_str="PMID", predict_len=2000, temperature=0.8, cuda=True)

'PMID- 23017638\nOWN - NLM\nSTAT- MEDLINE\nDCOM- 20130313\nLR  - 20171125\nIS  - 1568-7142 (Electronic)\nIS  - 0009-4963 (Linking)\nVI  - 143\nIP  - 1\nDP  - 2013 Feb\nTI  - The simulate of\n      limited the simulating number of the progression and positive\n      not assays, demethy, its receptor response to the increase of CAT, use of the mice.\nPG  - 697-647 CPHD-100254241005 [pii]\nAID - 10.1158/1078-01007-1-123. Epub 2012 Sep 27.\n\nPMID- 22989364\nOWN - NLM\nSTAT- MEDLINE\nDCOM- 20130521\nLR  - 20171108\nIS  - 1539-3627 (Electronic)\nIS  - 0742-1865 (Linking)\nVI  - 18\nIP  - 11\nDP  - 2012 Dec\nTI  - Cancer carbold who various cell lines in Biol directly that\n      someter for helpyly cell cancer and 10 healthy show use of RATC3 delivery of\n      the cisplatin-casurgery therapy of study, we should be the diagnosis\n      interactions, and has \n      with cancer health targeting the pathways and\n      performed to level from large discus symptom in human PSA (3041) and we to