#COS 485 Character level RNN
The source is based on https://github.com/spro/char-rnn.pytorch

In [0]:
# http://pytorch.org/
# Notebook bootstrap: install a pinned PyTorch 0.3.0.post4 wheel that matches
# this interpreter, plus torchvision.
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag

# Wheel tag for the running interpreter, formatted as "<impl><version>-<abi>".
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

# Pick the 'cu80' wheel directory (presumably the CUDA 8.0 build) when the
# nvidia-smi binary is present at /opt/bin, otherwise the CPU-only one.
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

# IPython shell escape: {accelerator} and {platform} are interpolated from the
# Python variables above.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision

In [0]:
!pip install -q tqdm
from tqdm import tqdm

In [0]:
import torch
import math
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torchvision
import numpy as np
import matplotlib.pyplot as plt
from torch.autograd import Variable
import torchvision.transforms as transforms
from IPython import display
import time


In [0]:
# Global switch: when True, the cells below move the model, the training
# batches and the hidden state to the GPU with .cuda().
use_cuda = False

# Dataset
Download Shakespeare, preprocess and Display some examples

In [6]:
import requests
import string
import random

# Vocabulary: every character of string.printable; a character's position in
# this string is its integer index.
all_characters = string.printable
# Vocabulary size, used below as both the input and output size of the model.
n_characters = len(all_characters)

def DownloadFile(url):
    """Fetch ``url`` over HTTP and return the response body as text.

    Args:
        url: Address of the resource to download.

    Returns:
        The decoded response body (``requests.Response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error page is never mistaken for the training corpus.
    """
    response = requests.get(url)
    # Fail loudly on HTTP errors instead of silently returning the error body.
    response.raise_for_status()
    return response.text

def char_tensor(string):
    """Encode ``string`` as a 1-D LongTensor of indices into ``all_characters``.

    Args:
        string: Text to encode. (The parameter name shadows the ``string``
            module inside this function; it is kept for caller compatibility.)

    Returns:
        A LongTensor of length ``len(string)``. Characters that do not occur
        in ``all_characters`` keep the zero the tensor was initialised with.
    """
    tensor = torch.zeros(len(string)).long()
    for position, ch in enumerate(string):
        # str.find returns -1 for an unknown character, so no exception has
        # to be swallowed and genuine errors (e.g. a non-str element) surface.
        index = all_characters.find(ch)
        if index >= 0:
            tensor[position] = index
    return tensor

def random_training_set(chunk_len, batch_size, file):
    """Sample a random batch of (input, target) character sequences.

    Each row is a random window of ``chunk_len + 1`` characters from ``file``;
    the input is the window without its last character and the target is the
    window without its first one (i.e. the input shifted by one).

    Args:
        chunk_len: Number of time steps per sequence.
        batch_size: Number of sequences in the batch.
        file: The corpus as a string.

    Returns:
        ``(inp, target)``: two LongTensor Variables of shape
        ``(batch_size, chunk_len)``, moved to the GPU when ``use_cuda`` is set.

    Raises:
        ValueError: If ``file`` has fewer than ``chunk_len + 1`` characters.
    """
    # A window needs chunk_len + 1 characters, and random.randint is inclusive
    # on both ends, so the last valid start is len(file) - chunk_len - 1.
    # (Using len(file) - chunk_len could yield a window that is one character
    # short and make the row assignment below fail with a size mismatch.)
    last_start = len(file) - chunk_len - 1
    if last_start < 0:
        raise ValueError(
            "file must contain at least chunk_len + 1 = %d characters, got %d"
            % (chunk_len + 1, len(file))
        )

    inp = torch.LongTensor(batch_size, chunk_len)
    target = torch.LongTensor(batch_size, chunk_len)
    for bi in range(batch_size):
        start_index = random.randint(0, last_start)
        chunk = file[start_index:start_index + chunk_len + 1]
        inp[bi] = char_tensor(chunk[:-1])
        target[bi] = char_tensor(chunk[1:])

    inp = Variable(inp)
    target = Variable(target)
    if use_cuda:
        inp = inp.cuda()
        target = target.cuda()
    return inp, target

def time_since(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    Args:
        since: A reference timestamp as returned by ``time.time()``.

    Returns:
        A string with whole minutes and the remaining whole seconds.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)
  
# Download the Shakespeare corpus once; `data` is the whole text as one string
# and is what the training cells sample from.
target_url = "https://raw.githubusercontent.com/cos495/code/master/shakespeare.txt"
data = DownloadFile(target_url)
#print(random_training_set(10, 8, data))
# Preview a short slice of the raw text.
print(data[10:100])

zen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


#Model
In this code we use PyTorch's built-in recurrent layers, `nn.RNN` and `nn.LSTM`, for the recurrent cell computation.

In [0]:
# https://github.com/spro/char-rnn.pytorch
class CharRNN(nn.Module):
    """Character-level language model: embedding -> RNN/LSTM -> linear decoder.

    The model is stepped one character at a time: ``forward`` consumes a batch
    of character indices for a single time step together with the recurrent
    state and returns the scores for the next character plus the new state.

    Args:
        input_size: Vocabulary size (number of rows of the embedding).
        hidden_size: Embedding size and recurrent hidden size.
        output_size: Number of output classes of the decoder.
        model: ``"lstm"`` (case-insensitive) selects ``nn.LSTM``; any other
            value selects ``nn.RNN``.
        n_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size, hidden_size, output_size, model="rnn", n_layers=1):
        super(CharRNN, self).__init__()
        self.model = model.lower()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        # Branch on the normalised name so that e.g. "LSTM" builds the same
        # cell type that init_hidden() prepares the state for.
        if self.model == "lstm":
            self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers)
        else:
            self.rnn = nn.RNN(hidden_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Advance the model by one time step.

        Args:
            input: 1-D LongTensor Variable with one character index per batch
                element.
            hidden: Recurrent state as produced by ``init_hidden`` or by a
                previous call.

        Returns:
            ``(output, hidden)``: scores of shape ``(batch, output_size)`` and
            the updated recurrent state.
        """
        batch_size = input.size(0)
        encoded = self.encoder(input)
        # The recurrent layer expects (seq_len, batch, features); seq_len is 1.
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        return output, hidden

    def _zero_state(self, batch_size):
        """Return a zero ``(n_layers, batch_size, hidden_size)`` state Variable."""
        state = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        # Allocate next to the weights so a model moved to the GPU gets a GPU
        # state without the caller having to convert it.
        if next(self.parameters()).is_cuda:
            state = state.cuda()
        return Variable(state)

    def init_hidden(self, batch_size):
        """Create the initial recurrent state for a batch.

        Returns:
            A single zero Variable for the plain RNN, or an ``(h, c)`` tuple
            of zero Variables for the LSTM.
        """
        if self.model == "lstm":
            return (self._zero_state(batch_size), self._zero_state(batch_size))
        return self._zero_state(batch_size)

#Train

###Initialize the model

In [0]:
# Hyper-parameters for the plain-RNN experiment.
hidden_size = 100      # embedding size and recurrent hidden size
learning_rate = 0.01   # Adam learning rate
cell = "rnn"           # recurrent cell type passed to CharRNN ("rnn" or "lstm")
n_layers = 2           # number of stacked recurrent layers

# Input and output sizes are both the vocabulary size: the model reads a
# character index and scores every character as the next one.
decoder = CharRNN(
    n_characters,
    hidden_size,
    n_characters,
    model=cell,
    n_layers=n_layers,
)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

if use_cuda:
    decoder.cuda()

In [0]:
n_epochs = 2000     # number of training iterations (one random batch each)
chunk_len = 200     # characters (time steps) per training sequence
print_every = 100   # report the loss and a sample every this many iterations
batch_size = 100    # sequences per batch

In [0]:
def train(inp, target):
    """Run one optimisation step on a batch and return the mean per-step loss.

    Uses the module-level ``decoder``, ``decoder_optimizer``, ``criterion``
    and ``use_cuda``.

    Args:
        inp: LongTensor Variable of shape ``(batch, steps)`` with input
            character indices.
        target: LongTensor Variable of the same shape with the next-character
            targets.

    Returns:
        The summed cross-entropy over all time steps divided by the number of
        time steps.
    """
    # Take the sizes from the batch itself rather than from globals, so the
    # function stays correct if it is called with a differently sized batch.
    n_batch = inp.size(0)
    n_steps = inp.size(1)

    hidden = decoder.init_hidden(n_batch)
    if use_cuda:
        # The LSTM state is an (h, c) tuple, which has no .cuda() of its own.
        if isinstance(hidden, tuple):
            hidden = tuple(state.cuda() for state in hidden)
        else:
            hidden = hidden.cuda()
    decoder.zero_grad()
    loss = 0

    for c in range(n_steps):
        output, hidden = decoder(inp[:, c], hidden)
        loss += criterion(output.view(n_batch, -1), target[:, c])

    loss.backward()
    decoder_optimizer.step()

    # loss.data[0] is the scalar-extraction idiom of the PyTorch 0.3 release
    # this notebook installs.
    return loss.data[0] / n_steps

# Generate Text

In [0]:
def generate(decoder, prime_str='A', predict_len=100, temperature=0.8, cuda=False):
    """Sample text from ``decoder`` one character at a time.

    Args:
        decoder: Model exposing ``init_hidden(batch_size)`` and a call
            ``decoder(input, hidden) -> (output, hidden)``.
        prime_str: Non-empty seed text; all but its last character are fed
            through the model to build up the hidden state, and the last one
            is the first input of the sampling loop.
        predict_len: Number of characters to sample.
        temperature: Divisor applied to the output scores before sampling;
            smaller values sharpen the distribution, larger values flatten it.
        cuda: Move the state and the inputs to the GPU.

    Returns:
        ``prime_str`` followed by ``predict_len`` sampled characters.
    """
    hidden = decoder.init_hidden(1)
    prime_input = Variable(char_tensor(prime_str).unsqueeze(0))

    if cuda:
        # The LSTM state is an (h, c) tuple, which has no .cuda() of its own.
        if isinstance(hidden, tuple):
            hidden = tuple(state.cuda() for state in hidden)
        else:
            hidden = hidden.cuda()
        prime_input = prime_input.cuda()

    # Use priming string to "build up" hidden state
    for p in range(len(prime_str) - 1):
        _, hidden = decoder(prime_input[:, p], hidden)

    inp = prime_input[:, -1]
    pieces = [prime_str]

    for _ in range(predict_len):
        output, hidden = decoder(inp, hidden)

        # Sample from the network as a multinomial distribution.
        # Subtracting the maximum before exp() leaves the normalised
        # distribution unchanged but keeps exp() from overflowing to inf when
        # the temperature is very small.
        logits = output.data.view(-1).div(temperature)
        output_dist = (logits - logits.max()).exp()
        top_i = torch.multinomial(output_dist, 1)[0]

        # Add predicted character to string and use as next input
        predicted_char = all_characters[top_i]
        pieces.append(predicted_char)
        inp = Variable(char_tensor(predicted_char).unsqueeze(0))
        if cuda:
            inp = inp.cuda()

    return ''.join(pieces)

In [12]:
# Training loop for the plain-RNN model.
start = time.time()
# NOTE(review): within this cell all_losses is never appended to and loss_avg
# is accumulated but never reported or reset.
all_losses = []
loss_avg = 0

print("Training for %d epochs..." % n_epochs)
for epoch in tqdm(range(1, n_epochs + 1)):
    # One "epoch" here is a single optimisation step on one random batch.
    loss = train(*random_training_set(chunk_len, batch_size, data))
    loss_avg += loss

    if epoch % print_every == 0:
        # Progress line: elapsed time, iteration, percent done, current loss.
        print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / n_epochs * 100, loss))
        print('loss: ', loss)
        # Show a 100-character sample primed with 'Wh'.
        print(generate(decoder, 'Wh', 100, cuda=use_cuda), '\n')

  0%|          | 0/2000 [00:00<?, ?it/s]

Training for 2000 epochs...


  5%|▌         | 100/2000 [01:08<21:44,  1.46it/s]

[1m 8s (100 5%) 1.7881]
loss:  1.7881326293945312
Which seine to hear not for the sink now, struch from to whick capparoughy the hering ipon you
I rest  



 10%|█         | 200/2000 [02:16<20:30,  1.46it/s]

[2m 16s (200 10%) 1.6331]
loss:  1.633067626953125
Which to the more to her:
Thee for blows thearies and it fair the old to the back my do; than my sain  



 15%|█▌        | 300/2000 [03:24<19:17,  1.47it/s]

[3m 24s (300 15%) 1.5988]
loss:  1.5987567138671874
Whyself shrity, with a mother complone.

JULIET:
Ehan with my, she to a kneen to come, what is the sou 



 20%|██        | 400/2000 [04:31<18:04,  1.47it/s]

[4m 31s (400 20%) 1.5535]
loss:  1.5535232543945312
What conserricious look best the rather, with offection
Unnarage?

First Servants to him severate to o 



 25%|██▌       | 500/2000 [05:38<16:55,  1.48it/s]

[5m 38s (500 25%) 1.5560]
loss:  1.5559506225585937
Whone desion out with honour and daughter! O, and I have hastings unclaim in this wit the rest be this 



 30%|███       | 600/2000 [06:45<15:45,  1.48it/s]

[6m 45s (600 30%) 1.5153]
loss:  1.5152717590332032
Why, that 'I hath the creparry, my lady, consent not bastes become to than all
And commons by my heade 



 35%|███▌      | 700/2000 [07:52<14:37,  1.48it/s]

[7m 52s (700 35%) 1.4771]
loss:  1.477056427001953
Why was most dieldon
To seen more to fools, then, somethmanblotted speak with honest and the need.

LU 



 40%|████      | 800/2000 [08:59<13:29,  1.48it/s]

[8m 59s (800 40%) 1.4911]
loss:  1.4910984802246094
What is not my father,
The purpome, and shall stipit.

MENENIUS:
I speak, and thou art thou sister thi 



 45%|████▌     | 900/2000 [10:07<12:22,  1.48it/s]

[10m 7s (900 45%) 1.5156]
loss:  1.5156369018554687
What is thy dires are well his dambles to do.

GREMIO:
Beseive he first! and thou wast be stand thee,  



 50%|█████     | 1000/2000 [11:15<11:15,  1.48it/s]

[11m 15s (1000 50%) 1.4844]
loss:  1.4844248962402344
Which shall she king him to me, I can I have dut's wrong to the title that art day, to have answer eno 



 55%|█████▌    | 1100/2000 [12:23<10:08,  1.48it/s]

[12m 23s (1100 55%) 1.4735]
loss:  1.4734890747070313
Which state upon far speakness'd pass!

AUTOLYCUS:
What what wing to him;
And provoke the profits are  



 60%|██████    | 1200/2000 [13:31<09:01,  1.48it/s]

[13m 31s (1200 60%) 1.5023]
loss:  1.5023362731933594
Who crown, I
Amen; were the morneretences him of him;
Was play incless them he cannot my lord my graci 



 65%|██████▌   | 1300/2000 [14:39<07:53,  1.48it/s]

[14m 39s (1300 65%) 1.4652]
loss:  1.4651730346679688
What, she hath some gentle own nack.

FRIAR LAURENCE:
To levent.
If that profio,
For it it will not in 



 70%|███████   | 1400/2000 [15:46<06:45,  1.48it/s]

[15m 46s (1400 70%) 1.4395]
loss:  1.439539794921875
While and like more no hay issignot the sweet Lord:
Without another!
We till stay; and have hastings,  



 75%|███████▌  | 1500/2000 [16:54<05:38,  1.48it/s]

[16m 54s (1500 75%) 1.4714]
loss:  1.4714414978027344
Wherefore I will I do comes the godst less but the surmeed as hasth
The fire.

LUCIO:
Most father, the 



 80%|████████  | 1600/2000 [18:01<04:30,  1.48it/s]

[18m 1s (1600 80%) 1.4457]
loss:  1.4456869506835937
Whose it, she shall be more on the good,
Than some sprear with sweet work.

CLAUDIO:
Then, but was yel 



 85%|████████▌ | 1700/2000 [19:11<03:23,  1.48it/s]

[19m 11s (1700 85%) 1.4469]
loss:  1.4469297790527345
Whist you is thy lords.

MAMILLIUS:
That we shall must be your crain in sure.;
Why then in prove, do s 



 90%|█████████ | 1800/2000 [20:18<02:15,  1.48it/s]

[20m 18s (1800 90%) 1.4564]
loss:  1.4564405822753905
While here let that house thy master'd how this love stricks, thunkel lose the poor proms,
The people  



 95%|█████████▌| 1900/2000 [21:25<01:07,  1.48it/s]

[21m 25s (1900 95%) 1.4482]
loss:  1.4481936645507814
Whither yet.

KING RICHARD III:
She in proved,
Which that no chilite of sorrow and riege me Murge shal 



100%|██████████| 2000/2000 [22:32<00:00,  1.48it/s]

[22m 32s (2000 100%) 1.4456]
loss:  1.4456103515625
What love and grave?

DUKE VINCENTIO:
That here 'go to the boys you do;
Sab'st both on a presers: poor 







### Let's try sampling with high temperature:

In [13]:
# Scores are divided by the temperature before sampling, so 100 flattens the
# distribution towards uniform.
generate(decoder, prime_str="A", temperature= 100, cuda=use_cuda)

"A9\x0c,\rh!a3nBO,cJ1U^#SA|rxu7Ho.-g2|(E.]B*y|8-'3lO-,{=CjXyd'/+@%CVzYuXC)nBrlt~;E`>+\r=:me)+k>u[iHg-:Y5w;j"

### Let's try sampling with low temperature:

In [14]:
# Scores are divided by the temperature before sampling, so 0.001 magnifies
# the differences between them a thousandfold.
generate(decoder, prime_str="A", temperature= 0.001, cuda=use_cuda)

'Abaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaabaab'

### Describe the difference
How do the samples qualitatively change? What does changing the temperature do to the distribution of possible outputs?

**High temperatures cause the model to take more chances and increase diversity of results, but at a cost of more mistakes. Low temperature will cause the model to make more likely, but also more conservative predictions.** 


### Let's try sampling with reasonable temperature:

In [29]:
# Moderate temperature: sharper than the default of 0.8.
generate(decoder, prime_str="A", temperature= 0.25, cuda=use_cuda)

'And shall be his father, and the state to the world, the proceed to the bring the lands the prove the'

**When the temperature is around 0.25, the model generates a reasonable sentence based on the prime_str "A".**

### Example
Insert most meaningful sentence that the network generated, change `prime_str`

In [15]:
# Default temperature (0.8) and length (100 characters), primed with "why".
generate(decoder, prime_str="why", cuda=use_cuda)

'why, and to my life, sir, we thou hast more good,\nOr shows for a good and king, and high to kill for hi'

### Repeat the sample experiments with nn.LSTM

In [0]:
# Hyper-parameters for the LSTM experiment; only the cell type differs from
# the plain-RNN setup.
hidden_size = 100      # embedding size and recurrent hidden size
learning_rate = 0.01   # Adam learning rate
cell = "lstm"          # recurrent cell type passed to CharRNN
n_layers = 2           # number of stacked recurrent layers

# Rebinds the module-level decoder, optimizer and criterion, replacing the
# previously trained RNN model.
decoder = CharRNN(
    n_characters,
    hidden_size,
    n_characters,
    model=cell,
    n_layers=n_layers,
)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

if use_cuda:
    decoder.cuda()

In [0]:
n_epochs = 2000     # number of training iterations (one random batch each)
chunk_len = 200     # characters (time steps) per training sequence
print_every = 100   # report the loss and a sample every this many iterations
batch_size = 100    # sequences per batch

In [0]:
def train(inp, target):
    """Run one optimisation step on a batch and return the mean per-step loss.

    Uses the module-level ``decoder``, ``decoder_optimizer``, ``criterion``
    and ``use_cuda``.

    Args:
        inp: LongTensor Variable of shape ``(batch, steps)`` with input
            character indices.
        target: LongTensor Variable of the same shape with the next-character
            targets.

    Returns:
        The summed cross-entropy over all time steps divided by the number of
        time steps.
    """
    # Take the sizes from the batch itself rather than from globals, so the
    # function stays correct if it is called with a differently sized batch.
    n_batch = inp.size(0)
    n_steps = inp.size(1)

    hidden = decoder.init_hidden(n_batch)
    if use_cuda:
        # The LSTM state is an (h, c) tuple, which has no .cuda() of its own.
        if isinstance(hidden, tuple):
            hidden = tuple(state.cuda() for state in hidden)
        else:
            hidden = hidden.cuda()
    decoder.zero_grad()
    loss = 0

    for c in range(n_steps):
        output, hidden = decoder(inp[:, c], hidden)
        loss += criterion(output.view(n_batch, -1), target[:, c])

    loss.backward()
    decoder_optimizer.step()

    # loss.data[0] is the scalar-extraction idiom of the PyTorch 0.3 release
    # this notebook installs.
    return loss.data[0] / n_steps

In [35]:
# Training loop for the LSTM model.
start = time.time()
# NOTE(review): within this cell all_losses is never appended to and loss_avg
# is accumulated but never reported or reset.
all_losses = []
loss_avg = 0

print("Training for %d epochs..." % n_epochs)
for epoch in tqdm(range(1, n_epochs + 1)):
    # One "epoch" here is a single optimisation step on one random batch.
    loss = train(*random_training_set(chunk_len, batch_size, data))
    loss_avg += loss

    if epoch % print_every == 0:
        # Progress line: elapsed time, iteration, percent done, current loss.
        print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / n_epochs * 100, loss))
        print('loss: ', loss)
        # Show a 100-character sample primed with 'Wh'.
        print(generate(decoder, 'Wh', 100, cuda=use_cuda), '\n')

  0%|          | 0/2000 [00:00<?, ?it/s]

Training for 2000 epochs...


  5%|▌         | 100/2000 [02:29<47:12,  1.49s/it]

[2m 28s (100 5%) 1.3378]
loss:  1.337754669189453
Which we hear me for worthing shall bears me to
epoice of his love, sir: a shin'd admitted of other ch 



 10%|█         | 200/2000 [04:55<44:18,  1.48s/it]

[4m 55s (200 10%) 1.3370]
loss:  1.3369523620605468
Which I shall my husband here would and weep the house,
That a brother Randasted of all
water shall le 



 15%|█▌        | 300/2000 [07:21<41:39,  1.47s/it]

[7m 21s (300 15%) 1.3156]
loss:  1.3156253051757814
Where I will says you are they a name to
most death of this head in this acted as the rest.

COMINIUS: 



 20%|██        | 400/2000 [09:45<39:02,  1.46s/it]

[9m 45s (400 20%) 1.3071]
loss:  1.3071195983886719
Which her light no bewing in this part good again
That must by the throne's mission,
And leave us deat 



 25%|██▌       | 500/2000 [12:09<36:28,  1.46s/it]

[12m 9s (500 25%) 1.3012]
loss:  1.3011962890625
Which left with more of this depart
: Countetford at the day, and with stretched much fight.

CORIOLAN 



 30%|███       | 600/2000 [14:32<33:55,  1.45s/it]

[14m 32s (600 30%) 1.3155]
loss:  1.3154891967773437
What
I am glesses to blow betimes, good Lucio,
Thy grave his mid it of a gentleman,
And leave note in  



 35%|███▌      | 700/2000 [16:57<31:29,  1.45s/it]

[16m 57s (700 35%) 1.3299]
loss:  1.3298904418945312
Why heart,
Or how we will dared to the sun, for I
fare will all this is a place, and they then well ag 



 40%|████      | 800/2000 [19:22<29:04,  1.45s/it]

[19m 22s (800 40%) 1.3004]
loss:  1.3003900146484375
What are some from Coriolioring!
They go a word and Loverabe:
The sin that but there! Stand Jood see t 



 45%|████▌     | 900/2000 [21:47<26:37,  1.45s/it]

[21m 47s (900 45%) 1.3193]
loss:  1.3192735290527344
Where is my soul, I warrant your good sore!

PETRUCHIO:
Villain!

PERDITA:
O, then they canst thou has 



 50%|█████     | 1000/2000 [24:10<24:10,  1.45s/it]

[24m 10s (1000 50%) 1.3013]
loss:  1.3013052368164062
Where is the rest:
What thorth of his heart and to a deed; there
I came and sound to ruin you now, wit 



 55%|█████▌    | 1100/2000 [26:37<21:47,  1.45s/it]

[26m 37s (1100 55%) 1.2836]
loss:  1.2836212158203124
What to see slever him appears his life,
The account you and his power: he did myself,
We shall gueds  



 60%|██████    | 1200/2000 [29:03<19:22,  1.45s/it]

[29m 3s (1200 60%) 1.3305]
loss:  1.3304866027832032
Wherefore, mercy commend of a follo,
And rash flood or this death.

LEONTES:
Cousin, my goodly little, 



 65%|██████▌   | 1300/2000 [31:27<16:56,  1.45s/it]

[31m 27s (1300 65%) 1.3111]
loss:  1.3111160278320313
Whith each of the redfright begin,
And that thou art thou wilt be means
Is well-adanted on all prove y 



 70%|███████   | 1400/2000 [33:52<14:31,  1.45s/it]

[33m 52s (1400 70%) 1.2861]
loss:  1.286078643798828
Why, that you like a town being son.

KING RICHARD III:
So more, this is my heart, my lord!

GLOUCESTE 



 75%|███████▌  | 1500/2000 [36:17<12:05,  1.45s/it]

[36m 17s (1500 75%) 1.2830]
loss:  1.282981414794922
Where Margaret death is rantaguain,
To be not, and it is their cleaks of the vice of the war.

PRINCE: 



 80%|████████  | 1600/2000 [38:41<09:40,  1.45s/it]

[38m 40s (1600 80%) 1.3163]
loss:  1.3162733459472655
Where spoke no morning with the reserves of
while, not, with commilumpner than the man: then.

CAMILLO 



 85%|████████▌ | 1700/2000 [41:04<07:14,  1.45s/it]

[41m 4s (1700 85%) 1.2913]
loss:  1.291254425048828
Where all she could think by no Dercesse them obedient his street?

HENRY BOLINGBROKE:
Find, that is a 



 90%|█████████ | 1800/2000 [43:28<04:49,  1.45s/it]

[43m 28s (1800 90%) 1.3074]
loss:  1.3074148559570313
Which who be gone with the time and
lices of good contracted as so both be breath.

Second Murderer:
I 



 95%|█████████▌| 1900/2000 [45:52<02:24,  1.45s/it]

[45m 52s (1900 95%) 1.2940]
loss:  1.2940036010742189
What consent anger is thy house;
Shall I would not shall the prince to enjoy him,
Even no more short!  



100%|██████████| 2000/2000 [48:17<00:00,  1.45s/it]

[48m 17s (2000 100%) 1.2852]
loss:  1.28523681640625
Where is yours; but the tincest!

LARTIUS:
How hate my son:
Thou hast truth to the chance that would I 







### Let's try sampling with high temperature:

In [36]:
# Scores are divided by the temperature before sampling, so 100 flattens the
# distribution towards uniform.
generate(decoder, prime_str="A", temperature= 100, cuda=use_cuda)

'AmJ%xuz."7!4dZOcMlG#v+//63Rjre-LT/^"zFMN4{@!3#YDi<\nM!!h\r?Cy~0X$lC7g>jiZ4(1(zCHwu\x0cm*Z+Z/(f_^`>)OU**;m='


### Let's try sampling with low temperature:

In [37]:
# Scores are divided by the temperature before sampling, so 0.001 magnifies
# the differences between them a thousandfold.
generate(decoder, prime_str="A", temperature= 0.001, cuda=use_cuda)

'Aaabaaaaabaaaaabaaaaabaaaaabaaaaabaaaaabaaaaacaaaacaaaabaaaaabaaaaabaaaaabaaaaabaaaaabaaaaacaaaacaaaa'


### Let's try sampling with reasonable temperature:

In [41]:
# Moderate temperature: sharper than the default of 0.8.
generate(decoder, prime_str="A", temperature= 0.25, cuda=use_cuda)

'And then I have been a cause to the state\nThe sea and the best of the state of his poor brother\nAnd t'

### Cross Entropy per letter in bits and Perplexity results on prediction

#Cell Types

###Elman Cell Computation
An Elman RNN cell with tanh or ReLU non-linearity.

$h' = \tanh(w_{ih} x + b_{ih}  +  w_{hh} h + b_{hh})$

In [0]:
class RNNCell(nn.Module):
    """Single-step Elman RNN cell with a linear read-out.

    Computes::

        h' = tanh(x W_ih + b_ih + h W_hh + b_hh)
        y  = h' V + b_hy

    Inputs are row vectors, so weights are stored as ``(in, out)`` matrices.

    Args:
        input_size: Number of features of the input ``x``.
        hidden_size: Number of features of the hidden state ``h``.
        output_size: Number of features of the read-out ``y``.
        n_layers: Stored for interface parity with ``CharRNN``; this cell
            implements a single layer and does not use it.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(RNNCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        # nn.Parameter registers the tensors with the module, so they show up
        # in parameters(), receive gradients and follow .cuda().
        # Weights ~ N(0, 0.2^2); biases start at zero.
        self.W_ih = nn.Parameter(torch.randn(input_size, hidden_size) * 0.2)
        self.W_hh = nn.Parameter(torch.randn(hidden_size, hidden_size) * 0.2)
        self.b_ih = nn.Parameter(torch.zeros(1, hidden_size))
        self.b_hh = nn.Parameter(torch.zeros(1, hidden_size))
        self.V = nn.Parameter(torch.randn(hidden_size, output_size) * 0.2)
        self.b_hy = nn.Parameter(torch.zeros(1, output_size))

    def forward(self, input, hidden):
        """Advance the cell by one time step.

        Args:
            input: Tensor of shape ``(batch, input_size)``.
            hidden: Previous hidden state of shape ``(batch, hidden_size)``.

        Returns:
            ``(output, hidden)`` with shapes ``(batch, output_size)`` and
            ``(batch, hidden_size)``.
        """
        hidden = torch.tanh(
            torch.matmul(input, self.W_ih) + self.b_ih
            + torch.matmul(hidden, self.W_hh) + self.b_hh
        )
        output = torch.matmul(hidden, self.V) + self.b_hy
        return output, hidden

### LSTM Cell Computation
Implement LSTM cell computation described by the following expression

$i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
c_t = f_t c_{(t-1)} + i_t g_t \\
h_t = o_t \tanh(c_t)$


In [0]:
class LSTMCell(nn.Module):
    """Single-step LSTM cell.

    Computes::

        i = sigmoid(x W_ii + b_ii + h W_hi + b_hi)
        f = sigmoid(x W_if + b_if + h W_hf + b_hf)
        g = tanh   (x W_ig + b_ig + h W_hg + b_hg)
        o = sigmoid(x W_io + b_io + h W_ho + b_ho)
        c' = f * c + i * g
        h' = o * tanh(c')

    Inputs are row vectors, so ``W_i*`` are ``(input_size, hidden_size)`` and
    ``W_h*`` are ``(hidden_size, hidden_size)`` matrices.

    Args:
        input_size: Number of features of the input ``x``.
        hidden_size: Number of features of the hidden and cell states.
        output_size: Stored for interface parity with ``RNNCell``; this cell
            has no read-out layer and does not use it.
        n_layers: Stored only; this cell implements a single layer.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        # nn.Parameter registers the tensors with the module, so they show up
        # in parameters(), receive gradients and follow .cuda().
        # Weights ~ N(0, 0.2^2); biases start at zero.
        # Input gate
        self.W_ii = nn.Parameter(torch.randn(input_size, hidden_size) * 0.2)
        self.W_hi = nn.Parameter(torch.randn(hidden_size, hidden_size) * 0.2)
        self.b_ii = nn.Parameter(torch.zeros(1, hidden_size))
        self.b_hi = nn.Parameter(torch.zeros(1, hidden_size))

        # Forget gate
        self.W_if = nn.Parameter(torch.randn(input_size, hidden_size) * 0.2)
        self.W_hf = nn.Parameter(torch.randn(hidden_size, hidden_size) * 0.2)
        self.b_if = nn.Parameter(torch.zeros(1, hidden_size))
        self.b_hf = nn.Parameter(torch.zeros(1, hidden_size))

        # Candidate cell values
        self.W_ig = nn.Parameter(torch.randn(input_size, hidden_size) * 0.2)
        self.W_hg = nn.Parameter(torch.randn(hidden_size, hidden_size) * 0.2)
        self.b_ig = nn.Parameter(torch.zeros(1, hidden_size))
        self.b_hg = nn.Parameter(torch.zeros(1, hidden_size))

        # Output gate
        self.W_io = nn.Parameter(torch.randn(input_size, hidden_size) * 0.2)
        self.W_ho = nn.Parameter(torch.randn(hidden_size, hidden_size) * 0.2)
        self.b_io = nn.Parameter(torch.zeros(1, hidden_size))
        self.b_ho = nn.Parameter(torch.zeros(1, hidden_size))

    def forward(self, input, hidden, cell):
        """Advance the cell by one time step.

        Args:
            input: Tensor of shape ``(batch, input_size)``.
            hidden: Previous hidden state ``h`` of shape
                ``(batch, hidden_size)``.
            cell: Previous cell state ``c`` of shape ``(batch, hidden_size)``.

        Returns:
            ``(output, hidden, cell)``; ``output`` is the new hidden state
            ``h'`` (the cell's emitted value), ``hidden`` is ``h'`` again and
            ``cell`` is ``c'``.
        """
        i = torch.sigmoid(torch.matmul(input, self.W_ii) + self.b_ii
                          + torch.matmul(hidden, self.W_hi) + self.b_hi)
        f = torch.sigmoid(torch.matmul(input, self.W_if) + self.b_if
                          + torch.matmul(hidden, self.W_hf) + self.b_hf)
        g = torch.tanh(torch.matmul(input, self.W_ig) + self.b_ig
                       + torch.matmul(hidden, self.W_hg) + self.b_hg)
        o = torch.sigmoid(torch.matmul(input, self.W_io) + self.b_io
                          + torch.matmul(hidden, self.W_ho) + self.b_ho)

        cell = f * cell + i * g
        # The cell state is squashed with tanh before the output gate is applied.
        hidden = o * torch.tanh(cell)
        output = hidden

        return output, hidden, cell


### LSTM Gates
List all gates that LSTM uses and describe their role

ft : the forget gate layer. It looks at the previous hidden output h(t-1) and the input x, and outputs a number between 0 and 1. A 1
represents “completely keep this” while a 0 represents “completely get rid of this.”

it and gt : the input gate layer, which decides which values the network will update. The gate "ft" controls whether the previous cell state is kept or forgotten, while "it" (how much to write) and "gt" (the candidate values) determine what is added to the next cell state.

ot : the output gate layer, which controls the extent to which the value in the cell is used to compute the output activation of the LSTM unit. 




### Explain how LSTM Cell is different than Simple RNN? (why is it better or worse?)

The difference in structure: a simple RNN has a single neuron-like structure, whereas an LSTM cell has a much more complex structure in each repeating unit, including an input gate, a forget gate, an output gate and a memory cell.

The difference in performance: It is difficult for a standard simple RNN to solve problems that require learning long-term temporal dependencies. LSTM cells include a 'memory cell' that can maintain information in memory for long periods of time. This architecture lets LSTM learn longer-term dependencies. Simple RNNs suffer from vanishing and exploding gradient problems. LSTMs deal with these problems by introducing new gates, such as input and forget gates, which allow for a better control over the gradient flow and enable better preservation of “long-range dependencies”.


#Train CharRNN with your LSTM cell(Extra)