<h1>NLP - Language Modeling</h1>

In [1]:
# Imports
import os.path
from nltk.corpus import treebank
from collections import Counter, defaultdict
from random import choice, choices
import numpy
from math import log2, exp

<h2>Language Modeling with N_Grams - Tokens</h2>

<h3>Prepairing training and testing data</h3>

In [2]:
# Fetching text files
training_file_ids, testing_file_ids  = treebank.fileids()[:150], treebank.fileids()[150:]
training_raw_text, testing_raw_text = treebank.sents(fileids=training_file_ids), treebank.sents(fileids=testing_file_ids)

# Adding tokens to signal the begining and the end of sentences
def add_sentence_boundar(list_of_sentences):
    sb_list = []
    for sentence_list in list_of_sentences:
        sentence_list.insert(0, '<BOS>')
        sentence_list.append('<EOS>')
        sb_list.append(sentence_list)
    return sb_list

# Remove nltk wildcard chars
def removing_wildcard_chars(list_of_sentences):
    end_list = []
    for sentence_list in list_of_sentences:
        for token in sentence_list:
            if '*' in token:
                sentence_list.pop(sentence_list.index(token))
        end_list.append(sentence_list)
    return end_list

# Unpack list of lists to a single list of tokens
def unpack_lists(list_of_sentences):
    return [item for sublist in list_of_sentences for item in sublist]

# Replacing tokens that appear less than 3 times with the <UNK>
def replace_low_freq_tokens(list_of_tokens, occ_times, char='<UNK>'):
    if occ_times < 1:
        occ_times = 1
    for i in list_of_tokens:
        if list_of_tokens.count(i) < occ_times:
            list_of_tokens[list_of_tokens.index(i)] = char
    return list_of_tokens

# Extracting vocabulary
def get_covab(list_of_tokens):
    return list(set(list_of_tokens))

def replace_if_not_in_vocab(list_of_tokens, vocab):
    for i in list_of_tokens:
        if i not in vocab:
            list_of_tokens[list_of_tokens.index(i)] = '<UNK>'
    return list_of_tokens

# Extracting list out of text docs
def txt_to_list(txt_path):
    file_content = open(txt_path, 'r')
    content_list = []
    for line in file_content:
        content_list.append(line.strip())
    return(content_list)

# Creating and storing processed data, if file doesn't already exist
def storing_processed_text():
    if not os.path.exists("training_tokens.txt"):
        training_list = replace_low_freq_tokens(unpack_lists(add_sentence_boundar(removing_wildcard_chars(training_raw_text))),3)
        with open("training_tokens.txt", "w") as outfile:
            outfile.write("\n".join(training_list))

    if not os.path.exists("testing_tokens.txt"):
        testing_list_with_unknown = unpack_lists(add_sentence_boundar(removing_wildcard_chars(testing_raw_text)))
        try:
            vocab = get_covab(training_list)
        except:
            training_list = txt_to_list("training_tokens.txt")
            vocab = get_covab(training_list)
        testing_list = replace_if_not_in_vocab(testing_list_with_unknown, vocab)
        with open("testing_tokens.txt", "w") as outfile:
            outfile.write("\n".join(testing_list))
            
storing_processed_text()
training_list, test_list = txt_to_list("training_tokens.txt"), txt_to_list("testing_tokens.txt")
vocab = get_covab(training_list)

<h3>Bigram Model - Tokens</h3>

In [3]:
# Laplace Smoothing is set to default
class Bigram():
    def __init__(self):
        self.ngram = 2
        self.smoothing_type = 'Laplace'
        self.bigrams = {}
        self.bg_prob_dict = {}
        self.n_unigrams = defaultdict(int)
        self.n_bigrams = defaultdict(lambda: defaultdict(int))
        self.len = []
        self.perp = 0
        
    def fit(self, token_list):
        for i in range(1, len(token_list)):
            curr_bg = (token_list[i-1], token_list[i])
            self.n_bigrams[curr_bg[0]][curr_bg[1]] += 1
            self.n_unigrams[curr_bg[0]] += 1

            
    def update_len(self, token_list):
        for i in range(1, len(token_list)):
            bigram = (token_list[i-1], token_list[i])
            self.len.append(self.n_bigrams[bigram[0]][bigram[1]])
        n_bg = {}
        for token in self.len:
            if token in n_bg:
               n_bg[token] += 1
            else:
               n_bg[token] = 1
        return n_bg
    
    def get_bigram_prob(self, token_list):
            pstar, counter = 0, 0
            for i in range(1, len(token_list)):
                bigram = (token_list[i-1], token_list[i])
                w1, w2 = bigram[0], bigram[1]
                if self.smoothing_type == 'Laplace' or self.smoothing_type == 'Add 1':
                    pstar = 1.0 * self.n_bigrams[w1][w2] / self.n_unigrams[w1]
                    self.bg_prob_dict[bigram] = pstar
                elif self.smoothing_type == 'Good-Turing Discounting':
                    c = self.len[self.n_bigrams[w1][w1]] 
                    n_next = c + 1 
                    cstar = (self.n_bigrams[w2][w2] + 1) * n_next  / c
                    pstar = cstar/len(self.len)
                    self.bg_prob_dict[bigram] = pstar
            if self.smoothing_type == 'Good-Turing Discounting':
                self.update_len(token_list)
            return self.bg_prob_dict
        
    def predict_next_word(self, last_word):
        next_word = {}
        for k in self.bg_prob_dict:
            if k[0] == last_word:
                next_word[k[1]] = self.bg_prob_dict[k]
        k = Counter(next_word)
        high = k.most_common()
        answer = ''
        while answer == '' or answer == '<UNK>':
            try:
                if high[0][0] == '<UNK>' and len(high) == 1:
                    answer = choice(vocab)
                else:
                    answer = choices([i[0] for i in high], weights=[i[1] for i in high], k=1)[0]
            except:
                answer = choice(vocab)
        return answer
    
    def create_sentence_start(self):
        start_list = ['<BOS>']
        start_list.append(self.predict_next_word('<BOS>'))
        return start_list
    
    def generate_sentence(self):
        curr_sentence = self.create_sentence_start()
        last_word = curr_sentence[-1]
        while last_word != '<EOS>':
            new_last_word = self.predict_next_word(last_word)
            curr_sentence.append(new_last_word)
            last_word = new_last_word
        return curr_sentence
    
    def perplexity(self, token_list):
        perp = 1
        prob = 1
        V = len(token_list)
        for i in range(1,len(token_list)):
            bg = (token_list[i-1],token_list[i])
            if bg in self.bg_prob_dict.keys():
               prob += log2(self.bg_prob_dict[bg])
        perp = (-prob/V)*100
        self.perp = perp
        return perp
    

          

<h3>Trigram Model - Tokens</h3>

In [4]:
# Laplace Smoothing is set to default
class Trigram():
    def __init__(self):
        self.ngram = 3
        self.smoothing_type = 'Laplace'
        self.bigrams = {}
        self.trigrams = {}
        self.tg_prob_dict = {}
        self.n_unigrams = defaultdict(int)
        self.n_bigrams = defaultdict(lambda: defaultdict(int))
        self.n_trigrams = defaultdict(lambda: defaultdict(int))
        self.len = []
        self.perp = 0
        
    def fit(self, token_list):
        for i in range(1, len(token_list)):
            curr_tg = (token_list[i-2], token_list[i-1],  token_list[i])
            self.n_trigrams[(curr_tg[0],curr_tg[1])][curr_tg[2]] += 1
            self.n_bigrams[curr_tg[0]][curr_tg[1]] += 1
            self.n_unigrams[curr_tg[0]] += 1

    def update_len(self, token_list):
        for i in range(1, len(token_list)):
            trigram = (token_list[i-2],token_list[i-1], token_list[i])
            self.len.append(self.n_trigrams[(trigram[0],trigram[1])][trigram[2]])
        n_tg = {}
        for token in self.len:
            if token in n_tg:
               n_tg[token] += 1
            else:
               n_tg[token] = 1
        return n_tg
    
    def get_trigram_prob(self, token_list):
        pstar, counter = 0, 0
        for i in range(1, len(token_list)):
            trigram = (token_list[i-2],token_list[i-1], token_list[i])

            w1, w2, w3 = trigram[0], trigram[1], trigram[2]
            if self.smoothing_type == 'Laplace' or self.smoothing_type == 'Add 1':
                pstar = 1.0 * self.n_trigrams[(w1,w2)][w3] / self.n_bigrams[w1][w2]
                self.tg_prob_dict[trigram] = pstar
            elif self.smoothing_type == 'Good-Turing Discounting':
                c = self.len[self.n_trigrams[(trigram[0],trigram[1])][trigram[2]]] 
                n_next = c + 1 
                cstar = (self.n_trigrams[(trigram[0],trigram[1])][trigram[2]] + 1) * n_next  / c
                pstar = cstar/len(self.len)
                self.tg_prob_dict[trigram] = pstar
        if self.smoothing_type == 'Good-Turing Discounting':
            self.update_len(token_list)
        return self.tg_prob_dict

    def predict_next_word(self, last_word):
        next_word = {}
        for k in self.tg_prob_dict:
            if [k[0], k[1]] == last_word:
                next_word[k[2]] = self.tg_prob_dict[k]
        k = Counter(next_word)
        high = k.most_common()
        answer = ''
        while answer == '':
            if len(high) == 0 or high[0][0] == '<UNK>':
                answer = choice(vocab)
                return answer
            else:
                answer = choices([i[0] for i in high], weights=[i[1] for i in high], k=1)[0]
                return answer

    
    def create_sentence_start(self):
        start_list = ['<BOS>']
        bg = Bigram()
        bg.smoothing_type = 'Laplace'
        bg.fit(training_list)
        bg.bigrams = bg.get_bigram_prob(training_list)
        start_list.append(bg.predict_next_word('<BOS>'))
        return start_list
    
    def generate_sentence(self):
        curr_sentence = self.create_sentence_start()
        last_word = curr_sentence
        while last_word != '<EOS>':
            new_last_word = self.predict_next_word(last_word)
            curr_sentence.append(new_last_word)
        return curr_sentence
        
    def perplexity(self, token_list):
        perp = 1
        prob = 1
        V = len(token_list)
        for i in range(2, len(token_list)):
            tg = (token_list[i-2],token_list[i-1], token_list[i])
            if tg in self.tg_prob_dict.keys():
               prob += log2(self.tg_prob_dict[tg])
        perp = (prob*100/V)*-1
        self.perp = perp
        return perp

<h3>Creating and Testing the models</h3>

In [5]:
# Creating a bigram with laplace smoothing and one with good-turing discounting
laplace_bigram, good_turing_bigram = Bigram(), Bigram()
laplace_bigram.smoothing_type, good_turing_bigram.smoothing_type = 'Laplace', 'Good-Turing Discounting'
laplace_bigram.fit(training_list)
laplace_bigram.bigrams = laplace_bigram.get_bigram_prob(training_list)
laplace_bigram.perplexity(test_list)
 
good_turing_bigram.fit(training_list)
good_turing_bigram.update_len(training_list)
good_turing_bigram.bigrams = good_turing_bigram.get_bigram_prob(training_list)
good_turing_bigram.perplexity(test_list)   

# Creating a trigram with laplace smoothing and one with good-turing discounting
laplace_trigram, good_turing_trigram = Trigram(), Trigram()
laplace_trigram.smoothing_type, good_turing_trigram.smoothing_type = 'Laplace', 'Good-Turing Discounting'
laplace_trigram.fit(training_list)
laplace_trigram.trigrams = laplace_trigram.get_trigram_prob(training_list)
laplace_trigram.perplexity(test_list)
good_turing_trigram.fit(training_list)
good_turing_trigram.update_len(training_list)
good_turing_trigram.trigrams = good_turing_trigram.get_trigram_prob(training_list)
good_turing_trigram.perplexity(test_list)
print('Results:')
print('Bigram with Laplace Smoothing perplexity: ' + str(round(laplace_bigram.perp,2)))
print('Bigram with Good-Turing Discounting Smoothing perplexity: ' + str(round(good_turing_bigram.perp,2)))
print('Trigram with Laplace Smoothing perplexity: ' + str(round(laplace_trigram.perp,2)))
print('Trigram with Good-Turing Discounting Smoothing perplexity: ' + str(round(good_turing_trigram.perp,2)))
print('\nThe Trigram model with Laplace Smoothing has the least perplexity and thus is the better performance\n')

print([laplace_bigram.generate_sentence(),laplace_bigram.generate_sentence(), laplace_bigram.generate_sentence()])

Results:
Bigram with Laplace Smoothing perplexity: 326.63
Bigram with Good-Turing Discounting Smoothing perplexity: 1098.92
Trigram with Laplace Smoothing perplexity: 122.19
Trigram with Good-Turing Discounting Smoothing perplexity: 531.95

The Trigram model with Laplace Smoothing has the least perplexity and thus is the better performance

[['<BOS>', 'New', 'York', 'Stock', 'Exchange', 'chairman', ',', 'it', 'with', 'a', 'bad', 'day', "'s", 'office', 'during', 'World', 'Bank', 'of', 'all', ',', 'but', 'would', 'want', 'the', 'provision', ',', 'where', 'the', 'firm', 'and', 'gave', 'up', 'that', '``', 'Markey', ',', "''", '<EOS>'], ['<BOS>', '``', 'designed', '*-2', 'slow', 'program', 'trading', '.', '<EOS>'], ['<BOS>', 'They', 'say', '0', 'it', 'a', 'deal', ',', 'the', 'sixth', 'consecutive', 'months', 'of', '$', '2.2', 'billion', '.', '<EOS>']]


<h2>Language Modeling with N_Grams - Characters</h2>

<h3>Prepairing training and testing data</h3>

In [6]:
vima_training_names = ['VIMA001.TXT', 'VIMA002.TXT', 'VIMA003.TXT', 'VIMA004.TXT', 'VIMA005.TXT', 'VIMA006.TXT', 'VIMA007.TXT','VIMA008.TXT','VIMA009.TXT','VIMA0010.TXT']
vima_testing_names = ['VIMA011.TXT', 'VIMA012.TXT']

def extract_vima_text(name_list):
    vima_texts = [(open('../Assignment-1/assignment1textfiles/sbd/' + i, encoding="utf8").read().split('\n')) for i in os.listdir('../Assignment-1/assignment1textfiles/sbd') if i in name_list]

    end_list = []
    for text in vima_texts:
        for token in text:
            # Adding symbols for start and end of token
            new_token = list(token)
            new_token.insert(0,'@')
            new_token.append('~')
            end_list.append(new_token)
            
    # Unpack lists
    end_list=unpack_lists(end_list)
    # Makes no difference
    # print(len(end_list))
    # replace_low_freq_tokens(end_list, 5, '*')
    # print(len(end_list))
    return end_list

vima_training_list = extract_vima_text(vima_training_names)
vima_testing_list = extract_vima_text(vima_testing_names)

<h3>Trigram Model - Characters</h3>

In [7]:
# Triagram class given as declared above is sufficient
char_trigram_laplace = Trigram()
char_trigram_laplace.smoothing_type = 'Laplace'
char_trigram_laplace.fit(vima_training_list)
char_trigram_laplace.trigrams = char_trigram_laplace.get_trigram_prob(vima_training_list)
char_trigram_laplace.perplexity(vima_testing_list)

char_trigram_gt = Trigram()
char_trigram_gt.smoothing_type = 'Good-Turing Discounting'
char_trigram_gt.fit(vima_training_list)
char_trigram_gt.update_len(vima_training_list)
char_trigram_gt.trigrams = char_trigram_gt.get_trigram_prob(vima_training_list)
char_trigram_gt.perplexity(vima_testing_list)

print('Character Trigram with Laplace Smoothing perplexity: ' + str(round(char_trigram_laplace.perp,2)))
print('Character Trigram with Good-Turing Discounting Smoothing perplexity: ' + str(round(char_trigram_gt.perp,2)))

Character Trigram with Laplace Smoothing perplexity: 226.32
Character Trigram with Good-Turing Discounting Smoothing perplexity: 923.89


<h3>Four-Gram Model - Characters</h3>

In [8]:
class Fourgram():
    def __init__(self):
        self.ngram = 4
        self.smoothing_type = 'Laplace'
        self.bigrams = {}
        self.trigrams = {}
        self.fourgrams = {}
        self.fg_prob_dict = {}
        self.n_unigrams = defaultdict(int)
        self.n_bigrams = defaultdict(lambda: defaultdict(int))
        self.n_trigrams = defaultdict(lambda: defaultdict(int))
        self.n_fourgrams = defaultdict(lambda: defaultdict(int))
        self.len = []
        self.perp = 0
        
    def fit(self, token_list):
        for i in range(1, len(token_list)):
            curr_fg = (token_list[i-3], token_list[i-2],  token_list[i-1],  token_list[i])
            self.n_fourgrams[(curr_fg[0],curr_fg[1],curr_fg[2])][curr_fg[3]] += 1
            self.n_trigrams[(curr_fg[0],curr_fg[1])][curr_fg[2]] += 1
            self.n_bigrams[curr_fg[0]][curr_fg[1]] += 1
            self.n_unigrams[curr_fg[0]] += 1

    def update_len(self, token_list):
        for i in range(1, len(token_list)):
            fourgram = (token_list[i-3], token_list[i-2],token_list[i-1], token_list[i])
            self.len.append(self.n_fourgrams[(fourgram[0],fourgram[1], fourgram[2])][fourgram[3]])
        n_fg = {}
        for token in self.len:
            if token in n_fg:
               n_fg[token] += 1
            else:
               n_fg[token] = 1
        return n_fg
    
    def get_fourgram_prob(self, token_list):
        pstar, counter = 0, 0
        for i in range(1, len(token_list)):
            fourgram = (token_list[i-3], token_list[i-2],token_list[i-1], token_list[i])
            w1, w2, w3, w4 = fourgram[0], fourgram[1], fourgram[2], fourgram[3]
            if self.smoothing_type == 'Laplace' or self.smoothing_type == 'Add 1':
                pstar = 1.0 * self.n_fourgrams[(w1,w2,w3)][w4] / self.n_trigrams[(w1,w2)][w3]
                self.fg_prob_dict[fourgram] = pstar
            elif self.smoothing_type == 'Good-Turing Discounting':
                c = self.len[self.n_fourgrams[(w1,w2,w3)][w4]] 
                n_next = c + 1 
                cstar = (self.n_fourgrams[(w1,w2,w3)][w4] + 1) * n_next  / c
                pstar = cstar/len(self.len)
                self.fg_prob_dict[fourgram] = pstar
        if self.smoothing_type == 'Good-Turing Discounting':
            self.update_len(token_list)
        return self.fg_prob_dict
        
    def perplexity(self, token_list):
        perp = 1
        prob = 1
        V = len(token_list)
        for i in range(3, len(token_list)):
            fg = (token_list[i-3], token_list[i-2],token_list[i-1], token_list[i])
            if fg in self.fg_prob_dict.keys():
               prob += log2(self.fg_prob_dict[fg])
        perp = (prob*100/V)*-1
        self.perp = perp
        return perp

In [9]:
char_trigram_laplace = Trigram()
char_trigram_laplace.smoothing_type = 'Laplace'
char_trigram_laplace.fit(vima_training_list)
char_trigram_laplace.trigrams = char_trigram_laplace.get_trigram_prob(vima_training_list)
char_trigram_laplace.perplexity(vima_testing_list)

char_trigram_gt = Trigram()
char_trigram_gt.smoothing_type = 'Good-Turing Discounting'
char_trigram_gt.fit(vima_training_list)
char_trigram_gt.update_len(vima_training_list)
char_trigram_gt.trigrams = char_trigram_gt.get_trigram_prob(vima_training_list)
char_trigram_gt.perplexity(vima_testing_list)


char_fgram_laplace = Fourgram()
char_fgram_laplace.smoothing_type = 'Laplace'
char_fgram_laplace.fit(vima_training_list)
char_fgram_laplace.trigrams = char_fgram_laplace.get_fourgram_prob(vima_training_list)
char_fgram_laplace.perplexity(vima_testing_list)


char_fgram_gt = Fourgram()
char_fgram_gt.smoothing_type = 'Good-Turing Discounting'
char_fgram_gt.fit(vima_training_list)
char_fgram_gt.update_len(vima_training_list)
char_fgram_gt.fourgrams = char_fgram_gt.get_fourgram_prob(vima_training_list)
char_fgram_gt.perplexity(vima_testing_list)

print('Character Trigram with Laplace Smoothing perplexity: ' + str(round(char_trigram_laplace.perp,2)))
print('Character Trigram with Good-Turing Discounting Smoothing perplexity: ' + str(round(char_trigram_gt.perp,2)))

print('4-gram with Laplace Smoothing perplexity: ' + str(round(char_fgram_laplace.perp,2)))
print('4-gram Trigram with Good-Turing Discounting Smoothing perplexity: ' + str(round(char_fgram_gt.perp,2)))

print('\n4-grams with Laplace smoothing seem to have the best performance.')

Character Trigram with Laplace Smoothing perplexity: 226.32
Character Trigram with Good-Turing Discounting Smoothing perplexity: 923.89
4-gram with Laplace Smoothing perplexity: 148.32
4-gram Trigram with Good-Turing Discounting Smoothing perplexity: 964.05

4-grams with Laplace smoothing seem to have the best performance.


<h2>Predicting Next Charachter with Neural Networks</h2>

In [10]:
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F

<h3>Utils and Data Preparation</h3>

In [11]:
# Utility functions for our NNs

# chars to ints - outputs np.ndarray object
def encode_strs(str):
    chars = tuple(set(str))
    int2char = dict(enumerate((chars)))
    char2int = {ch: ii for ii, ch in int2char.items()}
    encoded = np.array([char2int[ch] for ch in str])     
    return encoded

# one hot encoder to encode our array of nums
def one_hot_encode(arr, n_labels):
    one_hot = np.zeros((arr.size, n_labels), dtype = np.float32)
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    return one_hot

# Util function to create batches
def get_batches(arr, batch_size, seq_length):
    batch_size_total = batch_size*seq_length
    n_batches = len(arr)//batch_size_total
    
    arr = arr[:n_batches*batch_size_total]
    arr = arr.reshape((batch_size, -1))
    
    for n in range(0, arr.shape[1], seq_length):
        x = arr[:, n:n+seq_length]
        y = np.zeros_like(x)
        try:
            y[:,:-1], y[:,-1] = x[:,1:], arr[:, n+seq_length]
        # Accounting for indexerror
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y 
        
# Prepairing the data
training_data = ''.join(extract_vima_text(vima_training_names[:8]))
validation_data = ''.join(extract_vima_text(vima_training_names[8:]))
test_data = ''.join(extract_vima_text(vima_testing_names))

training_data, validation_data, test_data = encode_strs(training_data), encode_strs(validation_data), encode_strs(test_data)
print(training_data[:10])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[27  1  5 83  5 90 63 90 69 93]


<h3>LSTM for character prediction</h3>

In [12]:
class FF_NET(nn.Module):
    def __init__(self, tokens, n_hidden=256, n_layers=10, drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        self.linear = nn.Linear(len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, len(self.chars))
        
    def forward(self, x, hidden):
        r_output, hidden = self.linear(x, hidden)

        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        
        hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(), weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

class LSTM_RNN(nn.Module):
    def __init__(self, tokens, n_hidden=256, n_layers=10, drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, len(self.chars))
        
    def forward(self, x, hidden):
        r_output, hidden = self.lstm(x, hidden)
        out = self.dropout(r_output)
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        
        hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(), weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

In [13]:
# train a net
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    
    net.train()
    opt = torch.optim.Adam(net.parameters(), lr = lr)
    criterion = nn.CrossEntropyLoss()
    
#     val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[0], data[1]
    
    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            
            counter += 1
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            h = tuple([each.data for each in h])

            net.zero_grad()
            
            output, h = net(inputs, h)
            
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            # print(counter%print_every)
            # loss stats
            if counter % print_every == 0:

                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()

                for x, y in get_batches(val_data, batch_size, seq_length):
                    
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    val_h = tuple([each.data for each in val_h])
                    inputs, targets = x, y
                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length).long())

                    val_losses.append(val_loss.item())
                
                net.train()
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                    # "Val Loss: {:.4f}".format(np.mean(val_losses)),
                      "Perplexity: {:.4f}...".format(exp(loss.item())))

# Predict given a net
def predict(net, char):
    x = np.array([net.char2int[str(char)[0]]])
    x = one_hot_encode(x, len(net.chars))
    inputs = torch.from_numpy(x)
    
    h = tuple([each.data for each in h])
    out, h = net(inputs, h)
    
    p = F.softmax(out, dim =1).data

    top_choice = np.arange(len(net.chars))
        
    p = p.numpy().squeeze()
    char = np.random.choice(top_choice, p=p/p.sum())
    
    return net.int2char[char], h


In [14]:
n_layers=10
# setting the size of layers to 512 to experiment with performance
n_hidden=512
batch_size = 128
seq_length = 100
n_epochs = 10
net = LSTM_RNN(training_data, n_hidden, n_layers)

In [15]:
train(net, [training_data,validation_data], epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=2)

Epoch: 1/10... Step: 2... Loss: 10.9370... Perplexity: 56219.7368...
Epoch: 1/10... Step: 4... Loss: 9.1261... Perplexity: 9191.9850...
Epoch: 2/10... Step: 6... Loss: 6.9943... Perplexity: 1090.3958...
Epoch: 2/10... Step: 8... Loss: 5.1973... Perplexity: 180.7847...
Epoch: 3/10... Step: 10... Loss: 4.2179... Perplexity: 67.8913...
Epoch: 3/10... Step: 12... Loss: 3.6562... Perplexity: 38.7148...
Epoch: 4/10... Step: 14... Loss: 3.4761... Perplexity: 32.3335...
Epoch: 4/10... Step: 16... Loss: 3.4244... Perplexity: 30.7057...
Epoch: 5/10... Step: 18... Loss: 3.5254... Perplexity: 33.9688...
Epoch: 5/10... Step: 20... Loss: 3.4893... Perplexity: 32.7615...
Epoch: 6/10... Step: 22... Loss: 3.4824... Perplexity: 32.5376...
Epoch: 6/10... Step: 24... Loss: 3.3653... Perplexity: 28.9435...
Epoch: 7/10... Step: 26... Loss: 3.3738... Perplexity: 29.1903...
Epoch: 7/10... Step: 28... Loss: 3.3495... Perplexity: 28.4879...
Epoch: 8/10... Step: 30... Loss: 3.4105... Perplexity: 30.2798...
Epoch

In [16]:
# storing and retrieving model
checkpoint = {
    'n_hidden': net.n_hidden,
    'n_layers': net.n_layers,
    'state_dict': net.state_dict(),
    'tokens' : net.chars
}

with open('char_predict_lstm.net', 'wb') as f:
    torch.save(checkpoint, f)

with open('char_predict_lstm.net', 'rb') as f:
    checkpoint = torch.load(f)
    
loaded = LSTM_RNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>