In [34]:
import pandas as pd
import numpy as np
import string, os
import warnings

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import math

from torch.optim import Adam

torch.manual_seed(0)

warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [35]:
curr_dir = './comments/'
all_headlines = []
for filename in os.listdir(curr_dir):
    if 'ArticlesApril2017' in filename:
        article_df = pd.read_csv(curr_dir + filename)
        all_headlines.extend(list(article_df.headline.values))
        break

In [36]:
all_headlines = [line for line in all_headlines if line!= "Unknown"]
print(all_headlines[:10])

['Finding an Expansive View  of a Forgotten People in Niger', 'And Now,  the Dreaded Trump Curse', 'Venezuela’s Descent Into Dictatorship', 'Stain Permeates Basketball Blue Blood', 'Taking Things for Granted', 'The Caged Beast Awakens', 'An Ever-Unfolding Story', 'O’Reilly Thrives as Settlements Add Up', 'Mouse Infestation', 'Divide in G.O.P. Now Threatens Trump Tax Plan']


In [37]:
def clean_text(txt):
    txt = "".join(t for t in txt if t not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt

In [38]:
corpus = [clean_text(x) for x in all_headlines]
print(corpus[:14])

['finding an expansive view  of a forgotten people in niger', 'and now  the dreaded trump curse', 'venezuelas descent into dictatorship', 'stain permeates basketball blue blood', 'taking things for granted', 'the caged beast awakens', 'an everunfolding story', 'oreilly thrives as settlements add up', 'mouse infestation', 'divide in gop now threatens trump tax plan', 'variety puzzle acrostic', 'they can hit a ball 400 feet but play catch thats tricky', 'in trump country shock at trump budget cuts', 'why is this hate different from all other hate']


In [39]:
from torchtext.data import get_tokenizer
tokenizer = get_tokenizer("basic_english")

In [40]:
tokenizer(corpus[1])

['and', 'now', 'the', 'dreaded', 'trump', 'curse']

In [41]:
tokenizer(corpus[2])

['venezuelas', 'descent', 'into', 'dictatorship']

In [42]:
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

In [43]:
from torchtext.vocab import build_vocab_from_iterator
vocab = build_vocab_from_iterator(yield_tokens(corpus), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

In [44]:
def get_sequence_of_token(corpus):
    input_sequnces = []
    for line in corpus:
        token_list = tokenizer(line)
        for i in range(0,len(token_list)):
            n_gram_sequence = token_list[i:i+4]
            if len(n_gram_sequence) > 3:
                _n_gram_sequence  = vocab(n_gram_sequence)
                input_sequnces.append(_n_gram_sequence)
    return input_sequnces

In [45]:
vocab(['trump', 'descent', 'into', 'dictatorship'])

[11, 1104, 148, 1110]

In [46]:
inp_seq = get_sequence_of_token(corpus=corpus)

In [47]:
inp_seq[:5]

[[185, 18, 1219, 651],
 [18, 1219, 651, 5],
 [1219, 651, 5, 3],
 [651, 5, 3, 1289],
 [5, 3, 1289, 203]]

In [48]:
total_words = len(vocab)
total_words

2423

In [49]:
def to_categorical(y, num_classes):
    """ 1-hot encodes a tensor """
    return np.eye(num_classes, dtype='float32')[y]

In [50]:
def pad_sequences_torch(arr,maxlen):
    seq_len = len(arr)
    padding = (maxlen-seq_len,0)
    pad = torch.nn.ZeroPad2d(padding)
    return pad(torch.tensor(arr))

In [51]:
def generate_padded_sequences(inp_seq):
    input_sequences = np.array(inp_seq)
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = to_categorical(label, num_classes=total_words)
    return predictors, label

In [52]:
test_len = round(len(inp_seq) * 0.2)
test_len

639

In [53]:
class CreateDataset(Dataset):
    def __init__(self, inp_seq):
        self.input_sequences = np.array(inp_seq)
         
    def __getitem__(self, index):
        fetures, label = self.input_sequences[index,:-1],self.input_sequences[index,-1]
        # label = to_categorical(label, num_classes=total_words)
        return fetures, label
    
    def __len__(self):
        return len(self.input_sequences)

In [54]:
train_dataset = CreateDataset(inp_seq=inp_seq)
train_loader = DataLoader(train_dataset, shuffle=True)

In [55]:
import random
random.seed(12)

In [56]:
test_dataset = CreateDataset(inp_seq=random.sample(inp_seq,test_len))
test_loader = DataLoader(test_dataset, shuffle=True)

In [57]:
example = iter(test_loader)
feature, label = next(example)

In [58]:
feature

tensor([[343,   4, 790]], dtype=torch.int32)

In [59]:
label

tensor([18], dtype=torch.int32)

In [63]:
a = nn.Embedding(total_words, 10)
a

Embedding(2423, 10)

In [64]:
inp_seq[0]

[185, 18, 1219, 651]

In [65]:
vocab.lookup_tokens([185, 18, 1219, 651])

['finding', 'an', 'expansive', 'view']

In [69]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [70]:
# train_on_gpu = True

In [71]:
# import torch.nn as nn

# class RNN(nn.Module):
    
#     def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.3):
#         super(RNN, self).__init__()
             
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)

#         self.vocab_size = vocab_size
#         self.output_size = output_size
#         self.embedding_dim = embedding_dim
#         self.hidden_dim = hidden_dim
#         self.n_layers = n_layers
        
#         self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True,bidirectional=False)
     
#         self.fc = nn.Linear(hidden_dim, output_size)

#         self.softmax = nn.Softmax(dim=1)
    
    
#     def forward(self, x, hidden1):
#         x=x.long()
        
#         embeds = self.embedding(x)

#         lstm_out, hidden1 = self.lstm1(embeds,hidden1)

#         out  = lstm_out[:,-1,:]
#         out = self.fc(out)

#         # out = self.softmax(out)
        

#         return out, hidden1
    
#     def init_hidden(self,batch_size):
#         hidden =  (torch.zeros(self.n_layers,batch_size,self.hidden_dim).to(device),
#         torch.zeros(self.n_layers,batch_size,self.hidden_dim).to(device))
#         return hidden

In [72]:
# def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden1):
   
#     h1 = tuple([each.data for each in hidden1])
#     rnn.zero_grad()
#     rnn.to(device)
  
#     inputs, targets = inp.to(device), target.to(device)
    
#     output, h1 = rnn(inputs, h1)
    
    
  
#     flag = torch.argmax(output).item() == torch.argmax(targets)
#     # print(flag.item())
#     # print(output.size())
#     # print(.size())
#     loss = criterion(output, targets.reshape(1,-1).softmax(dim=1))
#     loss.backward()
    
#     nn.utils.clip_grad_norm_(rnn.parameters(), 5)
#     optimizer.step()

#     return loss.item(), h1, flag

In [73]:
# def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100):
#     batch_losses = []
    
#     rnn.train()
#     train_len = len(train_loader)
#     print("Training for %d epoch(s)..." % n_epochs)
#     print("Train len: " + str(train_len))

#     for epoch_i in range(1, n_epochs + 1):
#         acc = 0
#         hidden1 = rnn.init_hidden(batch_size)
#         for batch_i, (inputs, labels) in enumerate(train_loader, 1):
#             loss, hidden1, flag = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden1)  
#             if flag:
#                 acc += 1        
#             batch_losses.append(loss)

       
#         print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
#                     epoch_i, n_epochs, np.average(batch_losses)))
#         batch_losses = []
#         print(acc)
#         print(train_len)
#         print(f'Epoch: {epoch_i}, accuracy: {(acc/train_len)*100}')
#     return rnn

In [74]:
# sequence_length = 4  # of words in a sequence
# # Batch Size
# batch_size = 1

In [75]:
# num_epochs = 50
# # Learning Rate
# learning_rate = 0.001

# # Model parameters
# # Vocab size
# vocab_size = len(vocab)
# # Output size
# output_size = vocab_size
# # Embedding Dimension
# embedding_dim = 1024
# # Hidden Dimension
# hidden_dim = 1024
# # Number of RNN Layers
# n_layers = 2

# show_every_n_batches = 2000

In [76]:
# rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
# print(rnn)
# if train_on_gpu:
#     rnn.to(device)

# optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001)
# criterion = nn.CrossEntropyLoss()

# # training the model
# trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

New model

In [77]:
torch.cuda.is_available()

True

In [78]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, 
                tie_weights):
                
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, 
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
        if tie_weights:
            assert embedding_dim == hidden_dim, 'cannot tie, check dims'
            self.embedding.weight = self.fc.weight
        self.init_weights()

    def forward(self, src, hidden):
        embedding = self.embedding(src)
        output, hidden = self.lstm(embedding, hidden)          
        output = self.dropout(output) 
        prediction = self.fc(output)
        return prediction, hidden
    
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hidden_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.embedding_dim,
                    self.hidden_dim).uniform_(-init_range_other, init_range_other) 
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hidden_dim, 
                    self.hidden_dim).uniform_(-init_range_other, init_range_other) 

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        cell = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

In [79]:
vocab_size = len(vocab)
embedding_dim = 128            
hidden_dim = 128               
num_layers = 3             
dropout_rate = 0.15          
tie_weights = True                  
lr = 1e-3

In [80]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

In [81]:
vocab_size

2423

In [82]:
len(train_loader)

3193

In [107]:
model = LSTM(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 708,855 trainable parameters


In [108]:
def train(model, optimizer, criterion, clip, device):
    
    epoch_loss = 0
    correct = 0
    model.train()
   

    hidden = model.init_hidden(1, device)
    for batch_i, (inputs, labels) in enumerate(train_loader, 1):
    
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)
        
        src, target = inputs, labels
        src, target = src.to(device), target.to(device)
    
      
        prediction, hidden = model(src, hidden) 
        prediction  = prediction[:,-1,:]              
       
        target = target.reshape(-1).to(torch.int64)
       
        correct += (target == torch.argmax(prediction,dim=1)).sum()
      
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

    print(correct)
    print(f'Accuracy: {(100 * (correct /len(train_loader)))}')
    return epoch_loss

In [113]:
def evaluate(model, criterion, device):

    epoch_loss = 0
    model.eval()

    hidden = model.init_hidden(1, device)
    correct =0

    with torch.no_grad():
        for batch_i, (inputs, labels) in enumerate(test_loader, 1):
            hidden = model.detach_hidden(hidden)
            src, target = inputs, labels
            src, target = src.to(device), target.to(device)
         
            prediction, hidden = model(src, hidden) 
            prediction  = prediction[:,-1,:]              
            
            target = target.reshape(-1).to(torch.int64)

            correct += (target == torch.argmax(prediction,dim=1)).sum()
           
            loss = criterion(prediction, target)
            epoch_loss += loss.item()
            
    print(correct)
    print(f'Accuracy: {(100 * (correct /len(test_loader)))}')
    return epoch_loss

In [110]:
device

'cuda:0'

In [111]:

n_epochs = 100
seq_len = 3
clip = 0.25
saved = False

# lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.3, patience=20)

if saved:
    model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
    test_loss = evaluate(model, criterion, device)
    print(f'Test Perplexity: {math.exp(test_loss):.3f}')
else:
    best_valid_loss = float('inf')

    for epoch in range(n_epochs):
        train_loss = train(model, optimizer, criterion, clip, device)
        # valid_loss = evaluate_new(model, None, criterion, batch_size, 
        #             seq_len, device)
        
        # lr_scheduler.step(valid_loss)

        # if valid_loss < best_valid_loss:
        #     best_valid_loss = valid_loss
        #     torch.save(model.state_dict(), 'best-val-lstm_lm.pt')
        try:
            print(f'\tTrain Perplexity: {math.exp(train_loss):.7f}')
            # print(f'\tValid Perplexity: {math.exp(valid_loss):.7f}')
        except OverflowError:
            print(f'\tTrain Perplexity: {math.inf}')
            # print(f'\tValid Perplexity: {math.inf}')

tensor(93, device='cuda:0')
Accuracy: 2.91262149810791
	Train Perplexity: inf
tensor(113, device='cuda:0')
Accuracy: 3.5389914512634277
	Train Perplexity: inf
tensor(119, device='cuda:0')
Accuracy: 3.726902484893799
	Train Perplexity: inf
tensor(117, device='cuda:0')
Accuracy: 3.6642656326293945
	Train Perplexity: inf
tensor(132, device='cuda:0')
Accuracy: 4.134043216705322
	Train Perplexity: inf
tensor(162, device='cuda:0')
Accuracy: 5.073598384857178
	Train Perplexity: inf
tensor(158, device='cuda:0')
Accuracy: 4.948324680328369
	Train Perplexity: inf
tensor(177, device='cuda:0')
Accuracy: 5.5433759689331055
	Train Perplexity: inf
tensor(205, device='cuda:0')
Accuracy: 6.420294284820557
	Train Perplexity: inf
tensor(227, device='cuda:0')
Accuracy: 7.109301567077637
	Train Perplexity: inf
tensor(257, device='cuda:0')
Accuracy: 8.048856735229492
	Train Perplexity: inf
tensor(276, device='cuda:0')
Accuracy: 8.643908500671387
	Train Perplexity: inf
tensor(301, device='cuda:0')
Accuracy: 

In [114]:
valid_loss = evaluate(model, criterion, device)

tensor(562, device='cuda:0')
Accuracy: 87.94992065429688


In [122]:
def predict_next_word(str_original):
    str = tokenizer(clean_text(str_original))
    print(str)
    hidden = model.init_hidden(1, device)
    with torch.no_grad():  
        hidden = model.detach_hidden(hidden)
        src = torch.tensor([vocab(str)]).to(torch.int32).to(device)
        prediction, hidden = model(src, hidden) 
        prediction  = prediction[:,-1,:]              
        print(f'{str_original} --> {vocab.lookup_tokens([torch.argmax(prediction,dim=1).item()])[0]}')


In [123]:
predict_next_word('search of king')

['search', 'of', 'king']
search of king --> solomons


In [124]:
predict_next_word('A National Civics')

['a', 'national', 'civics']
A National Civics --> exam


In [126]:
predict_next_word('Picks for Education')

['picks', 'for', 'education']
Picks for Education --> dept


In [127]:
predict_next_word(' Center of the')

['center', 'of', 'the']
 Center of the --> universe


In [128]:
predict_next_word('Supreme Court as')

['supreme', 'court', 'as']
Supreme Court as --> tragic
