In [132]:
###########

In [108]:
import pandas as pd
import numpy as np
import string, os
import warnings

import torch

warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [109]:
curr_dir = './comments/'
all_headlines = []
for filename in os.listdir(curr_dir):
    if 'Articles' in filename:
        article_df = pd.read_csv(curr_dir + filename)
        all_headlines.extend(list(article_df.headline.values))
        break

In [110]:
all_headlines = [line for line in all_headlines if line!= "Unknown"]
print(all_headlines[:10])

['Finding an Expansive View  of a Forgotten People in Niger', 'And Now,  the Dreaded Trump Curse', 'Venezuela’s Descent Into Dictatorship', 'Stain Permeates Basketball Blue Blood', 'Taking Things for Granted', 'The Caged Beast Awakens', 'An Ever-Unfolding Story', 'O’Reilly Thrives as Settlements Add Up', 'Mouse Infestation', 'Divide in G.O.P. Now Threatens Trump Tax Plan']


In [111]:
def clean_text(txt):
    txt = "".join(t for t in txt if t not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt

In [112]:
corpus = [clean_text(x) for x in all_headlines]
print(corpus[:14])

['finding an expansive view  of a forgotten people in niger', 'and now  the dreaded trump curse', 'venezuelas descent into dictatorship', 'stain permeates basketball blue blood', 'taking things for granted', 'the caged beast awakens', 'an everunfolding story', 'oreilly thrives as settlements add up', 'mouse infestation', 'divide in gop now threatens trump tax plan', 'variety puzzle acrostic', 'they can hit a ball 400 feet but play catch thats tricky', 'in trump country shock at trump budget cuts', 'why is this hate different from all other hate']


In [113]:
from torchtext.data import get_tokenizer
tokenizer = get_tokenizer("basic_english")

In [114]:
tokenizer(corpus[1])

['and', 'now', 'the', 'dreaded', 'trump', 'curse']

In [115]:
tokenizer(corpus[2])

['venezuelas', 'descent', 'into', 'dictatorship']

In [116]:
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

In [117]:
from torchtext.vocab import build_vocab_from_iterator
vocab = build_vocab_from_iterator(yield_tokens(corpus), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

In [118]:
def get_sequence_of_token(corpus):
    input_sequnces = []
    for line in corpus:
        token_list = tokenizer(line)
        for i in range(0,len(token_list)):
            n_gram_sequence = token_list[i:i+4]
            if len(n_gram_sequence) > 3:
                _n_gram_sequence  = vocab(n_gram_sequence)
                input_sequnces.append(_n_gram_sequence)
    return input_sequnces

In [119]:
vocab(['trump', 'descent', 'into', 'dictatorship'])

[11, 1104, 148, 1110]

In [120]:
inp_seq = get_sequence_of_token(corpus=corpus)

In [121]:
inp_seq[:5]

[[185, 18, 1219, 651],
 [18, 1219, 651, 5],
 [1219, 651, 5, 3],
 [651, 5, 3, 1289],
 [5, 3, 1289, 203]]

In [122]:
total_words = len(vocab)
total_words

2423

In [123]:
def to_categorical(y, num_classes):
    """ 1-hot encodes a tensor """
    return np.eye(num_classes, dtype='float32')[y]

In [124]:
def pad_sequences_torch(arr,maxlen):
    seq_len = len(arr)
    padding = (maxlen-seq_len,0)
    pad = torch.nn.ZeroPad2d(padding)
    return pad(torch.tensor(arr))

In [125]:
def generate_padded_sequences(inp_seq):
    input_sequences = np.array(inp_seq)
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = to_categorical(label, num_classes=total_words)
    return predictors, label

In [126]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [127]:
len(inp_seq)

3193

In [128]:
class CreateDataset(Dataset):
    def __init__(self, inp_seq):
        self.input_sequences = np.array(inp_seq)
         
    def __getitem__(self, index):
        fetures, label = self.input_sequences[index,:-1],self.input_sequences[index,-1]
        label = to_categorical(label, num_classes=total_words)
        return fetures, label
    
    def __len__(self):
        return len(self.input_sequences)

In [129]:
train_dataset = CreateDataset(inp_seq=inp_seq)
train_loader = DataLoader(train_dataset, shuffle=True)

In [130]:
example = iter(train_loader)
feature, label = next(example)

In [131]:
feature[0]

tensor([195,  42,  22])

In [132]:
torch.argmax(label[0])

tensor(45)

In [133]:
import torch
from torch.optim import Adam
import torch.nn as nn

In [134]:
a = nn.Embedding(total_words, 10)
a

Embedding(2423, 10)

In [135]:
inp_seq[0]

[185, 18, 1219, 651]

In [136]:
import torch
import torch.nn as nn
import torch.nn.functional as F


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [137]:
train_on_gpu = False

In [138]:
import torch.nn as nn

class RNN(nn.Module):
    
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.3):
        super(RNN, self).__init__()
             
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.vocab_size = vocab_size
        self.output_size = output_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True,bidirectional=False)
     
        self.fc = nn.Linear(hidden_dim, output_size)

        self.softmax = nn.Softmax(dim=1)
    
    
    def forward(self, x, hidden1):
        x=x.long()
        
        embeds = self.embedding(x)

        lstm_out, hidden1 = self.lstm1(embeds,hidden1)

        out  = lstm_out[:,-1,:]
        out = self.fc(out)

        out = self.softmax(out)
        

        return out, hidden1
    
    def init_hidden(self,batch_size):
        hidden =  (torch.zeros(self.n_layers,batch_size,self.hidden_dim).to(device),
        torch.zeros(self.n_layers,batch_size,self.hidden_dim).to(device))
        return hidden

In [139]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden1):
   
    h1 = tuple([each.data for each in hidden1])
    rnn.zero_grad()
  
    inputs, targets = inp.to(device), target.to(device)
    
    output, h1 = rnn(inputs, h1)
    
    
  
    flag = torch.argmax(output).item() == torch.argmax(targets)
    # print(flag.item())
    loss = criterion(output, targets)
    loss.backward()
    
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)
    optimizer.step()

    return loss.item(), h1, flag

In [140]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100):
    batch_losses = []
    
    rnn.train()
    train_len = len(train_loader)
    print("Training for %d epoch(s)..." % n_epochs)
    print("Train len: " + str(train_len))

    for epoch_i in range(1, n_epochs + 1):
        acc = 0
        hidden1 = rnn.init_hidden(batch_size)
        for batch_i, (inputs, labels) in enumerate(train_loader, 1):
            loss, hidden1, flag = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden1)  
            if flag:
                acc += 1        
            batch_losses.append(loss)

       
        print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
        batch_losses = []
        print(acc)
        print(train_len)
        print(f'Epoch: {epoch_i}, accuracy: {(acc/train_len)*100}')
    return rnn

In [141]:
sequence_length = 4  # of words in a sequence
# Batch Size
batch_size = 1

In [142]:
num_epochs = 10
# Learning Rate
learning_rate = 0.001

# Model parameters
# Vocab size
vocab_size = len(vocab)
# Output size
output_size = vocab_size
# Embedding Dimension
embedding_dim = 100
# Hidden Dimension
hidden_dim = 150
# Number of RNN Layers
n_layers = 3

show_every_n_batches = 2000

In [79]:
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.1)
print(rnn)
if train_on_gpu:
    rnn.cuda()

optimizer = torch.optim.Adam(rnn.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# training the model
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

RNN(
  (embedding): Embedding(2423, 100)
  (lstm1): LSTM(100, 150, num_layers=3, batch_first=True, dropout=0.1)
  (fc): Linear(in_features=150, out_features=2423, bias=True)
  (softmax): Softmax(dim=1)
)
Training for 10 epoch(s)...
Train len: 3193


KeyboardInterrupt: 

In [88]:
import torch
import torch.nn as nn
import torch.optim as optim

import math

import torchtext

import datasets

from tqdm import tqdm
torch.manual_seed(0)

<torch._C.Generator at 0x14fd496d0>

In [146]:
dataset = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1')
print(dataset)
print(dataset['train'][88]['text'])

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})
 This ammunition , and that which I brought with me , was rapidly prepared for use at the Laboratory established at the Little Rock Arsenal for that purpose . As illustrating as the pitiful scarcity of material in the country , the fact may be stated that it was found necessary to use public documents of the State Library for cartridge paper . Gunsmiths were employed or conscripted , tools purchased or impressed , and the repair of the damaged guns I brought with me and about an equal number found at Little Rock commenced at once . But , after inspecting the work and observing the spirit of the men I decided that a garrison 500 strong could hold out against Fitch and that I would lead the remainder - about 1500 - to Gen 'l Rust as 

In [147]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}  
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], 
fn_kwargs={'tokenizer': tokenizer})
print(tokenized_dataset['train'][88]['tokens'])

['this', 'ammunition', ',', 'and', 'that', 'which', 'i', 'brought', 'with', 'me', ',', 'was', 'rapidly', 'prepared', 'for', 'use', 'at', 'the', 'laboratory', 'established', 'at', 'the', 'little', 'rock', 'arsenal', 'for', 'that', 'purpose', '.', 'as', 'illustrating', 'as', 'the', 'pitiful', 'scarcity', 'of', 'material', 'in', 'the', 'country', ',', 'the', 'fact', 'may', 'be', 'stated', 'that', 'it', 'was', 'found', 'necessary', 'to', 'use', 'public', 'documents', 'of', 'the', 'state', 'library', 'for', 'cartridge', 'paper', '.', 'gunsmiths', 'were', 'employed', 'or', 'conscripted', ',', 'tools', 'purchased', 'or', 'impressed', ',', 'and', 'the', 'repair', 'of', 'the', 'damaged', 'guns', 'i', 'brought', 'with', 'me', 'and', 'about', 'an', 'equal', 'number', 'found', 'at', 'little', 'rock', 'commenced', 'at', 'once', '.', 'but', ',', 'after', 'inspecting', 'the', 'work', 'and', 'observing', 'the', 'spirit', 'of', 'the', 'men', 'i', 'decided', 'that', 'a', 'garrison', '500', 'strong', 'co

In [148]:
print(tokenized_dataset['train'][4]['tokens'])

['the', 'game', 'began', 'development', 'in', '2010', ',', 'carrying', 'over', 'a', 'large', 'portion', 'of', 'the', 'work', 'done', 'on', 'valkyria', 'chronicles', 'ii', '.', 'while', 'it', 'retained', 'the', 'standard', 'features', 'of', 'the', 'series', ',', 'it', 'also', 'underwent', 'multiple', 'adjustments', ',', 'such', 'as', 'making', 'the', 'game', 'more', 'forgiving', 'for', 'series', 'newcomers', '.', 'character', 'designer', 'raita', 'honjou', 'and', 'composer', 'hitoshi', 'sakimoto', 'both', 'returned', 'from', 'previous', 'entries', ',', 'along', 'with', 'valkyria', 'chronicles', 'ii', 'director', 'takeshi', 'ozawa', '.', 'a', 'large', 'team', 'of', 'writers', 'handled', 'the', 'script', '.', 'the', 'game', "'", 's', 'opening', 'theme', 'was', 'sung', 'by', 'may', "'", 'n', '.']


In [154]:

vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], 
min_freq=3) 
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)            
vocab.set_default_index(vocab['<unk>'])   
print(len(vocab))                         
print(vocab.get_itos()[:10])   

29473
['<unk>', '<eos>', 'the', ',', '.', 'of', 'and', 'in', 'to', 'a']


In [256]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:
        if example['tokens']:                                      
            tokens = example['tokens'].append('<eos>')             
            tokens = [vocab[token] for token in example['tokens']] 
            data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size 
    data = data[:num_batches * batch_size]                       
    data = data.view(batch_size, num_batches)          
    return data

In [353]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data = get_data(tokenized_dataset['test'], vocab, batch_size)

In [354]:
train_data[:][0]

tensor([  10, 3872, 3888,  ...,   17, 9072,   63])

In [355]:
class LSTM_NEW(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, 
                tie_weights):
                
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, 
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
        if tie_weights:
            assert embedding_dim == hidden_dim, 'cannot tie, check dims'
            self.embedding.weight = self.fc.weight
        self.init_weights()

    def forward(self, src, hidden):
        embedding = self.dropout(self.embedding(src))
        output, hidden = self.lstm(embedding, hidden)          
        output = self.dropout(output) 
        prediction = self.fc(output)
        return prediction, hidden
    
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hidden_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.embedding_dim,
                    self.hidden_dim).uniform_(-init_range_other, init_range_other) 
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hidden_dim, 
                    self.hidden_dim).uniform_(-init_range_other, init_range_other) 

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        cell = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

In [356]:
vocab_size = len(vocab)
embedding_dim = 1024             # 400 in the paper
hidden_dim = 1024                # 1150 in the paper
num_layers = 2                   # 3 in the paper
dropout_rate = 0.65              
tie_weights = True                  
lr = 1e-3      

In [357]:
vocab_size

29473

In [358]:
model = LSTM_NEW(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 47,003,425 trainable parameters


In [359]:
def get_batch(data, seq_len, num_batches, idx):
    src = data[:, idx:idx+seq_len]                   
    target = data[:, idx+seq_len:idx+1+seq_len]             
    return src, target

In [360]:
get_batch(train_data, 3, 1,1)[0].size()[0]

128

In [383]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    correct = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):  # The last batch can't be a src
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, num_batches, idx)
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden) 
        prediction  = prediction[:,-1,:]              
        # prediction = prediction.reshape(batch_size*seq_len, -1)   
        target = target.reshape(-1)

        correct += (target == torch.argmax(prediction,dim=1)).float().sum()
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    print(correct)
    print(len(data))
    print(f'Accuracy: {(100 * (correct / len(data)))}')
    return epoch_loss / num_batches

In [384]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, num_batches, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [385]:
n_epochs = 50
seq_len = 3
clip = 0.25
saved = False

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

if saved:
    model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
    test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
    print(f'Test Perplexity: {math.exp(test_loss):.3f}')
else:
    best_valid_loss = float('inf')

    for epoch in range(n_epochs):
        train_loss = train(model, valid_data, optimizer, criterion, 
                    batch_size, seq_len, clip, device)
        # valid_loss = evaluate(model, valid_data, criterion, batch_size, 
        #             seq_len, device)
        
        # lr_scheduler.step(valid_loss)

        # if valid_loss < best_valid_loss:
        #     best_valid_loss = valid_loss
        #     torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

        print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
        # print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                           

tensor(12263.)
128
Accuracy: 9580.46875
	Train Perplexity: 397.413


                                                           

tensor(12860.)
128
Accuracy: 10046.875
	Train Perplexity: 317.132


                                                           

tensor(13253.)
128
Accuracy: 10353.90625
	Train Perplexity: 269.570


                                                          

KeyboardInterrupt: 

In [37]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [51]:
torch.argmax(output[-1][0][0][0])

tensor(241)

In [158]:
vocab.lookup_tokens([18, 1219, 651, 5])

['an', 'expansive', 'view', 'of']

In [8]:
import torch
from torchtext.datasets import AG_NEWS

train_iter = iter(AG_NEWS(split="train"))

In [7]:
next(train_iter)

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [8]:
next(train_iter)

(3,
 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.')

In [4]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")
train_iter = AG_NEWS(split="train")


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)


vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [13]:
vocab(['usa', 'is', 'an', 'example'])

[2451, 21, 30, 5297]

In [14]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

In [15]:
text_pipeline('here is the an example')

[475, 21, 2, 30, 5297]

In [19]:
label_pipeline('2451')

2450

In [20]:
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)


train_iter = AG_NEWS(split="train")
dataloader = DataLoader(
    train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch
)