In [132]:
###########

In [2]:
import pandas as pd
import numpy as np
import string, os
import warnings

import torch

warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
curr_dir = './comments/'
all_headlines = []
for filename in os.listdir(curr_dir):
    if 'Articles' in filename:
        article_df = pd.read_csv(curr_dir + filename)
        all_headlines.extend(list(article_df.headline.values))
        break

In [4]:
all_headlines = [line for line in all_headlines if line!= "Unknown"]
print(all_headlines[:10])

['Finding an Expansive View  of a Forgotten People in Niger', 'And Now,  the Dreaded Trump Curse', 'Venezuela’s Descent Into Dictatorship', 'Stain Permeates Basketball Blue Blood', 'Taking Things for Granted', 'The Caged Beast Awakens', 'An Ever-Unfolding Story', 'O’Reilly Thrives as Settlements Add Up', 'Mouse Infestation', 'Divide in G.O.P. Now Threatens Trump Tax Plan']


In [5]:
def clean_text(txt):
    txt = "".join(t for t in txt if t not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt

In [6]:
corpus = [clean_text(x) for x in all_headlines]
print(corpus[:14])

['finding an expansive view  of a forgotten people in niger', 'and now  the dreaded trump curse', 'venezuelas descent into dictatorship', 'stain permeates basketball blue blood', 'taking things for granted', 'the caged beast awakens', 'an everunfolding story', 'oreilly thrives as settlements add up', 'mouse infestation', 'divide in gop now threatens trump tax plan', 'variety puzzle acrostic', 'they can hit a ball 400 feet but play catch thats tricky', 'in trump country shock at trump budget cuts', 'why is this hate different from all other hate']


In [8]:
from torchtext.data import get_tokenizer
tokenizer = get_tokenizer("basic_english")

In [9]:
tokenizer(corpus[1])

['and', 'now', 'the', 'dreaded', 'trump', 'curse']

In [10]:
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

In [11]:
from torchtext.vocab import build_vocab_from_iterator
vocab = build_vocab_from_iterator(yield_tokens(corpus), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

In [12]:
def get_sequence_of_token(corpus):
    input_sequnces = []
    for line in corpus:
        token_list = tokenizer(line)
        for i in range(1,len(token_list)):
            n_gram_sequence = token_list[:i+1]
            _n_gram_sequence  = vocab(n_gram_sequence)
            input_sequnces.append(_n_gram_sequence)
    return input_sequnces

In [13]:
inp_seq = get_sequence_of_token(corpus=corpus)

In [14]:
total_words = len(vocab)
total_words

2423

In [15]:
def to_categorical(y, num_classes):
    """ 1-hot encodes a tensor """
    return np.eye(num_classes, dtype='float32')[y]

In [16]:
def pad_sequences_torch(arr,maxlen):
    seq_len = len(arr)
    padding = (maxlen-seq_len,0)
    pad = torch.nn.ZeroPad2d(padding)
    return pad(torch.tensor(arr))

In [17]:
def generate_padded_sequences(inp_seq):
    max_sequence_len = max([len(x) for x in inp_seq])
    res_1 = [pad_sequences_torch(i,max_sequence_len) for i in inp_seq]
    input_sequences = np.array(res_1)
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

In [18]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [19]:
len(inp_seq)

4806

In [20]:
class CreateDataset(Dataset):
    def __init__(self, inp_seq):
        self.inp_seq = inp_seq
        max_sequence_len = max([len(x) for x in self.inp_seq])
        res_1 = [pad_sequences_torch(i,max_sequence_len) for i in self.inp_seq]
        self.input_sequences = np.array(res_1)
         
    def __getitem__(self, index):
        
        fetures, label = self.input_sequences[index,:-1],self.input_sequences[index,-1]
        label = to_categorical(label, num_classes=total_words)
        return fetures, label
    
    def __len__(self):
        return len(self.inp_seq)

In [21]:
train_dataset = CreateDataset(inp_seq=inp_seq)
train_loader = DataLoader(train_dataset, shuffle=True)

In [22]:
example = iter(train_loader)
feature, label = next(example)

In [23]:
feature[0]

tensor([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,  59, 396,   7])

In [24]:
label[0]

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [26]:
pred, label, max_seq = generate_padded_sequences(inp_seq)

In [27]:
pred

array([[   0,    0,    0, ...,    0,    0,  185],
       [   0,    0,    0, ...,    0,  185,   18],
       [   0,    0,    0, ...,  185,   18, 1219],
       ...,
       [   0,    0,    0, ...,  101, 2180,   57],
       [   0,    0,    0, ..., 2180,   57,  347],
       [   0,    0,    0, ...,   57,  347,   95]])

In [28]:
label.shape

(4806, 2423)

In [29]:
import torch
from torch.optim import Adam
import torch.nn as nn

In [30]:
a = nn.Embedding(total_words, 10)
a

Embedding(2423, 10)

In [31]:
inp_seq[0]

[185, 18]

In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [34]:
train_on_gpu = False

In [42]:
import torch.nn as nn

class RNN(nn.Module):
    
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        super(RNN, self).__init__()
      
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        
        
        self.vocab_size = vocab_size
        self.output_size = output_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
     
        self.fc = nn.Linear(hidden_dim, output_size)
    
    
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x=x.long()
        
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.fc(lstm_out)
        
        out = out.view(batch_size, -1, self.output_size)
        out = out[:, -1]

        return out, hidden
    
    
    def init_hidden(self, batch_size):
        weights = next(self.parameters()).data
        if(train_on_gpu):
            hidden = (weights.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(), 
                     weights.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weights.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                     weights.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [43]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
   
    h = tuple([each.data for each in hidden])
    
    rnn.zero_grad()
  
    inputs, targets = inp.to(device), target.to(device)
    
    output, h = rnn(inputs, h)
    
    loss = criterion(output, targets)
    
    loss.backward()
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)
    optimizer.step()

    return loss.item(), h

In [44]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100):
    batch_losses = []
    
    rnn.train()

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):
        
        hidden = rnn.init_hidden(batch_size)
        
        for batch_i, (inputs, labels) in enumerate(train_loader, 1):
            
            n_batches = len(train_loader.dataset)//batch_size
            if(batch_i > n_batches):
                break
            
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)          
            batch_losses.append(loss)

            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []
    return rnn

In [45]:
sequence_length = 18  # of words in a sequence
# Batch Size
batch_size = 1

In [65]:
num_epochs = 10
# Learning Rate
learning_rate = 0.001

# Model parameters
# Vocab size
vocab_size = len(vocab)
# Output size
output_size = vocab_size
# Embedding Dimension
embedding_dim = 200
# Hidden Dimension
hidden_dim = 250
# Number of RNN Layers
n_layers = 4

show_every_n_batches = 500

In [66]:
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
print(rnn)
if train_on_gpu:
    rnn.cuda()

optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()

# training the model
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

RNN(
  (embedding): Embedding(2423, 200)
  (lstm): LSTM(200, 250, num_layers=4, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=250, out_features=2423, bias=True)
)
Training for 10 epoch(s)...
Epoch:    1/10    Loss: 0.01564548803237267

Epoch:    1/10    Loss: 0.0036303741591982544

Epoch:    1/10    Loss: 0.003786800573579967

Epoch:    1/10    Loss: 0.0038206388072576374

Epoch:    1/10    Loss: 0.003827467465540394

Epoch:    1/10    Loss: 0.0037638499792665243

Epoch:    1/10    Loss: 0.003715909888036549

Epoch:    1/10    Loss: 0.003695638798875734

Epoch:    1/10    Loss: 0.003617398198926821

Epoch:    2/10    Loss: 0.0034660280694568035

Epoch:    2/10    Loss: 0.003443441507406533

Epoch:    2/10    Loss: 0.0034820756090339273

Epoch:    2/10    Loss: 0.0034912491112481805

Epoch:    2/10    Loss: 0.0035138779666740445

Epoch:    2/10    Loss: 0.0033886819726321846

Epoch:    2/10    Loss: 0.0034240900070872157

Epoch:    2/10    Loss: 0.003537353924708441

Epoch: 

In [67]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [68]:
hidden = rnn.init_hidden(batch_size)
h = tuple([each.data for each in hidden])


In [96]:
token_list = tokenizer('the new york')
text  = vocab(token_list)
text = pad_sequences_torch(text,18)
text

tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2, 12, 31])

In [97]:
output = rnn(text.reshape(1,-1),h)

In [98]:
torch.argmax(output[-1][0][0][0])

tensor(242)

In [100]:
vocab.lookup_tokens([242])

['cant']