In [None]:
!python -m spacy download en_core_web_sm

import pandas as pd
import torchtext as tt
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F




In [None]:
# Define the RNN model
class RNN(nn.Module):
    """Token-level LSTM language model: embedding -> LSTM -> linear -> log-softmax.

    The LSTM keeps PyTorch's default sequence-first layout, i.e. it consumes
    tensors shaped ``(seq_len, batch, hidden_size)``.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Width of both the embeddings and the LSTM state.
        output_size: Number of classes scored for every position.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Score every position of a token-index sequence.

        Args:
            input: Integer tensor of token indices, either ``(seq_len,)`` for a
                single sequence or ``(seq_len, batch)`` for a sequence-first batch.
            hidden: ``(h_0, c_0)`` tuple, each ``(1, batch, hidden_size)``.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has shape
            ``(seq_len * batch, output_size)`` with rows in sequence-major order
            (for a 1-D input that is simply ``(seq_len, output_size)``), and
            ``hidden`` is the updated ``(h_n, c_n)`` tuple.
        """
        # Only add the batch axis when it is missing; unconditionally unsqueezing
        # turned an already-batched (seq_len, batch) input into a 4-D tensor that
        # nn.LSTM rejects.
        if input.dim() == 1:
            input = input.unsqueeze(1)
        embedded = self.embedding(input)  # (seq_len, batch, hidden_size)
        output, hidden = self.rnn(embedded, hidden)
        # Flatten (seq_len, batch) so LogSoftmax(dim=1) normalises over classes.
        # Indexing `[:, -1, :]` would pick the last *batch* entry here, which is
        # only correct when batch == 1.
        output = self.fc(output.reshape(-1, self.hidden_size))
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h_0, c_0)`` pair, each ``(1, batch_size, hidden_size)``.

        The tensors are created on the same device and with the same dtype as
        the model's parameters so they can be fed to ``forward`` directly after
        the module has been moved with ``.to(...)``.
        """
        reference = self.fc.weight
        shape = (1, batch_size, self.hidden_size)
        return (torch.zeros(shape, device=reference.device, dtype=reference.dtype),
                torch.zeros(shape, device=reference.device, dtype=reference.dtype))

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One row per article; the raw article body lives in the 'text' column.
df = pd.read_csv("./data/True.csv")

# NOTE(review): `nlp` is not used by any cell visible here — kept in case a
# later cell relies on it.
nlp = spacy.load('en_core_web_sm')
# Name the spaCy model explicitly (the one downloaded at the top of the
# notebook) instead of relying on the legacy 'en' shortcut.
tokenizer = tt.data.utils.get_tokenizer('spacy', language='en_core_web_sm')

np_array = df['text'].values

txt_array = np_array.tolist()
print(txt_array[0])

# Tokenize every article exactly once. The flat `tokens` list (used to build
# the vocabulary) is derived from the same per-article tokenization that is
# later used to look tokens up, so the vocabulary is guaranteed to contain
# every token of every article. Tokenizing the space-joined corpus instead
# can yield different whitespace tokens at article boundaries, and it also
# meant running the tokenizer over the whole corpus twice.
tokenized_articles = [tokenizer(article) for article in txt_array]
tokens = [token for article_tokens in tokenized_articles for token in article_tokens]

# Length (in tokens) of the longest article; 0 when there are no articles.
max_size = max((len(article_tokens) for article_tokens in tokenized_articles), default=0)




In [None]:
print(tokens[0:20])

print(f'longest article contains {max_size} tokens')

# Vocabulary: index 0 is reserved for the padding token. The remaining tokens
# are sorted so the token -> index mapping is identical on every run (set
# iteration order for strings changes between interpreter sessions), and a
# literal "<pad>" token in the data is excluded so it cannot be added twice
# and shift the padding index away from 0.
unique_tokens = ["<pad>"]
unique_tokens.extend(sorted(set(tokens) - {"<pad>"}))

print(type(tokens[0]))

# Lookup tables in both directions.
word2idx = {word: idx for idx, word in enumerate(unique_tokens)}
idx2word = dict(enumerate(unique_tokens))

# The model predicts a distribution over the same vocabulary it reads.
input_size = len(unique_tokens)
output_size = input_size

# Training hyperparameters.
learning_rate = 0.01
num_epochs = 10
batch_size = 1
hidden_size = 128

In [None]:
# The model must live on the same device as the tensors it is fed.
rnn = RNN(input_size, hidden_size, output_size).to(device)

pad_idx = word2idx["<pad>"]
# Padding positions carry no information, so they are excluded from the loss.
criterion = nn.NLLLoss(ignore_index=pad_idx)
optimizer = optim.SGD(rnn.parameters(), lr=learning_rate)

print(pad_idx)


def get_w2i_tensor(sentence, pad=True):
    """Convert a text into a 1-D tensor of vocabulary indices on `device`.

    Args:
        sentence: Raw text; it is split with the notebook's `tokenizer`.
        pad: When True (default), pad with the padding index up to
            ``max_size + 1`` entries. Texts that are already that long or
            longer are returned unpadded and untruncated.

    Returns:
        ``torch.long`` tensor of token indices.

    Raises:
        KeyError: If a token is not present in `word2idx`.
    """
    ids = [word2idx[word] for word in tokenizer(sentence)]
    if pad:
        # A negative repeat count yields an empty list, so over-long inputs
        # are simply left as they are.
        ids.extend([pad_idx] * (max_size - len(ids) + 1))
    return torch.tensor(ids, dtype=torch.long).to(device)


rnn.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    num_steps = 0
    for sentence in txt_array:
        # One article per step (1-D input == batch of one), so no padding is
        # needed; padded positions would be ignored by the loss anyway.
        token_ids = get_w2i_tensor(sentence, pad=False)
        if token_ids.numel() < 2:
            # Need at least one (input, next-token) pair to learn from.
            continue

        # Next-token prediction: the target at position t is the token at t + 1.
        inputs = token_ids[:-1]
        targets = token_ids[1:]

        # Articles are independent documents, so each one starts from a fresh
        # state. This also keeps every step's autograd graph self-contained:
        # carrying the state across steps made backward() walk into the
        # previous, already-freed graph, which is what forced retain_graph=True.
        # init_hidden may allocate on the CPU, hence the explicit move.
        hidden = tuple(state.to(device) for state in rnn.init_hidden(batch_size))

        # Gradients accumulate by default; clear them before every step.
        optimizer.zero_grad()

        output, hidden = rnn(inputs, hidden)

        loss = criterion(output.view(-1, output_size), targets.view(-1))

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        num_steps += 1

    # Report the mean per-article loss of the epoch.
    mean_loss = epoch_loss / max(num_steps, 1)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {mean_loss:.4f}')

In [None]:
# Use the trained RNN to generate text
seed_text = "hello"
max_len = 10

# Build tensors on whatever device the model's parameters live on.
model_device = next(rnn.parameters()).device

# Split the seed with the same tokenizer that produced the vocabulary, and
# keep only tokens the model knows about.
seed_tokens = tokenizer(seed_text)
seed_words = [word for word in seed_tokens if word in word2idx]
unknown_words = [word for word in seed_tokens if word not in word2idx]
if unknown_words:
    print(f'ignoring seed tokens missing from the vocabulary: {unknown_words}')
if not seed_words:
    raise ValueError(f'none of the tokens in seed text {seed_text!r} are in the vocabulary')

# Copy so that extending the generated text does not mutate the seed list.
generated_text = list(seed_words)

rnn.eval()
with torch.no_grad():  # inference only: no autograd graph is needed
    hidden = tuple(state.to(model_device) for state in rnn.init_hidden(batch_size))

    # Warm up the hidden state on the seed. Each step feeds a 1-D, length-1
    # tensor; RNN.forward adds the batch dimension itself.
    for word in seed_words:
        step_input = torch.tensor([word2idx[word]], device=model_device)
        output, hidden = rnn(step_input, hidden)

    # Generate text: greedily take the most likely next token, then feed that
    # token back in so the following prediction is conditioned on it.
    for _ in range(max_len):
        next_idx = output[-1].argmax().item()
        next_word = idx2word[next_idx]
        generated_text.append(next_word)

        step_input = torch.tensor([next_idx], device=model_device)
        output, hidden = rnn(step_input, hidden)

print(' '.join(generated_text))