In [58]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data as data
import matplotlib.pyplot as plt
import pandas as pd
import torchtext as tt
import spacy
import time
from datetime import datetime
import re
import nltk
import string
import pickle

In [59]:
dtype = torch.FloatTensor
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
sentences = ''
corpus_size = 2000


In [60]:
# to prevent reuters to be generated right after eg. washington because every article starts with "[location] - reuters ..."
def filter_reuters(txt):
    idx = txt.index('-')
    return txt[idx+1:].strip()

start_time = datetime.fromtimestamp(time.time())
print(f'<{start_time}> training started on {device}')

<2023-05-26 12:44:18.598132> training started on cuda


In [61]:
stop = set(nltk.corpus.stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = nltk.stem.wordnet.WordNetLemmatizer()

def clean(doc):
        stop_free = " ".join([i for i in doc.split() if i not in stop])
        punc_free = "".join(ch for ch in stop_free if ch not in exclude)
        normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
        return normalized

In [62]:
# load data and clean it


df = pd.read_csv("./data/True.csv")
nlp = spacy.load('en_core_web_sm')
tokenizer = tt.data.utils.get_tokenizer('spacy')
np_array = df['text'].values
txt_array = np_array.tolist()[0:corpus_size]
txt_array = [filter_reuters(txt) for txt in txt_array]
sentences = '\n'.join(txt_array)
sentences = sentences.lower()

number_match = re.compile('\b.*[0-9].*\b')
punctuation_match = re.compile('\b.*(\.|\\|\/|,\/|\(|\)).*\b')

for sentence in sentences:
    sentence = number_match.sub(repl="", string=sentence)
    sentence = punctuation_match.sub(repl="", string=sentence)

print(f'number of articles: {len(txt_array)}')
print(f'length of sentences: {len(sentences)}')


# list all the words present in our corpus
word_sequence = tokenizer(sentences)
word_list = list(set(word_sequence))

# split sentences into array of tokens
for sentence in sentences:
    sentence = tokenizer(sentence)

vocab_length = len(word_list)

print(f'{vocab_length} unique tokens')


print(word_sequence[:10])
print(f'word sequence of {len(word_sequence)} words')



number of articles: 2000
length of sentences: 4752429
21319 unique tokens
['the', 'head', 'of', 'a', 'conservative', 'republican', 'faction', 'in', 'the', 'u.s']
word sequence of 889009 words


In [63]:
#load pretrained word to vector and vector to word dictionaries

word_to_vector = {}
vector_to_word = {}

with open('./models/w2v.dat', 'rb') as f_path:
    word_to_vector = pickle.load(f_path)
    

with open('./models/v2w.dat', 'rb') as f_path:
    vector_to_word = pickle.load(f_path)

In [64]:
# convert words to vectors


# sentences_as_vectors = np.array([np.array(list(word_to_vector[word])) for word in word_sequence])

word_sequence = word_sequence[:-59]


seq_length = 50
dataX = []
dataY = []
for i in range(0, len(word_sequence) - seq_length, 1):
    seq_in = word_sequence[i:i + seq_length]
    seq_out = word_sequence[i + seq_length]
    dataX.append([word_to_vector[word] for word in seq_in])
    dataY.append(word_to_vector[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)


Total Patterns:  888900


In [65]:
# reshape X to be [samples, time steps, features]
X = torch.tensor(dataX, dtype=torch.float32).reshape([n_patterns, seq_length, 2])
# X = X / float(vocab_length)
y = torch.tensor(dataY)
print(X.shape, y.shape)

torch.Size([888900, 50, 2]) torch.Size([888900, 2])


In [77]:
class ModelOfLastHope(nn.Module):
    def __init__(self, input_size=2):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=256, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(256, vocab_length)
    def forward(self, x):
        x, _ = self.lstm(x)
        # take only the last output
        x = x[:, -1, :]
        # produce output
        x = self.linear(self.dropout(x))
        return x

In [81]:
n_epochs = 40
batch_size = 128
model = ModelOfLastHope(input_size=2)
 
optimizer = optim.Adam(model.parameters())
# loss_fn = nn.CrossEntropyLoss(reduction="sum")
loss_fn = nn.CrossEntropyLoss()
loader = data.DataLoader(data.TensorDataset(X, y), shuffle=True, batch_size=batch_size)
 
best_model = None
best_loss = np.inf
for epoch in range(n_epochs):
    model.train()
    for X_batch, y_batch in loader:
        print(X_batch.shape)
        y_pred = model(X_batch)
        print(y_pred.shape)
        print(y_batch.shape)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Validation
    model.eval()
    loss = 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            y_pred = model(X_batch)
            loss += loss_fn(y_pred, y_batch)
        if loss < best_loss:
            best_loss = loss
            best_model = model.state_dict()
        print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))
 
torch.save([best_model, word_to_vector], "word_by_word_model.pth")

torch.Size([128, 50, 2])
torch.Size([128, 21319])
torch.Size([128, 2])


RuntimeError: 0D or 1D target tensor expected, multi-target not supported

In [88]:
# hvorfor kan vi ikke det...?

print(y_pred.shape)
print(y_batch.shape)
print(y_batch)
loss = loss_fn(y_pred, y_batch)

torch.Size([21319])
torch.Size([2])
tensor([ 0.5377, -0.4906])


RuntimeError: size mismatch (got input: [21319], target: [2])