In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as tdata
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_pickle('../data/text/dataset_embedded.pkl')
df = df.sample(frac=0.001)

In [3]:
class Dataset(tdata.Dataset):
  def __init__(self, df, list_IDs):
    self.df = df.loc[list_IDs]
    self.list_IDs = list(self.df.index)
    self.langs = list(df['lang'].unique())

  def __len__(self):
    return len(self.list_IDs)
  
  def lang_to_onehot(self, lang):
    lang = self.langs.index(lang)
    onehot = torch.zeros(len(self.langs))
    onehot[lang] = 1
    return onehot

  def __getitem__(self, index):
    idx = self.list_IDs[index]
    row = self.df.loc[idx]
    
    article_count = torch.tensor([row['article_count']])
    lang = self.lang_to_onehot(row['lang'])
    date = torch.tensor([row['event_date']])

    title_embed = torch.tensor(row['title_embed'][0])
    #summary_embed = torch.tensor(row['summary_embed'][0])

    X = torch.cat((lang, date, title_embed,))

    return X, article_count

In [4]:
epochs = 200
params = {
  'batch_size': 64,
  'shuffle': True,
  'num_workers': 4,
}

In [5]:
train_ids, test_ids = train_test_split(df.index, test_size=0.2, random_state=42)

train_dataset = Dataset(df, train_ids)
test_dataset = Dataset(df, train_ids) # same as train
# test_dataset = Dataset(df, test_ids)

train_generator = tdata.DataLoader(train_dataset, **params)
test_generator = tdata.DataLoader(test_dataset, **params)

In [6]:
class Model(nn.Module):
  def __init__(self, input_size):
    super(Model, self).__init__()

    self.linear1 = nn.Linear(input_size, 128)
    self.linear2 = nn.Linear(128, 64)
    self.linear3 = nn.Linear(64, 32)
    self.linear4 = nn.Linear(32, 1)

  def forward(self, x):
    
    x = F.relu(self.linear1(x))
    x = F.relu(self.linear2(x))
    x = F.relu(self.linear3(x))
    x = F.relu(self.linear4(x))

    return x

In [7]:
class RMSLELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self, pred, actual):
        return torch.sqrt(self.mse(torch.log(pred + 1), torch.log(actual + 1)))

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [9]:
model = Model(1*768 + 4 + 1)
criteria = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5)

model = model.to(device)

In [10]:
for epoch in range(epochs):
  model.train()
  for x, y in train_generator:
    x, y = x.to(device), y.to(device)

    optimizer.zero_grad()

    outputs = model(x.float())
    loss = criteria(outputs, y.float())
    loss.backward()
    optimizer.step()

  model.eval()
  with torch.no_grad():
    test_loss = []
    
    for x, y in test_generator:
      x, y = x.to(device), y.to(device)

      outputs = model(x.float())
      loss = criteria(outputs, y.float())
      test_loss.append(loss.item())

  print('Epoch: {}, Loss: {}'.format(epoch, np.mean(test_loss)))

Epoch: 0, Loss: 585.3194452921549
Epoch: 1, Loss: 721.2002360026041
Epoch: 2, Loss: 685.7989908854166
Epoch: 3, Loss: 609.0653940836588
Epoch: 4, Loss: 720.4079793294271
Epoch: 5, Loss: 724.5191040039062
Epoch: 6, Loss: 824.8701578776041
Epoch: 7, Loss: 957.0680745442709
Epoch: 8, Loss: 1050.7671712239583
Epoch: 9, Loss: 813.27197265625
Epoch: 10, Loss: 591.6645456949869
Epoch: 11, Loss: 589.4091440836588
Epoch: 12, Loss: 609.3223368326823
Epoch: 13, Loss: 597.2530873616537
Epoch: 14, Loss: 699.2029418945312
Epoch: 15, Loss: 699.3956502278646
Epoch: 16, Loss: 593.1491139729818
Epoch: 17, Loss: 593.0848744710287
Epoch: 18, Loss: 748.7218424479166
Epoch: 19, Loss: 609.4793599446615
Epoch: 20, Loss: 622.6905822753906
Epoch: 21, Loss: 628.2363077799479
Epoch: 22, Loss: 1056.8125
Epoch: 23, Loss: 725.0044352213541
Epoch: 24, Loss: 589.2949447631836
Epoch: 25, Loss: 695.1917521158854
Epoch: 26, Loss: 587.938850402832
Epoch: 27, Loss: 998.4147135416666
Epoch: 28, Loss: 953.2353515625
Epoch: 2

In [14]:
# Generate predictions and save them in a pandas dataframe along with their true values
model.eval()
with torch.no_grad():
  true = []
  prd = []
  for x, y in test_generator:
    x, y = x.to(device), y.to(device)

    outputs = model(x.float())
    true.append(y.cpu().numpy())
    prd.append(outputs.cpu().numpy())

true = np.concatenate(true)
prd = np.concatenate(prd)
outputs = pd.DataFrame({'true': true.flatten(), 'pred': prd.flatten()})


In [15]:
outputs[outputs['pred'] > 0]

Unnamed: 0,true,pred


In [16]:
outputs.shape

(155, 2)