In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [2]:
class Net(nn.Module):

    def __init__(self, n_langs=4, llm_embed_size=768):
        super(Net, self).__init__()
        self.input_dim = n_langs + 2*llm_embed_size + 1 # timestamp

        # Define layers
        self.fc1 = nn.Linear(in_features=self.input_dim, out_features=1024) # First hidden layer
        self.fc2 = nn.Linear(1024, 512) # Second hidden layer
        self.fc3 = nn.Linear(512, 128) # Second hidden layer
        self.fc4 = nn.Linear(128, 64) # Second hidden layer
        self.fc5 = nn.Linear(64, 1) # Output layer

        self.fakju6 = nn.Linear(5, 1)

    def forward(self, x):
        
        # Add hidden layers with relu activation function
        # x = F.relu(self.fc1(x))
        # x = F.relu(self.fc2(x))
        # x = F.relu(self.fc3(x))
        # x = F.relu(self.fc4(x))
        # x = F.relu(self.fc5(x))

        x = self.fakju6(x)

        return x

# Data

In [3]:
import pandas as pd

df = pd.read_pickle('../data/text/dataset_embedded.pkl')
df.head()

Unnamed: 0_level_0,article_count,event_date,title_embed,summary_embed,lang_deu,lang_eng,lang_spa,lang_zho
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
e_11,7,1387325000.0,"[[0.47470707, -0.08501352, 0.26899937, -0.3635...","[[0.3377615, -0.26158097, 0.3140225, -0.212071...",False,True,False,False
e_10,221,1387411000.0,"[[0.17094071, -0.18888026, 0.28712985, -0.3610...","[[0.1786896, -0.11662727, 0.19326286, -0.20948...",False,True,False,False
e_13,9,1387498000.0,"[[0.2537402, -0.032281302, 0.37904784, -0.3181...","[[0.026874868, -0.09318099, 0.03552014, -0.026...",False,True,False,False
e_12,1,1387066000.0,"[[0.23880291, 0.03649398, 0.32137018, -0.17099...","[[0.46094257, -0.36103615, 0.31917268, -0.6018...",False,True,False,False
e_15,8,1387325000.0,"[[0.3985864, -0.06734807, 0.40732777, -0.46121...","[[0.5053371, -0.062929116, 0.27972195, -0.4317...",False,True,False,False


In [4]:
# randomly drop 75% of the data
df = df.head(10)

In [5]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

train_df, test_df = train_test_split(df, test_size=0.2)

In [6]:
lang_cols = ['lang_eng', 'lang_deu', 'lang_spa', 'lang_zho']
x_cols = ['event_date', 'title_embed', 'summary_embed']
y_cols = ['article_count']
def get_xy(df):
    date = df['event_date'].values
    langs = df[lang_cols].values
    langs = torch.tensor(langs, dtype=torch.bool)
    date = torch.tensor(date, dtype=torch.float64)
    date = date.unsqueeze(1)

    #title_embeds = torch.tensor(df['title_embed'].values.tolist(), dtype=torch.float64).reshape(-1, 768)
    #summary_embeds = torch.tensor(df['summary_embed'].values.tolist(), dtype=torch.float64).reshape(-1, 768)

    #X = torch.cat((date, langs, title_embeds, summary_embeds), dim=1)
    X = torch.cat((date, langs), dim=1)
    Y = df[y_cols].to_numpy(dtype=np.float32)
    return X.float(), torch.tensor(Y, dtype=torch.float32)

# Train

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)

In [8]:
class RMSLELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self, pred, actual):
        return torch.sqrt(self.mse(torch.log(pred + 1), torch.log(actual + 1)))

In [9]:
model = Net(llm_embed_size=0, n_langs=0)
criterion = nn.MSELoss() # RMSLELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-6)

In [10]:
epochs = 100
batch_size = 128

In [11]:
# train_X, train_Y = get_xy(train_df)
# test_X, test_Y = get_xy(test_df)

test_X = torch.tensor([
    [1, 1, 1, 1, 1],
    [2, 2, 2, 2, 2],
    [3, 3, 3, 3, 3],
    [4, 4, 4, 4, 4],
    [5, 5, 5, 5, 5],
    [6, 6, 6, 6, 6]
], dtype=torch.float32)
test_Y = torch.tensor([
    [1],
    [2],
    [3],
    [4],
    [5],
    [6]
], dtype=torch.float32)

# train_ds = TensorDataset(train_X, train_Y)
test_ds = TensorDataset(test_X, test_Y)
train_ds = test_ds

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [12]:
for epoch in range(epochs):
    model.train()
    train_loss_accumulator = 0.0
    for x, y in train_dl:
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        output = model(x)
        loss_value = criterion(output, y)
        train_loss_accumulator += loss_value.item()  # Sum the loss for each batch

        loss_value.backward()
        optimizer.step()

    average_train_loss = train_loss_accumulator / len(train_dl)  # Calculate the average loss

    model.eval()
    total_test_loss = 0.0
    with torch.no_grad():
        for x, y in test_dl:
            x, y = x.to(device), y.to(device)
            output = model(x)
            total_test_loss += criterion(output, y).item()
    average_test_loss = total_test_loss / len(test_dl)

    print(f'Epoch {epoch}: train loss {average_train_loss:.4f}, test loss {average_test_loss:.4f}')

Epoch 0: train loss 3.4276, test loss 3.4275
Epoch 1: train loss 3.4275, test loss 3.4274
Epoch 2: train loss 3.4274, test loss 3.4273
Epoch 3: train loss 3.4273, test loss 3.4273
Epoch 4: train loss 3.4273, test loss 3.4272
Epoch 5: train loss 3.4272, test loss 3.4271
Epoch 6: train loss 3.4271, test loss 3.4270
Epoch 7: train loss 3.4270, test loss 3.4270
Epoch 8: train loss 3.4270, test loss 3.4269
Epoch 9: train loss 3.4269, test loss 3.4268
Epoch 10: train loss 3.4268, test loss 3.4267
Epoch 11: train loss 3.4267, test loss 3.4267
Epoch 12: train loss 3.4267, test loss 3.4266
Epoch 13: train loss 3.4266, test loss 3.4265
Epoch 14: train loss 3.4265, test loss 3.4264
Epoch 15: train loss 3.4264, test loss 3.4264
Epoch 16: train loss 3.4264, test loss 3.4263
Epoch 17: train loss 3.4263, test loss 3.4262
Epoch 18: train loss 3.4262, test loss 3.4261
Epoch 19: train loss 3.4261, test loss 3.4261
Epoch 20: train loss 3.4261, test loss 3.4260
Epoch 21: train loss 3.4260, test loss 3.425

# Evaluation

In [13]:
# Generate predictions
model.eval()
with torch.no_grad():
    test_X, test_Y = get_xy(test_df)
    test_X, test_Y = test_X.to(device), test_Y.to(device)
    test_output = model(test_X)
    test_loss = criterion(test_output, test_Y)
    print(f'Test loss: {test_loss.item():.4f}')

    test_df['prediction'] = test_output.cpu().numpy()
    test_df['error'] = test_df['prediction'] - test_df['article_count']
    test_df['abs_error'] = test_df['error'].abs()

Test loss: 358812640870400.0000


In [14]:
test_df

Unnamed: 0_level_0,article_count,event_date,title_embed,summary_embed,lang_deu,lang_eng,lang_spa,lang_zho,prediction,error,abs_error
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
e_11,7,1387325000.0,"[[0.47470707, -0.08501352, 0.26899937, -0.3635...","[[0.3377615, -0.26158097, 0.3140225, -0.212071...",False,True,False,False,18942358.0,18942351.0,18942351.0
e_15,8,1387325000.0,"[[0.3985864, -0.06734807, 0.40732777, -0.46121...","[[0.5053371, -0.062929116, 0.27972195, -0.4317...",False,True,False,False,18942358.0,18942350.0,18942350.0
