# Section 1: Importing the libraries and reading the dataset

In [22]:
import os
import numpy as numpy
import pandas as pd
import matplotlib.pyplot as plt


In [23]:
train_path = 'data/train-1.csv'
val_path = 'data/val-1.csv'
num_workers = 4

In [24]:
train_data = pd.read_csv(train_path)
val_data = pd.read_csv(val_path)

In [25]:
attributes = train_data.columns[1:]

# Section 2: Preparing the dataset

In [26]:
import torch
from transformers import AutoTokenizer
from NarrativesDataset import NarrativesDataset

In [27]:
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
n_ds_train = NarrativesDataset(data_path = train_path, tokenizer = tokenizer, attributes = attributes)
n_ds_val = NarrativesDataset(data_path = val_path, tokenizer = tokenizer, attributes = attributes)

In [28]:
from DataModule import NarrativesDataModule

In [29]:
narratives_data_module = NarrativesDataModule(train_path = train_path, val_path = val_path, attributes = attributes, num_workers = num_workers)

In [30]:
narratives_data_module.setup()

In [31]:
narratives_data_module.train_dataloader()

<torch.utils.data.dataloader.DataLoader at 0x2360920c290>

# Section 3: Model

In [32]:
from Model import NarrativesClassifier

In [33]:
config = {
    'model_name': 'distilroberta-base',
    'n_labels': len(attributes),
    'batch_size': 128,
    'lr': 1.5e-6,
    'warmup': 0.2, 
    'train_size': len(narratives_data_module.train_dataloader()),
    'weight_decay': 0.001,
    'n_epochs': 100
}

In [34]:
model = NarrativesClassifier(config = config)

In [35]:
idx = 0
input_ids = n_ds_train.__getitem__(idx)['input_ids']
attention_mask = n_ds_train.__getitem__(idx)['attention_mask']
labels = n_ds_train.__getitem__(idx)['labels']
model.cpu()
loss, output = model(input_ids.unsqueeze(dim = 0), attention_mask.unsqueeze(dim = 0), labels.unsqueeze(dim = 0))
print(labels.shape, output.shape, output)

torch.Size([52]) torch.Size([1, 52]) tensor([[-0.2860,  0.0134, -0.3681, -0.2445, -0.1817, -0.2962, -0.1619, -0.0843,
          0.0527, -0.4062, -0.1561,  0.0338, -0.2610, -0.0368,  0.4710,  0.0472,
         -0.2934,  0.2755, -0.2753,  0.1906, -0.4113, -0.2892, -0.2233,  0.1624,
          0.1170,  0.3357,  0.1812, -0.5657, -0.1702,  0.0760, -0.1874, -0.3793,
         -0.0445,  0.3741, -0.1314,  0.4150, -0.2383, -0.6159, -0.4942,  0.1915,
         -0.3192,  0.2654, -0.1000, -0.5380,  0.6963,  0.6142,  0.0562,  0.0946,
         -0.2561, -0.1542, -0.0482, -0.4664]], grad_fn=<AddmmBackward0>)


  attributes = torch.FloatTensor(item[self.attributes])


# Section 4: Training the model

In [36]:
narratives_data_module = NarrativesDataModule(train_path, val_path, attributes = attributes, batch_size = config['batch_size'], num_workers = num_workers)
narratives_data_module.setup()

In [37]:
model = NarrativesClassifier(config = config)

In [39]:
import pytorch_lightning as pl

trainer = pl.Trainer(max_epochs = config['n_epochs'], num_sanity_val_steps = 50)
trainer.fit(model, narratives_data_module)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs



  | Name             | Type              | Params
-------------------------------------------------------
0 | pretrained_model | RobertaModel      | 82.1 M
1 | hidden           | Linear            | 590 K 
2 | classifier       | Linear            | 40.0 K
3 | loss_func        | BCEWithLogitsLoss | 0     
4 | dropout          | Dropout           | 0     
-------------------------------------------------------
82.7 M    Trainable params
0         Non-trainable params
82.7 M    Total params
330.996   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\patri\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

c:\Users\patri\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.
c:\Users\patri\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\loops\fit_loop.py:293: The number of training batches (13) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 33:  31%|███       | 4/13 [01:41<03:47,  0.04it/s, v_num=1, train loss=0.348, validation loss =0.272] 

c:\Users\patri\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [40]:
import pickle

In [41]:
with open('trainer.pkl', 'wb') as f:
    pickle.dump(trainer, f)