# Section 1: Importing the libraries and reading the dataset

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# Dataset selection: the hierarchical split is the one every later cell uses.
# The flat split ('data/train-1.csv' / 'data/val-1.csv') used to be assigned here
# and was immediately overwritten, so that dead assignment has been removed.
train_path = 'hierarchical_data/hierarchical_train-1.csv'
val_path = 'hierarchical_data/hierarchical_val-1.csv'

# Passed as `num_workers` to NarrativesDataModule.
num_workers = 4

In [3]:
# Load both CSV splits into DataFrames; train_data's columns are used next to derive the label names.
train_data = pd.read_csv(train_path)
val_data = pd.read_csv(val_path)

In [4]:
# Every column except the first is treated as a label attribute.
# NOTE(review): assumes column 0 holds the narrative text — confirm against the CSV.
attributes = train_data.columns[1:]

# Section 2: Preparing the dataset

In [5]:
import torch
from transformers import AutoTokenizer
from NarrativesDataset import NarrativesDataset

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Tokenizer checkpoint. Kept identical to config['model_name'] ('distilroberta-base')
# so the tokenizer and the model configured further down refer to the same checkpoint
# (it was 'roberta-base' here while the model used 'distilroberta-base').
model_name = 'distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Stand-alone train/validation datasets; the train one is indexed later for a
# single-sample forward pass through the model.
n_ds_train = NarrativesDataset(data_path = train_path, tokenizer = tokenizer, attributes = attributes)
n_ds_val = NarrativesDataset(data_path = val_path, tokenizer = tokenizer, attributes = attributes)

In [7]:
from DataModule import NarrativesDataModule

In [8]:
# Data module over the same CSV splits. No batch_size is passed here, so the module's own default applies.
# NOTE(review): config['train_size'] is later measured on this instance, while training uses a module
# rebuilt with config['batch_size'] — confirm the two batch sizes are meant to differ.
narratives_data_module = NarrativesDataModule(train_path = train_path, val_path = val_path, attributes = attributes, num_workers = num_workers)

In [9]:
# Run the data module's setup step before requesting dataloaders from it.
narratives_data_module.setup()

In [10]:
# Display the train DataLoader object as a quick check that the module can build one.
narratives_data_module.train_dataloader()

<torch.utils.data.dataloader.DataLoader at 0x1c6c9b2cd90>

# Section 3: Model

In [11]:
from Model import NarrativesClassifier

In [12]:
# Hyper-parameters handed to NarrativesClassifier.
# NOTE(review): how each key is consumed is defined in Model.py, which is not shown here.
config = {
    'model_name': 'distilroberta-base',
    # One output per label attribute.
    'n_labels': len(attributes),
    'batch_size': 128,
    'lr': 1.5e-6,
    # presumably the fraction of steps used for learning-rate warm-up — TODO confirm in Model.py
    'warmup': 0.2, 
    # Number of batches in the current train dataloader (len() of a DataLoader counts batches).
    # NOTE(review): that dataloader was built without batch_size = 128 — confirm this is the intended value.
    'train_size': len(narratives_data_module.train_dataloader()),
    'weight_decay': 0.001,
    'n_epochs': 100
}

In [13]:
# Instantiate the classifier from the config; used for the single-sample forward pass in the next cell.
model = NarrativesClassifier(config = config)

In [14]:
# Smoke test: push one training example through the model on CPU and compare the
# shape of the labels with the shape of the model output.
idx = 0
# Fetch the example once. Previously the dataset was indexed three times through a
# direct __getitem__ call, repeating the per-item work for the same sample.
sample = n_ds_train[idx]
input_ids = sample['input_ids']
attention_mask = sample['attention_mask']
labels = sample['labels']
model.cpu()
# unsqueeze(dim = 0) prepends a batch dimension of size 1 to each tensor.
loss, output = model(input_ids.unsqueeze(dim = 0), attention_mask.unsqueeze(dim = 0), labels.unsqueeze(dim = 0))
print(labels.shape, output.shape, output)

  attributes = torch.FloatTensor(item[self.attributes])


torch.Size([8]) torch.Size([1, 8]) tensor([[-0.2916,  0.1291,  0.2460,  0.6595,  0.2413,  0.1225,  0.2652, -0.3393]],
       grad_fn=<AddmmBackward0>)


# Section 4: Training the model

In [15]:
# Rebuild the data module, this time with the batch size from config, and run setup so it is ready for training.
narratives_data_module = NarrativesDataModule(train_path, val_path, attributes = attributes, batch_size = config['batch_size'], num_workers = num_workers)
narratives_data_module.setup()

In [16]:
# Fresh classifier for training, replacing the instance used in the forward-pass check above.
model = NarrativesClassifier(config = config)

In [17]:
import pytorch_lightning as pl

# This run has fewer training batches per epoch than Lightning's default logging
# interval (log_every_n_steps=50), which triggers a warning and means no training
# metrics are logged within an epoch. Cap the interval at the batches per epoch.
steps_per_epoch = len(narratives_data_module.train_dataloader())

trainer = pl.Trainer(
    max_epochs = config['n_epochs'],
    num_sanity_val_steps = 50,
    log_every_n_steps = max(1, min(50, steps_per_epoch)),
)
# Train the model using the dataloaders provided by the data module.
trainer.fit(model, narratives_data_module)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs






  | Name             | Type              | Params
-------------------------------------------------------
0 | pretrained_model | RobertaModel      | 82.1 M
1 | hidden           | Linear            | 590 K 
2 | classifier       | Linear            | 6.2 K 
3 | loss_func        | BCEWithLogitsLoss | 0     
4 | dropout          | Dropout           | 0     
-------------------------------------------------------
82.7 M    Trainable params
0         Non-trainable params
82.7 M    Total params
330.861   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\patri\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

c:\Users\patri\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.
c:\Users\patri\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\loops\fit_loop.py:293: The number of training batches (13) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 5:  46%|████▌     | 6/13 [02:09<02:31,  0.05it/s, v_num=3, train loss=0.723, validation loss =0.678] 

c:\Users\patri\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [None]:
import pickle

In [None]:
# Serialise the whole Trainer object to disk with pickle.
# NOTE(review): a pickled Trainer is tied to the installed library versions; Lightning's
# trainer.save_checkpoint(...) may be the more robust way to persist a run — confirm.
with open('hierarchical-trainer.pkl', 'wb') as f:
    pickle.dump(trainer, f)

In [None]:
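# Load a previously pickled trainer (uncomment to use).
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# with open('trainers/trainer-1.pkl', 'rb') as f:
#     trainer = pickle.load(f)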

# Section 5: Predict

In [18]:
def classify_narratives(model, dm, trainer = None):
    """Run prediction over a data module and return per-label probabilities.

    Args:
        model: Model passed to ``trainer.predict``.
        dm: Data module passed to ``trainer.predict`` as ``datamodule``.
        trainer: Object exposing ``predict(model, datamodule=...)``. When omitted,
            the notebook-level ``trainer`` variable is used, as before.

    Returns:
        A float32 ``numpy.ndarray`` with one row per predicted sample, holding the
        element-wise sigmoid of the values returned by ``trainer.predict``.

    Raises:
        NameError: If no ``trainer`` is passed and none exists at notebook level.
        ValueError: If ``trainer.predict`` yields no rows to stack.
    """
    if trainer is None:
        # Backward compatible fallback to the global the original relied on implicitly.
        try:
            trainer = globals()['trainer']
        except KeyError:
            raise NameError("name 'trainer' is not defined") from None

    predictions = trainer.predict(model, datamodule = dm)

    # Flatten the per-batch outputs into one list of per-sample rows.
    rows = [torch.as_tensor(p, dtype = torch.float32) for batch in predictions for p in batch]
    if not rows:
        raise ValueError('trainer.predict returned no predictions to stack')

    # Apply one vectorised sigmoid, then convert explicitly so tensors that are
    # not on the CPU or still track gradients are handled as well.
    logits = torch.stack(rows)
    return torch.sigmoid(logits).detach().cpu().numpy()

In [19]:
# Score the data module with the model; the rows come from the module's predict dataloader.
predictions = classify_narratives(model, narratives_data_module)

c:\Users\patri\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'predict_dataloader' to speed up the dataloader worker initialization.


Predicting DataLoader 0: 100%|██████████| 3/3 [00:17<00:00,  0.17it/s]


In [20]:
# Print the probability matrix (numpy abbreviates large arrays with "...").
print(predictions)

[[0.5462803  0.43625283 0.51389575 ... 0.4591307  0.5040209  0.39239407]
 [0.55003816 0.4458562  0.5141394  ... 0.45433712 0.49988213 0.39162004]
 [0.5431149  0.4310108  0.5165093  ... 0.45605245 0.51269054 0.39648765]
 ...
 [0.5416139  0.44052896 0.5199396  ... 0.46050277 0.5233529  0.39219585]
 [0.5396316  0.43925366 0.5143528  ... 0.46230105 0.51104724 0.38736975]
 [0.52887166 0.4351338  0.5208309  ... 0.46387988 0.5167864  0.39024898]]


In [21]:
# Ground-truth labels taken from the data module's validation dataset.
# NOTE(review): this rebinds val_data (previously the DataFrame from pd.read_csv) and assumes the
# prediction rows are in the same order as this dataset — confirm the predict dataloader is
# unshuffled and reads the validation split.
val_data = narratives_data_module.val_dataset.data
true_labels = np.array(val_data[attributes])

In [22]:
from sklearn.metrics import classification_report, accuracy_score

# Per-attribute accuracy: integer ground truth against predictions rounded to the nearest integer.
for i in range(len(attributes)):
    attribute = attributes[i]
    y_true = true_labels[:, i].astype(int)
    y_pred = predictions[:, i].round()
    accuracy = accuracy_score(y_true, y_pred)
    print('%s: %.2f' %(attribute, accuracy*100))

General hassles: 7.69
Inner concerns: 75.00
Financial concerns: 9.94
Time Pressures: 45.83
Environmental Hassles: 29.49
Family Hassles: 94.23
Health Hassles: 20.83
Academic Hassles: 86.54
