# Section 1: Importing the libraries and reading the dataset

In [32]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [33]:
# CSV files for each dataset variant, keyed by name -> (train CSV, validation CSV).
DATASET_PATHS = {
    'default': ('data/train-1.csv', 'data/val-1.csv'),
    'hierarchical': (
        'hierarchical_data/hierarchical_train-1.csv',
        'hierarchical_data/hierarchical_val-1.csv',
    ),
}

# Choose the variant explicitly. Both pairs used to be assigned one after the
# other, so the first pair was silently overwritten and only the hierarchical
# files were ever used; that effective choice is kept here.
DATASET_VARIANT = 'hierarchical'
train_path, val_path = DATASET_PATHS[DATASET_VARIANT]

# Forwarded to NarrativesDataModule as `num_workers`.
num_workers = 4

In [34]:
# Read the training and validation CSVs into DataFrames (train first, then val).
train_data, val_data = (pd.read_csv(csv_path) for csv_path in (train_path, val_path))

In [35]:
# Every column after the first one is used as a label/attribute name.
# NOTE(review): presumably column 0 holds the narrative text — confirm against the CSV.
first_label_column = 1
attributes = train_data.columns[first_label_column:]

# Section 2: Preparing the dataset

In [36]:
import torch
from transformers import AutoTokenizer
from NarrativesDataset import NarrativesDataset

In [37]:
# Tokenizer checkpoint shared by both datasets.
# NOTE(review): the tokenizer is loaded from 'roberta-base' while the model config
# further down names 'distilroberta-base' — confirm the two are meant to differ.
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the training dataset first, then the validation dataset, with identical settings.
n_ds_train, n_ds_val = (
    NarrativesDataset(data_path=csv_path, tokenizer=tokenizer, attributes=attributes)
    for csv_path in (train_path, val_path)
)

In [38]:
from DataModule import NarrativesDataModule

In [39]:
# Data module built without an explicit batch_size (the module's own default applies).
narratives_data_module = NarrativesDataModule(
    train_path=train_path,
    val_path=val_path,
    attributes=attributes,
    num_workers=num_workers,
)

In [40]:
# Prepare the data module so its dataloaders can be requested below.
narratives_data_module.setup()

In [41]:
# Sanity check: the bare expression displays the training DataLoader's repr.
narratives_data_module.train_dataloader()

<torch.utils.data.dataloader.DataLoader at 0x177a4303250>

# Section 3: Model

In [42]:
from Model import NarrativesClassifier

In [43]:
# Hyperparameters handed to NarrativesClassifier (and reused for batch size / epochs below).
# NOTE(review): 'train_size' is the length of the train dataloader of a data module
# that was constructed without batch_size, whereas training later rebuilds the data
# module with config['batch_size'] — confirm which value the model expects.
config = dict(
    model_name='distilroberta-base',
    n_labels=len(attributes),
    batch_size=128,
    lr=1.5e-6,
    warmup=0.2,
    train_size=len(narratives_data_module.train_dataloader()),
    weight_decay=0.001,
    n_epochs=100,
)

In [44]:
# Instantiate the classifier from the hyperparameter dict; used for the smoke test below.
model = NarrativesClassifier(config = config)

In [45]:
# Smoke test: push one training example through the freshly constructed model on
# CPU and compare the label shape with the output shape.
idx = 0

# Index the dataset once and reuse the item. The original called
# n_ds_train.__getitem__(idx) three times, rebuilding the same item for each field.
sample = n_ds_train[idx]
input_ids = sample['input_ids']
attention_mask = sample['attention_mask']
labels = sample['labels']

model.cpu()

# unsqueeze(dim=0) adds a leading axis of size 1 so the single example is passed as a batch.
loss, output = model(
    input_ids.unsqueeze(dim = 0),
    attention_mask.unsqueeze(dim = 0),
    labels.unsqueeze(dim = 0),
)
print(labels.shape, output.shape, output)

torch.Size([8]) torch.Size([1, 8]) tensor([[ 0.5596,  0.3617,  0.0534, -0.4098,  0.5268,  0.4683, -0.0727, -0.5751]],
       grad_fn=<AddmmBackward0>)


  attributes = torch.FloatTensor(item[self.attributes])


# Section 4: Training the model

In [46]:
# Rebuild the data module for training, this time with the configured batch size.
narratives_data_module = NarrativesDataModule(
    train_path,
    val_path,
    attributes=attributes,
    batch_size=config['batch_size'],
    num_workers=num_workers,
)
narratives_data_module.setup()

In [47]:
# Create a fresh classifier for training, replacing the instance used in the smoke test.
model = NarrativesClassifier(config = config)

KeyboardInterrupt: 

In [None]:
import pytorch_lightning as pl

# Train for config['n_epochs'] epochs, with up to 50 sanity-check validation
# batches requested before training starts.
trainer = pl.Trainer(
    max_epochs=config['n_epochs'],
    num_sanity_val_steps=50,
)
trainer.fit(model, narratives_data_module)

In [None]:
import pickle

In [None]:
# Persist the fitted trainer object to disk with pickle.
# NOTE(review): a pickled trainer is tied to the installed library versions and must
# only be unpickled from a trusted source; consider a Lightning checkpoint instead.
with open('hierarchical-trainer.pkl', mode='wb') as trainer_file:
    pickle.dump(trainer, trainer_file)

In [None]:
# Load a previously pickled trainer instead of retraining (uncomment to use):
# with open('trainers/trainer-1.pkl', 'rb') as f:
#     trainer = pickle.load(f)

# Section 5: Predict

In [None]:
def classify_narratives(model, dm, trainer=None):
    """Run prediction and return sigmoid scores as a single NumPy array.

    Args:
        model: Model passed to ``trainer.predict``.
        dm: Data module passed to ``trainer.predict`` as ``datamodule``.
        trainer: Object exposing ``predict(model, datamodule=...)``. When omitted,
            the notebook-level ``trainer`` variable is used, which is what this
            function previously relied on implicitly.

    Returns:
        numpy.ndarray: the items obtained by iterating over every predicted batch,
        each passed through a sigmoid and stacked along a new first axis.

    Raises:
        NameError: If ``trainer`` is not given and no notebook-level ``trainer`` exists.
    """
    if trainer is None:
        # Backward-compatible fallback to the hidden global the original depended on.
        try:
            trainer = globals()['trainer']
        except KeyError:
            raise NameError(
                "name 'trainer' is not defined; pass trainer=... or run the training cell first"
            ) from None

    predictions = trainer.predict(model, datamodule = dm)
    # Flatten the list of batches into one list of per-item tensors, then stack them.
    flattened_predictions = np.stack([torch.sigmoid(torch.Tensor(p)) for batch in predictions for p in batch])
    return flattened_predictions

In [None]:
# Score the data served by the data module with the trained model.
predictions = classify_narratives(model, narratives_data_module)




c:\Users\patri\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'predict_dataloader' to speed up the dataloader worker initialization.


Predicting DataLoader 0: 100%|██████████| 3/3 [00:18<00:00,  0.16it/s]


In [None]:
# Inspect the score matrix; NumPy abbreviates large arrays with "..." when printing.
print(predictions)

[[0.4982392  0.61150753 0.44828844 ... 0.46487066 0.4986256  0.4870718 ]
 [0.5031765  0.6037748  0.44484225 ... 0.46609148 0.50602067 0.4847694 ]
 [0.5080765  0.61372787 0.44679433 ... 0.4580413  0.5104502  0.49416658]
 ...
 [0.50784826 0.61677676 0.44505563 ... 0.4656463  0.5187624  0.5063217 ]
 [0.5062059  0.61167884 0.44529712 ... 0.45994824 0.50693816 0.49180958]
 [0.51076835 0.6154215  0.4462734  ... 0.4658167  0.516833   0.4850943 ]]


In [None]:
# Take the table held by the data module's validation dataset.
# NOTE(review): this rebinds `val_data`, which an earlier cell set from
# pd.read_csv(val_path) — presumably the same table; confirm.
val_data = narratives_data_module.val_dataset.data
# Label matrix with columns in `attributes` order.
# NOTE(review): assumes the rows line up with the rows of `predictions`, i.e. that
# prediction iterated this same dataset in the same order — TODO confirm.
true_labels = np.array(val_data[attributes])

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Per-attribute accuracy: scores are rounded to 0/1 and compared with the integer labels.
for column, attribute in enumerate(attributes):
    expected = true_labels[:, column].astype(int)
    predicted = predictions[:, column].round()
    accuracy = accuracy_score(expected, predicted)
    print(f'{attribute}: {accuracy * 100:.2f}')

Misplacing or losing things: 6.73
Silly practical mistakes: 1.92
Trouble with pets: 98.08
Difficulties with friends: 98.08
Regrets over past decision/s: 98.08
Concerned about the meaning of life: 1.92
Being lonely: 1.92
Inability to express oneself: 65.38
Fear of rejection: 98.08
Trouble making decisions: 1.92
Physical appearance: 98.08
Not seeing people: 96.47
Troubling thoughts about one’s future: 1.92
Not enough personal energy: 1.92
Concerns about getting ahead: 98.08
Fear of confrontation: 98.08
Wasting time: 98.08
Not enough money for basic necessities (food, clothing, transportation, housing, healthcare etc.): 33.97
Not enough money for wants (entertainment and recreation): 95.83
Concerns about owing money: 98.08
Concerns about money for emergencies: 1.92
Financial security: 98.08
Not enough time to do things one needs to: 98.08
Too many responsibilities: 84.62
Not getting enough rest: 98.08
Too many interruptions: 98.08
Not enough time for entertainment and recreation: 5.13
Too