# Section 1: Importing the libraries and reading the dataset

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
train_path = 'data/train-1.csv'
val_path = 'data/val-1.csv'
num_workers = 4

In [3]:
train_data = pd.read_csv(train_path)
val_data = pd.read_csv(val_path)

In [4]:
attributes = train_data.columns[1:]

# Section 2: Preparing the dataset

In [5]:
import torch
from transformers import AutoTokenizer
from NarrativesDataset import NarrativesDataset

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
n_ds_train = NarrativesDataset(data_path = train_path, tokenizer = tokenizer, attributes = attributes)
n_ds_val = NarrativesDataset(data_path = val_path, tokenizer = tokenizer, attributes = attributes)

In [7]:
from DataModule import NarrativesDataModule

In [8]:
narratives_data_module = NarrativesDataModule(train_path = train_path, val_path = val_path, attributes = attributes, num_workers = num_workers)

In [9]:
narratives_data_module.setup()

In [10]:
narratives_data_module.train_dataloader()

<torch.utils.data.dataloader.DataLoader at 0x177d3979f50>

# Section 3: Model

In [11]:
from Model import NarrativesClassifier

In [12]:
config = {
    'model_name': 'distilroberta-base',
    'n_labels': len(attributes),
    'batch_size': 128,
    'lr': 1.5e-6,
    'warmup': 0.2, 
    'train_size': len(narratives_data_module.train_dataloader()),
    'weight_decay': 0.001,
    'n_epochs': 100
}

In [13]:
model = NarrativesClassifier(config = config)

In [14]:
idx = 0
input_ids = n_ds_train.__getitem__(idx)['input_ids']
attention_mask = n_ds_train.__getitem__(idx)['attention_mask']
labels = n_ds_train.__getitem__(idx)['labels']
model.cpu()
loss, output = model(input_ids.unsqueeze(dim = 0), attention_mask.unsqueeze(dim = 0), labels.unsqueeze(dim = 0))
print(labels.shape, output.shape, output)

  attributes = torch.FloatTensor(item[self.attributes])


torch.Size([52]) torch.Size([1, 52]) tensor([[ 0.0362, -0.6145,  0.4954, -0.1430, -0.1388, -0.2135, -0.4763,  0.1844,
         -0.1920, -0.1797, -0.1824,  0.3460, -0.0841, -0.0117,  0.3120, -0.0570,
          0.1233,  0.2109, -0.3898, -0.1431,  0.4822, -0.0823, -0.0632,  0.4562,
          0.1804,  0.3494,  0.6205, -0.2177,  0.2459,  0.2278, -0.2668, -0.4082,
          0.1012,  0.4557, -0.3533,  0.0394, -0.3531, -0.2147,  0.3429, -0.2586,
          0.0225, -0.1630,  0.1727,  0.3355,  0.2939, -0.4444,  0.1970,  0.1957,
         -0.1840,  0.7461,  0.5237,  0.4791]], grad_fn=<AddmmBackward0>)


# Section 4: Training the model

In [15]:
narratives_data_module = NarrativesDataModule(train_path, val_path, attributes = attributes, batch_size = config['batch_size'], num_workers = num_workers)
narratives_data_module.setup()

In [16]:
model = NarrativesClassifier(config = config)

In [17]:
import pytorch_lightning as pl

# trainer = pl.Trainer(max_epochs = config['n_epochs'], num_sanity_val_steps = 50)
# trainer.fit(model, narratives_data_module)

In [18]:
import pickle

In [19]:
# with open('trainer.pkl', 'wb') as f:
#     pickle.dump(trainer, f)

In [26]:
#load trainer file
with open('trainers/trainer-1.pkl', 'rb') as f:
    trainer = pickle.load(f)

# Section 5: Predict

In [27]:
def classify_narratives(model, dm):
    predictions = trainer.predict(model, datamodule = dm)
    flattened_predictions = np.stack([torch.sigmoid(torch.Tensor(p)) for batch in predictions for p in batch])
    return flattened_predictions

In [28]:
predictions = classify_narratives(model, narratives_data_module)




c:\Users\patri\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'predict_dataloader' to speed up the dataloader worker initialization.


Predicting DataLoader 0: 100%|██████████| 3/3 [00:18<00:00,  0.16it/s]


In [29]:
print(predictions)

[[0.4982392  0.61150753 0.44828844 ... 0.46487066 0.4986256  0.4870718 ]
 [0.5031765  0.6037748  0.44484225 ... 0.46609148 0.50602067 0.4847694 ]
 [0.5080765  0.61372787 0.44679433 ... 0.4580413  0.5104502  0.49416658]
 ...
 [0.50784826 0.61677676 0.44505563 ... 0.4656463  0.5187624  0.5063217 ]
 [0.5062059  0.61167884 0.44529712 ... 0.45994824 0.50693816 0.49180958]
 [0.51076835 0.6154215  0.4462734  ... 0.4658167  0.516833   0.4850943 ]]


In [30]:
val_data = narratives_data_module.val_dataset.data
true_labels = np.array(val_data[attributes])

In [31]:
from sklearn.metrics import classification_report, accuracy_score

for i, attribute in enumerate(attributes):
    accuracy = accuracy_score(true_labels[:,i].astype(int), predictions[:,i].round())
    print('%s: %.2f' %(attribute, accuracy*100))

Misplacing or losing things: 6.73
Silly practical mistakes: 1.92
Trouble with pets: 98.08
Difficulties with friends: 98.08
Regrets over past decision/s: 98.08
Concerned about the meaning of life: 1.92
Being lonely: 1.92
Inability to express oneself: 65.38
Fear of rejection: 98.08
Trouble making decisions: 1.92
Physical appearance: 98.08
Not seeing people: 96.47
Troubling thoughts about one’s future: 1.92
Not enough personal energy: 1.92
Concerns about getting ahead: 98.08
Fear of confrontation: 98.08
Wasting time: 98.08
Not enough money for basic necessities (food, clothing, transportation, housing, healthcare etc.): 33.97
Not enough money for wants (entertainment and recreation): 95.83
Concerns about owing money: 98.08
Concerns about money for emergencies: 1.92
Financial security: 98.08
Not enough time to do things one needs to: 98.08
Too many responsibilities: 84.62
Not getting enough rest: 98.08
Too many interruptions: 98.08
Not enough time for entertainment and recreation: 5.13
Too