# Section 1: Import Libraries and Reading the dataset

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import pytorch_lightning as pl


from transformers import AutoTokenizer
from sklearn.metrics import classification_report, accuracy_score

from NarrativesDataset import NarrativesDataset
from Model import NarrativesClassifier
from DataModule import NarrativesDataModule

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Number of worker processes passed to NarrativesDataModule.
num_workers = 4

# Location of each split. A CSV path is assembled as
# <path> + <category name> + <extension>, e.g.
# 'hierarchical_data/train_pt2/Time_Pressures_train.csv'.
train_path, train_extension = 'hierarchical_data/train_pt2/', '_train.csv'
val_path, val_extension = 'hierarchical_data/val_pt2/', '_val.csv'

In [3]:
# train_data = {}
# val_data = {}

In [4]:
# for filename in filenames:
#     train_file = train_path + filename + train_extension
#     val_file = val_path + filename + val_extension
    
#     train_data[filename] = pd.read_csv(train_file)
#     val_data[filename] = pd.read_csv(val_file)

# Section 2: Preparing the Dataset

In [5]:
def read_dataset(filename):
    """Load the train and validation CSVs for one category.

    The file paths are built from the module-level ``train_path`` /
    ``val_path`` prefixes and ``train_extension`` / ``val_extension``
    suffixes around ``filename``.

    Args:
        filename: Category name used as the stem of both CSV files.

    Returns:
        A 4-tuple ``(train_data, val_data, train_file, val_file)``: the two
        DataFrames followed by the paths they were read from.
    """
    train_file = train_path + filename + train_extension
    val_file = val_path + filename + val_extension
    return pd.read_csv(train_file), pd.read_csv(val_file), train_file, val_file

In [6]:
def prepare_dataset(train_path, val_path, attributes):
  """Build the datasets and the data module for one category.

  Args:
    train_path: Path of the training CSV.
    val_path: Path of the validation CSV.
    attributes: Label column names forwarded to the datasets and data module.

  Returns:
    A 3-tuple ``(narratives_data_module, n_ds_train, n_ds_val)``.
  """
  roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-base')

  # Training dataset is constructed first, validation second.
  train_dataset, val_dataset = (
      NarrativesDataset(data_path = csv_path, tokenizer = roberta_tokenizer, attributes = attributes)
      for csv_path in (train_path, val_path)
  )

  data_module = NarrativesDataModule(
      train_path = train_path,
      val_path = val_path,
      attributes = attributes,
      num_workers = num_workers,
  )
  data_module.setup()
  # The loader returned here is not kept; the call is retained as in the
  # original in case NarrativesDataModule relies on it -- TODO confirm.
  data_module.train_dataloader()

  return data_module, train_dataset, val_dataset


# Section 3: Model

In [7]:
def initialize_model_config(attributes, narratives_data_module):
  """Assemble the hyperparameter dictionary handed to NarrativesClassifier.

  Args:
    attributes: Label names; their count becomes ``n_labels``.
    narratives_data_module: Data module whose ``train_dataloader()`` length
      is stored as ``train_size``.

  Returns:
    dict with keys ``model_name``, ``n_labels``, ``batch_size``, ``lr``,
    ``warmup``, ``train_size``, ``weight_decay`` and ``n_epochs``.
  """
  label_count = len(attributes)
  # NOTE(review): this is the length of the loader (its number of batches),
  # taken from whatever data module the caller passes in. If that module was
  # built with a batch size other than the 128 below, the two values will not
  # correspond -- confirm how NarrativesClassifier uses 'train_size'.
  loader_length = len(narratives_data_module.train_dataloader())

  return dict(
      model_name = 'distilroberta-base',
      n_labels = label_count,
      batch_size = 128,
      lr = 1.5e-6,
      warmup = 0.2,
      train_size = loader_length,
      weight_decay = 0.001,
      n_epochs = 100,
  )

In [8]:
def initialize_model(attributes, narratives_data_module, n_ds_train):
  """Create a fresh NarrativesClassifier and smoke-test it on one sample.

  The first item of ``n_ds_train`` is pushed through the model on the CPU and
  the label shape, output shape and output are printed, so that shape
  mismatches surface before training starts.

  Args:
    attributes: Label names, used to build the model config.
    narratives_data_module: Data module, used to build the model config.
    n_ds_train: Training dataset; item 0 must provide ``'input_ids'``,
      ``'attention_mask'`` and ``'labels'`` entries.

  Returns:
    The newly constructed (untrained) NarrativesClassifier.
  """
  config = initialize_model_config(attributes, narratives_data_module)
  model = NarrativesClassifier(config = config)

  # Fetch the sample once instead of re-running __getitem__ for every field.
  sample = n_ds_train[0]
  input_ids = sample['input_ids']
  attention_mask = sample['attention_mask']
  labels = sample['labels']

  model.cpu()
  # This forward pass is only a sanity check, so no autograd graph is needed.
  with torch.no_grad():
    # unsqueeze adds the leading batch dimension for the single sample.
    loss, output = model(input_ids.unsqueeze(dim = 0), attention_mask.unsqueeze(dim = 0), labels.unsqueeze(dim = 0))
  print(labels.shape, output.shape, output)
  return model

# Section 4: Model Training

In [9]:
def train_model(attributes, config, train_file, val_file, model = None):
    """Fit a NarrativesClassifier on one category and return the trainer.

    Args:
        attributes: Label column names passed to the data module.
        config: Hyperparameter dict; ``'batch_size'`` and ``'n_epochs'`` are
            read here, and the whole dict is given to NarrativesClassifier
            when a model has to be created.
        train_file: Path of the training CSV.
        val_file: Path of the validation CSV.
        model: Optional, already constructed model to train. When omitted
            (the default) a new ``NarrativesClassifier(config = config)`` is
            created, which is the original behaviour.

    Returns:
        The ``pl.Trainer`` after ``fit`` has completed. Only the trainer is
        returned; when ``model`` is omitted the fitted module is reachable
        through the trainer (``trainer.lightning_module``), not through any
        other instance the caller may hold.
    """
    narratives_data_module = NarrativesDataModule(train_file, val_file, attributes = attributes, batch_size = config['batch_size'], num_workers = num_workers)
    narratives_data_module.setup()
    if model is None:
        model = NarrativesClassifier(config = config)
    trainer = pl.Trainer(max_epochs = config['n_epochs'], num_sanity_val_steps = 50)
    trainer.fit(model, narratives_data_module)
    return trainer

# Section 5: Model Prediction

In [10]:
def classify_narratives(model, trainer, dm):
    """Run prediction and return per-sample sigmoid scores as one array.

    Args:
        model: Model passed to ``trainer.predict``.
        trainer: Object exposing ``predict(model, datamodule=...)``.
        dm: Data module passed to ``trainer.predict``.

    Returns:
        NumPy array obtained by stacking the sigmoid of every element of
        every predicted batch, in prediction order.
    """
    batches = trainer.predict(model, datamodule = dm)

    scores = []
    for batch in batches:
        for row in batch:
            # Sigmoid is applied to each element of each batch.
            scores.append(torch.sigmoid(torch.Tensor(row)))
    return np.stack(scores)

def make_predictions(model, trainer, narratives_data_module, attributes):
    """Predict with the model and print a classification report per attribute.

    Args:
        model: Model to predict with.
        trainer: Trainer used to run prediction.
        narratives_data_module: Data module; its ``val_dataset.data`` supplies
            the true labels.
        attributes: Label column names; one report is printed for each.

    Returns:
        The array of prediction scores produced by ``classify_narratives``.
    """
    predictions = classify_narratives(model, trainer, narratives_data_module)
    true_labels = np.array(narratives_data_module.val_dataset.data[attributes])

    for column, attribute in enumerate(attributes):
        expected = true_labels[:, column].astype(int)
        # Scores are rounded to the nearest integer to obtain hard labels.
        predicted = predictions[:, column].round()
        report = classification_report(expected, predicted)
        print('Classification Report for %s:\n%s' % (attribute, report))

    return predictions

In [None]:
def show_predictions(predictions, attributes, val_data):
    """Build a table comparing true labels with thresholded predictions.

    Args:
        predictions: 2-D array of scores, one row per row of ``val_data`` and
            one column per attribute.
        attributes: Label column names present in ``val_data``.
        val_data: DataFrame containing a ``'Narrative'`` column plus the
            attribute columns.

    Returns:
        DataFrame indexed like ``val_data`` with the columns ``'Narrative'``,
        ``True_<attr>`` for every attribute and ``Pred_<attr>`` for every
        attribute, in that order.
    """
    attributes = list(attributes)
    true_labels = np.array(val_data[attributes])

    # A score counts as positive only when strictly above 0.5.
    binary_predictions = np.where(predictions > 0.5, 1, 0)

    columns = [f'True_{attr}' for attr in attributes] + [f'Pred_{attr}' for attr in attributes]
    # Reuse val_data's index: the concat below aligns on index, so a fresh
    # RangeIndex would misalign rows (and introduce NaNs) whenever val_data
    # is not indexed 0..n-1.
    comparison_data_df = pd.DataFrame(
        np.concatenate((true_labels, binary_predictions), axis=1),
        columns=columns,
        index=val_data.index,
    )

    return pd.concat([val_data[['Narrative']], comparison_data_df], axis=1)

# Section 6: Looping through the dataset

In [11]:
all_filenames = ['General_hassles', 'Inner_concerns', 'Financial_concerns', 'Time_Pressures', 'Environmental_Hassles', 'Family_Hassles', 'Health_Hassles', 'Academic_Hassles']
# Only the categories from 'Time_Pressures' onward are processed; the first
# three entries are skipped. Use `all_filenames` directly to run every category.
filenames = all_filenames[3:]
# Filled by the training loop: category name -> comparison DataFrame.
comparison_dfs = dict()

In [12]:
import joblib
# For every category: load the data, train a classifier, save it, evaluate it
# on the validation split, and keep the true-vs-predicted comparison table.
for filename in filenames:
  train_data, val_data, train_file, val_file = read_dataset(filename)
  # Every column after the first one is used as a label attribute.
  attributes = train_data.columns[1:]
  narratives_data_module, n_ds_train, n_ds_val = prepare_dataset(train_file, val_file, attributes)
  config = initialize_model_config(attributes, narratives_data_module)
  # Smoke test only: builds a throwaway model and prints the output shape for
  # one sample. This instance is NOT the one that gets trained below.
  initialize_model(attributes, narratives_data_module, n_ds_train)
  trainer = train_model(attributes, config, train_file, val_file)
  # train_model fits its own NarrativesClassifier and returns only the
  # trainer, so the fitted weights must be taken from the trainer. Saving or
  # predicting with the smoke-test instance would use an untrained model.
  model = trainer.lightning_module
  model_filename = f"{filename}_model.pkl"
  torch.save(model, '/content/drive/Shareddrives/THSExperiment/models/' + model_filename)
  predictions = make_predictions(model, trainer, narratives_data_module, attributes)
  comparison_df = show_predictions(predictions, attributes, val_data)
  comparison_dfs[filename] = comparison_df