In [1]:
import pandas as pd
import json
from datasets import Dataset, load_metric
from transformers import AutoTokenizer, BartForConditionalGeneration

In [3]:
df_train = pd.read_csv('dialogue.csv')
df_train['id'] = df_train['id'].astype(str)

# dataset = []
with open('labels.json','r') as f:
    labels = json.load(f)

tokenizer = AutoTokenizer.from_pretrained("philschmid/bart-large-cnn-samsum")

model = BartForConditionalGeneration.from_pretrained("philschmid/bart-large-cnn-samsum")

end = tokenizer.special_tokens_map['eos_token']
idx = 0
dataset = []
for i,item in enumerate(labels):
    temp = {}
    dialogues = df_train[df_train.id == item['id']][['speaker','text']].values
    dialogues = [': '.join(x) for x in dialogues]
    dialogue_str = '\n'.join(dialogues)
    temp['conversation'] = dialogue_str + end
    temp['action_item'] = 'Action Item:\n'+item['label'] + end 
    if len(tokenizer.encode(dialogue_str)) > 512:
        continue
    else:
        dataset.append(temp)
    if len(tokenizer.encode(temp['action_item'])) > 128:    
        print(len(tokenizer.encode(item['label'])))
        idx += 1
    if i  <= 2:
        print(temp)
print(idx)

{'conversation': "Zachary: Good morning everyone, let's start planning for our new construction project.\nHannah: We have received the blueprints for the project and have identified the materials needed.\nCaleb: I have contacted the suppliers and have received quotes for the materials.\nZachary: Great, let's discuss the timeline for the project.\nHannah: Based on the blueprints, we estimate the project will take 6 months to complete.\nCaleb: I have created a project plan with milestones and deadlines.\nZachary: Let's review the project plan and make any necessary adjustments.\nHannah: We should also consider any potential delays or issues that may arise during the project.\nCaleb: Agreed, we should have contingency plans in place.\nZachary: Let's also discuss the budget for the project.\nHannah: Based on the materials and labor costs, we estimate the project will cost $2 million.\nCaleb: I have created a budget plan and will ensure we stay within budget throughout the project.\nZachary

Token indices sequence length is longer than the specified maximum sequence length for this model (1336 > 1024). Running this sequence through the model will result in indexing errors


177
1


In [4]:
    
dataset = Dataset.from_list(dataset)
dataset_dict = dataset.train_test_split(test_size=0.2,seed = 2) 
# dataset_dict['test'].to_csv('test.csv') # Saving the data externally for testing on other dataxs

def preprocess_function(examples):
    model_inputs = tokenizer(examples['conversation'],max_length=512, padding="max_length")

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["action_item"], max_length=256, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

tokenized_datasets = tokenized_datasets.remove_columns(['conversation','action_item'])

tokenized_datasets

Map:   0%|          | 0/165 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 165
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 42
    })
})

In [5]:
from torch.utils.data import DataLoader
tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets['test'], batch_size=8)

In [6]:
from transformers import get_scheduler
from torch.optim import AdamW
import torch
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device);

In [7]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        torch.cuda.empty_cache()
    print(total_loss/len(train_dataloader))

  0%|          | 0/210 [00:00<?, ?it/s]

4.542540845416841
0.39337518953141715
0.159246563911438
0.09750879130193166
0.07148587615007446
0.054901179103624256
0.0455659870945272
0.0390991759264753
0.036155523554909794
0.03193759714208898


In [21]:
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        summary_ids = model.generate(batch["input_ids"])
        action_items = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

        break

In [29]:
print(dataset_dict['test'][6]['conversation'])
print(dataset_dict['test'][6]['action_item'])
print(action_items[6])



Jenna: Good morning everyone, let's start our meeting on drug safety concerns.
Benjamin: I have received some reports of adverse reactions to our new drug from a few patients.
Andrew: I have also received similar reports from some doctors who have prescribed the drug.
Nicholas: We need to investigate these reports and determine if there is a safety issue with the drug.
Jenna: Agreed, we need to take this seriously and investigate thoroughly.
Benjamin: I have already started gathering data on the reported cases and will share it with the team.
Andrew: We should also review the clinical trial data to see if there were any indications of safety concerns.
Nicholas: I will review the manufacturing process to see if there were any issues that could have led to the reported adverse reactions.
Jenna: Great, let's all work together to get to the bottom of this and ensure the safety of our patients.
Benjamin: I will also reach out to the patients who reported adverse reactions to get more inform

In [31]:
model.save_pretrained('bart-finetuned-action-items')
tokenizer.save_pretrained('bart-finetuned-action-items')

('bart-finetuned-action-items/tokenizer_config.json',
 'bart-finetuned-action-items/special_tokens_map.json',
 'bart-finetuned-action-items/vocab.json',
 'bart-finetuned-action-items/merges.txt',
 'bart-finetuned-action-items/added_tokens.json',
 'bart-finetuned-action-items/tokenizer.json')