In [1]:
import pandas as pd
import json
from datasets import Dataset
from transformers import AutoTokenizer, BartForConditionalGeneration, T5ForConditionalGeneration, BloomForCausalLM

In [2]:
from yaml.loader import SafeLoader
import yaml
from fuzzywuzzy import fuzz
import sys
sys.path.append('../')
from utils import *
with open('../env.yml','r') as f:
    data = yaml.load(f, Loader=SafeLoader)
openai.organization = data["OPEN_API_ORG"]
openai.api_key = data["OPENAI_API_KEY"]

In [3]:

def evaluate(action_true,action_pred):
    
    if not action_true and not action_pred:
        return (1,0,0,0)
    elif action_true and not action_pred:
        return (0,0,1,0)
    elif not action_true and action_pred:
        return (0,0,0,1)
    else:
    
        embed_1 = [get_embedding(item['text']) for item in action_true]
        embed_2 = [get_embedding(item['text']) for item in action_pred]

        scores = cos_sim(embed_1,embed_2)
        top_idx = torch.argmax(scores,dim=1)
        exact_match = 0
        wrong_assignee = 0
        not_found = 0
        extra_generated = len(action_pred) - len(action_true) if len(action_pred) > len(action_true) else 0
        for i,idx in enumerate(top_idx):
            if scores[i][idx] > 0.85:
                if fuzz.partial_ratio(action_true[i]['assignee'],action_pred[idx]['assignee']) >= 100:
                    exact_match += 1
                else:
                    wrong_assignee += 1
            else:
                not_found += 1
        metrics = [exact_match,wrong_assignee,not_found]
        metrics = [x/len(action_true) for x in metrics] + [extra_generated]
        return tuple(metrics)

In [4]:
df_train = pd.read_csv('../data/datasets/dialogue.csv')
df_train['id'] = df_train['id'].astype(str)


with open('../data/datasets/labels.json','r') as f:
    labels = json.load(f)

model_name = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(model_name)

if 't5' in model_name:
    model = T5ForConditionalGeneration.from_pretrained(model_name)
elif 'bloom' in model_name:
    model = BloomForCausalLM.from_pretrained(model_name)
else:
    model = BartForConditionalGeneration.from_pretrained(model_name)

tokenizer.add_tokens(['[SEP]'])
model.resize_token_embeddings(len(tokenizer))

Downloading tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/13.8M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Embedding(250681, 1024)

In [5]:
# end = tokenizer.special_tokens_map['eos_token']
idx = 0
dataset = []
dialogue_len = 0
max_len = 0
for i,item in enumerate(labels):
    temp = {}
    dialogues = df_train[df_train.id == item['id']][['speaker','text']].values
    dialogues = [': '.join(x) for x in dialogues]
    dialogue_str = '\n'.join(dialogues)
    if 't5' in model_name:
        dialogue_str = 'Find Action Items from the following chat:\n' + dialogue_str
    if 'bloom' in model_name:
        res = 'Action Item:\n'+item['label']
        res = res.replace('\n','[SEP]')
        dialogue_str = dialogue_str + '[SEP][SEP]' + res
        temp['conversation'] = dialogue_str
        temp['action_item'] = res
    else:
        temp['conversation'] = dialogue_str
        temp['action_item'] = 'Action Item:\n'+item['label']
        temp['action_item'] = temp['action_item'].replace('\n','[SEP]')
    if len(tokenizer.encode(dialogue_str)) > 512:
        continue
    else:
        dialogue_len += len(tokenizer.encode(dialogue_str))
        if len(tokenizer.encode(dialogue_str.split("[SEP][SEP]")[0])) > max_len:
            max_len = len(tokenizer.encode(dialogue_str))
        dataset.append(temp)
print(dialogue_len/len(dataset))
print(max_len)

302.69117647058823
463


In [6]:
dataset = Dataset.from_list(dataset)
dataset_dict = dataset.train_test_split(test_size=0.2,seed = 2) 
# dataset_dict['test'].to_csv('test.csv') # Saving the data externally for testing
# dataset_dict = dataset_dict["train"].train_test_split(test_size=0.1,seed=2)
def preprocess_function(examples):
    model_inputs = tokenizer(examples['conversation'],max_length=512, padding="max_length")
    
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["action_item"], max_length=256, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    if "bloom" in model_name:
        inp_text = [text.split("[SEP][SEP]")[0] for text in examples['conversation']]
        generation_tokens = tokenizer(inp_text,max_length=463, padding="max_length")
        model_inputs["genrate_input_ids"] = generation_tokens["input_ids"]
    return model_inputs

tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

tokenized_datasets = tokenized_datasets.remove_columns(['conversation','action_item'])

tokenized_datasets

Map:   0%|          | 0/163 [00:00<?, ? examples/s]

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'genrate_input_ids'],
        num_rows: 163
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'genrate_input_ids'],
        num_rows: 41
    })
})

In [7]:
print(dataset_dict['test'][6]['conversation'])

Matthew: Good morning everyone, let's start discussing our company-wide wellness program.
Taylor: I think we should start with a survey to understand what our employees need and want in terms of wellness programs.
Emily: That's a great idea, Taylor. We can use an online survey tool to collect the data.
Matthew: Agreed. Once we have the data, we can prioritize the programs based on the needs of our employees.
Taylor: I think we should also consider offering some mental health resources, like counseling or meditation classes.
Emily: Yes, mental health is just as important as physical health. We can partner with local wellness centers to offer these resources.
Matthew: That's a great idea, Emily. We can also consider offering fitness classes or gym memberships as part of the program.
Taylor: I think we should also have a wellness challenge, like a step challenge or a healthy eating challenge, to encourage participation.
Emily: That's a great idea, Taylor. We can offer prizes for the winne

In [8]:
from torch.utils.data import DataLoader
tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=2)
eval_dataloader = DataLoader(tokenized_datasets['test'], batch_size=2)

In [9]:
from transformers import get_scheduler
from torch.optim import AdamW
import torch
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device);

In [10]:
import wandb
run = wandb.init(
  project="action-item-extractor",
  notes="architecture-comparisson",
)

wandb.config = {
"epochs" : num_epochs,
"train_batch_size" : 2,
"model_architecture" : "bloom-560m",
"pretraining_dataset" : "N/A",   
    }

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))


def process_output(out_ids):
    action_items = tokenizer.batch_decode(out_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    action_items = [x.split('[SEP]')[1:] for x in action_items]
    final_preds = []
    for item in action_items:
        temp =  []
        if item:
            for action in item:
                # print(action)
                try:
                    temp.append({'text':action.split('||')[0],'assignee':action.split('||')[1]})
                except:
                    continue
        final_preds.append(temp)
    return final_preds    


model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_dataloader:
        if "bloom" in model_name:
            batch = {k: v.to(device) for k, v in batch.items() if k in ['input_ids','attention_mask']}
            outputs = model(input_ids=batch["input_ids"],
                            attention_mask = batch["attention_mask"],
                            labels = batch["input_ids"])
        else:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        torch.cuda.empty_cache()
        # break
    with torch.no_grad():
        eval_loss = 0
        exact_match = 0
        wrong_assignee = 0
        not_found = 0
        extra_generated = 0
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            if "bloom" in model_name:
                outputs = model(input_ids=batch["input_ids"],
                                attention_mask = batch["attention_mask"],
                                labels = batch["input_ids"])
            else:
                outputs = model(**batch)
            eval_loss += outputs.loss.item()
            
            true_vals = process_output(batch["labels"])
            if "bloom" in model_name:
                pred_ids = model.generate(batch["genrate_input_ids"],max_length=512)
            else:
                pred_ids = model.generate(batch["input_ids"],max_length=256)
            pred_vals = process_output(pred_ids)
            metrics = [evaluate(true_vals[i],pred_vals[i]) for i in range(batch['input_ids'].shape[0])]
            metrics = [sum(i) for i in zip(*metrics)]
            exact_match += metrics[0]
            wrong_assignee += metrics[1]
            not_found += metrics[2]
            extra_generated += metrics[3]
            # break
            
    print(f'Epoch {epoch}')
    print('Train Loss: ',total_loss/len(dataset_dict['train']))
    print('Eval Loss: ',eval_loss/len(eval_dataloader))
    print('Exact Match: ', exact_match/(len(dataset_dict['test'])))
    print('Wrong Assignee: ', wrong_assignee/(len(dataset_dict['test'])))
    print('Not Found: ', not_found/(len(dataset_dict['test'])))
    print('Extra Generated: ', extra_generated/(len(dataset_dict['test'])))
    log_dict = {
    "train_loss" : total_loss/len(dataset_dict['train']),
    "eval_loss" :  eval_loss/len(eval_dataloader),
    "exact_match" : exact_match/(len(dataset_dict['test'])),
    "wrong_assignee" : wrong_assignee/(len(dataset_dict['test'])),
    "not_found" : not_found/(len(dataset_dict['test'])),
    'extra_generated' : extra_generated/(len(dataset_dict['test']))    
    }
    wandb.log(log_dict)
    print("-"*30)
       
            
        
    

  0%|          | 0/820 [00:00<?, ?it/s]

Epoch 0
Train Loss:  0.9181063932143837
Eval Loss:  0.9229989051818848
Exact Match:  0.24285714285714285
Wrong Assignee:  0.24308943089430896
Not Found:  0.44088269454123113
Extra Generated:  1.2926829268292683
------------------------------
Epoch 1
Train Loss:  0.2805723613756566
Eval Loss:  0.9517552909396944
Exact Match:  0.31329849012775834
Wrong Assignee:  0.24477351916376305
Not Found:  0.3931475029036005
Extra Generated:  0.2926829268292683
------------------------------
Epoch 2
Train Loss:  0.1588135102043854
Eval Loss:  1.0604251367705209
Exact Match:  0.28792102206736353
Wrong Assignee:  0.2576074332171894
Not Found:  0.356910569105691
Extra Generated:  0.8048780487804879
------------------------------
Epoch 3
Train Loss:  0.09773156618230913
Eval Loss:  1.202949773697626
Exact Match:  0.29698025551684093
Wrong Assignee:  0.2786875725900116
Not Found:  0.3511614401858304
Extra Generated:  0.4146341463414634
------------------------------
Epoch 4
Train Loss:  0.058605619277690

In [30]:
torch.cuda.empty_cache()
with torch.no_grad():
        eval_loss = 0
        exact_match = 0
        wrong_assignee = 0
        not_found = 0
        extra_generated = 0
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            # if "bloom" in model_name:
            #     outputs = model(input_ids=batch["input_ids"],
            #                     attention_mask = batch["attention_mask"],
            #                     labels = batch["input_ids"])
            # else:
            #     outputs = model(**batch)
            # eval_loss += outputs.loss.item()
            
            true_vals = process_output(batch["labels"])
            if "bloom" in model_name:
                pred_ids = model.generate(batch["genrate_input_ids"],max_length=550)
            else:
                pred_ids = model.generate(batch["input_ids"],max_length=256)
            pred_vals = process_output(pred_ids)
            metrics = [evaluate(true_vals[i],pred_vals[i]) for i in range(batch['input_ids'].shape[0])]
            metrics = [sum(i) for i in zip(*metrics)]
            exact_match += metrics[0]
            wrong_assignee += metrics[1]
            not_found += metrics[2]
            extra_generated += metrics[3]
        print('Exact Match: ', exact_match/(len(dataset_dict['test'])))
        print('Wrong Assignee: ', wrong_assignee/(len(dataset_dict['test'])))
        print('Not Found: ', not_found/(len(dataset_dict['test'])))
        print('Extra Generated: ', extra_generated/(len(dataset_dict['test'])))

Exact Match:  0.3643437862950058
Wrong Assignee:  0.28583042973286876
Not Found:  0.27665505226480835
Extra Generated:  2.902439024390244


In [28]:
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        summary_ids = model.generate(batch["genrate_input_ids"],max_length=768)
        action_items = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

        break

In [22]:
len(action_items)

2

In [29]:
print(dataset_dict['test'][0]['conversation'].split('[SEP][SEP]')[0])
# print(dataset_dict['test'][6]['action_item'])
print(action_items[0].replace('[SEP]','\n'))



Lauren: Good morning everyone, let's start with our market segmentation analysis meeting.
Dylan: I have been researching the demographics of our target audience and have found that our product appeals mostly to young adults aged 18-35.
Stephanie: I have been analyzing the psychographics of our target audience and have found that they value convenience and affordability.
Matthew: I have been looking at the geographic distribution of our target audience and have found that they are mostly located in urban areas.
Lauren: Great work everyone. Based on this information, we can create targeted marketing campaigns that focus on convenience and affordability for young adults in urban areas.
Dylan: I also found that our target audience is mostly tech-savvy and spends a lot of time on social media.
Stephanie: That's a great point, Dylan. We can use social media platforms to reach our target audience and promote our product.
Matthew: I have also found that our target audience is interested in eco

In [31]:
model.save_pretrained('../artifacts/bloom-560m-action-items')
tokenizer.save_pretrained('../artifacts/bloom-560m-action-items')

('../artifacts/bloom-560m-action-items/tokenizer_config.json',
 '../artifacts/bloom-560m-action-items/special_tokens_map.json',
 '../artifacts/bloom-560m-action-items/tokenizer.json')

In [None]:
huggingface-cli repo create bloom-560m-action-items