In [10]:
!git clone https://"username":"password"@github.com/Dia-Bete/BeteQA.git

Cloning into 'BeteQA'...
remote: Enumerating objects: 300, done.[K
remote: Counting objects: 100% (300/300), done.[K
remote: Compressing objects: 100% (165/165), done.[K
remote: Total 300 (delta 170), reused 250 (delta 129), pack-reused 0[K
Receiving objects: 100% (300/300), 1.87 MiB | 1.33 MiB/s, done.
Resolving deltas: 100% (170/170), done.


In [1]:
import numpy as np
from sklearn.metrics import average_precision_score
import os
os.chdir('BeteQA')

import nltk
nltk.download('stopwords')
nltk.download('punkt')

# !pip3 install transformers
# !pip3 install tensorflow_text
# !wget https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3?tf-hub-format=compressed
# !mv /content/BeteQA/3?tf-hub-format=compressed muse.tar.gz
# !mkdir muse
# !tar -xvf muse.tar.gz -C muse

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tcastrof/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/tcastrof/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import csv

testset = {}
with open('QA/testset.csv') as f:
  reader = csv.reader(f, delimiter=',', quotechar='\"')
  for row in list(reader)[1:]:
    q1, q2, answer, label, _ = row
    if q1.strip() not in testset:
      testset[q1] = []
    testset[q1].append({
        'question': q2.strip(),
        'answer': answer,
        'label': label
    })

corpora_path = [
        'QA/adalberto/qa_pt.json',
        'QA/diabetes_action/qa_pt.json',
        'QA/diabetesbr/qa_pt.json',
        'QA/eatingwell/qa_pt.json',
        'QA/medicine/qa_pt.json'
    ]

## BM25

In [3]:
from models.bm25 import BM25QA

model = BM25QA(corpora_path)

bm25 = []
MAP = []
for q1 in testset:
    y_real, y_pred = [], []
    size = len(testset[q1])
    for i, row in enumerate(testset[q1]):
        q2 = row['question']
        label = row['label']

        score = model.score_pair(q1, q2)

        y_real.append(1 if label in ['S', 'R'] else 0)
        y_pred.append(score)
        
        bm25.append({
            'qid': i+1,
            'class': 2 if label == 'S' else 1 if label == 'R' else 0,
            'score': score,
            'q': q1,
            'candidate': q2
        })
        
        if len(list(set(y_real))) > 1:
            MAP.append(average_precision_score(y_real, y_pred))
print("BM25 Score:", round(np.mean(MAP), 2))

BM25 Score: 0.72


## TF-IDF Cosine

In [5]:
from models.cosine import TFIDFCosineQA

model = TFIDFCosineQA(corpora_path)

MAP = []
tfidf = []
for i, q1 in enumerate(testset):
    y_real, y_pred = [], []
    for row in testset[q1]:
        q2 = row['question']
        label = row['label']
        score = model.score_pair(q1, q2)
        y_real.append(1 if label in ['S', 'R'] else 0)
        y_pred.append(score)

        tfidf.append({
            'qid': i+1,
            'class': 2 if label == 'S' else 1 if label == 'R' else 0,
            'score': score,
            'q': q1,
            'candidate': q2
        })

    if len(list(set(y_real))) > 1:
        MAP.append(average_precision_score(y_real, y_pred))
print("TF-IDF Score:", round(np.mean(MAP), 2))

TF-IDF Score: 0.78


## SoftCosine with BERT

In [7]:
from models.softcosine import TFIDFSoftCosineQA

pretrained = '/home/tcastrof/BeteQA/live/neuralmind/bert-large-portuguese-cased'
model = TFIDFSoftCosineQA(corpora_path, pretrained, pretrained)

softcosine = []
MAP = []
for q1 in testset:
  y_real, y_pred = [], []
  for row in testset[q1]:
    q2 = row['question']
    label = row['label']
    score = model.score_pair(q1, q2)
    y_real.append(1 if label in ['S', 'R'] else 0)
    y_pred.append(score)

    softcosine.append({
        'qid': i+1,
        'class': 2 if label == 'S' else 1 if label == 'R' else 0,
        'score': score,
        'q': q1,
        'candidate': q2
    })

    if len(list(set(y_real))) > 1:
      MAP.append(average_precision_score(y_real, y_pred))
print("SoftCosine with BERT Score:", round(np.mean(MAP), 2))

SoftCosine with BERT Score: 0.7


## BERT

In [8]:
from models.bert import BERTQA

pretrained = '/home/tcastrof/BeteQA/live/neuralmind/bert-large-portuguese-cased'
model = BERTQA(corpora_path, pretrained, pretrained)

bert = []
MAP = []
for q1 in testset:
  y_real, y_pred = [], []
  for row in testset[q1]:
    q2 = row['question']
    label = row['label']
    score = model.score_pair(q1, q2)
    y_real.append(1 if label in ['S', 'R'] else 0)
    y_pred.append(score)
    
    bert.append({
        'qid': i+1,
        'class': 2 if label == 'S' else 1 if label == 'R' else 0,
        'score': score,
        'q': q1,
        'candidate': q2
    })
    
    if len(list(set(y_real))) > 1:
      MAP.append(average_precision_score(y_real, y_pred))
print("BERT Score:", round(np.mean(MAP), 2))

BERT Score: 0.82


## MUSE

In [13]:
from models.muse import MUSEQA

model = MUSEQA(corpora_path, '/home/tcastrof/BeteQA/live/muse_large')

muse = []
MAP = []
for q1 in testset:
  y_real, y_pred = [], []
  for row in testset[q1]:
    q2 = row['question']
    label = row['label']
    score = model.score_pair(q1, q2)
    y_real.append(1 if label in ['S', 'R'] else 0)
    y_pred.append(score)
    
    muse.append({
        'qid': i+1,
        'class': 2 if label == 'S' else 1 if label == 'R' else 0,
        'score': score,
        'q': q1,
        'candidate': q2
    })
    
    if len(list(set(y_real))) > 1:
      MAP.append(average_precision_score(y_real, y_pred))
print("MUSE Score:", round(np.mean(MAP), 2))

INFO:absl:resolver HttpCompressedFileResolver does not support the provided handle.
INFO:absl:resolver GcsCompressedFileResolver does not support the provided handle.


MUSE Score: 0.86


In [16]:
import json

inps = []
for i, row in enumerate(bm25):
    inps.append({
            'qid': row['qid'],
            'user_question': row['q'],
            'candidate_question': row['candidate'],
            'class': row['class'],
            'features':{
                'bm25': float(bm25[i]['score']),
                'cosine': float(tfidf[i]['score']),
                'softcosine': float(softcosine[i]['score']),
                'bert': float(bert[i]['score']),
                'muse': float(muse[i]['score'])
            }
        })

json.dump(inps, open('features.json', 'w'), separators=(',', ':'), indent=4)
inps

[{'candidate_question': 'P: O que é cetoacidose e quais são os sintomas?',
  'class': 1,
  'features': {'bert': 0.835228681564331,
   'bm25': 1.4439651447028046,
   'cosine': 0.034445135069625606,
   'muse': 0.3695560097694397,
   'softcosine': 2.038156970661842},
  'qid': 1,
  'user_question': 'P: Nos últimos três dias, a respiração do meu marido cheirava fortemente ácida. Isso está relacionado ao diabetes dele?'},
 {'candidate_question': 'P: O diabetes pode aumentar os níveis de álcool no sangue?',
  'class': 0,
  'features': {'bert': 0.8718256950378418,
   'bm25': 2.1754401797214653,
   'cosine': 0.0466609857594224,
   'muse': 0.46813875436782837,
   'softcosine': 2.6553348310586182},
  'qid': 2,
  'user_question': 'P: Nos últimos três dias, a respiração do meu marido cheirava fortemente ácida. Isso está relacionado ao diabetes dele?'},
 {'candidate_question': 'Quais são os sinais da cetoacidose?',
  'class': 1,
  'features': {'bert': 0.8240658640861511,
   'bm25': 0.0,
   'cosine':

# Keep Pre-training BERT in WhatsApp Messages

In [None]:
import json
import os

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import torch
from transformers import AutoTokenizer  # Or BertTokenizer
from transformers import AutoModelForPreTraining, AutoModelForMaskedLM  # Or BertForPreTraining for loading pretraining heads
from transformers import AutoModel  # or BertModel, for BERT without pretraining heads

Load Corpus

In [None]:
CORPORA_PATH = [
        'QA/adalberto/qa_pt.json',
        'QA/diabetes_action/qa_pt.json',
        'QA/diabetesbr/qa_pt.json',
        'QA/eatingwell/qa_pt.json',
        'QA/medicine/qa_pt.json'
    ]

texts = []
for path in CORPORA_PATH:
    corpus = json.load(open(path))
    for qa in corpus:
      texts.append(qa['question'])
      texts.append(qa['answer'])

In [None]:
import codecs
import re

def is_start(line):
    if re.match(r'^\d+/\d+/\d+', line):
        return True
    return False

messages, msg = [], []
for fname in [w for w in os.listdir('QA/whatsapp') if not w.startswith('.')]:
    print(fname)
    with codecs.open(os.path.join('QA/whatsapp', fname), "r", "utf-8-sig") as f:
        for line in f.read().split('\n')[1:]:
            if is_start(line):
                if len(msg) > 0:
                    messages.append({
                        'author': author,
                        'date': date,
                        'msg': '\n'.join(msg)
                    })
                msg = []
                try:
                    date, rest = line.split(' - ')
                    author, paragraph = rest.split(':')
                    msg.append(paragraph)
                except:
                    pass
            else:
                msg.append(line)

texts.extend([w['msg'].strip() for w in messages if w['msg'].strip() != '<Media omitted>' and w['msg'].strip() != '<Arquivo de mídia oculto>'])
print('CORPUS SIZE:', len(texts))

with open('corpus.txt', 'w') as f:
  f.write('\n'.join(texts))

Sosdiabetes20201001.txt
Diabeticados3.txt
PessoinhasDoBem20200911.txt
Diabeticados320201001.txt
ContraDiabetes20201109.txt
SosDiabetes20201112.txt
PessoinhasdoBem20200924.txt
PessoinhasDoBem20201001.txt
SosDiabetes20200924.txt
PessoinhasDoBem20201212.txt
Diabeticados320200924.txt
ContraDiabetes20201001.txt
Diabeticados320200911.txt
Diabeticados320201212.txt
Sosdiabetes20200911.txt
CORPUS SIZE: 17071


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased', do_lower_case=False)
model = AutoModelForMaskedLM.from_pretrained('neuralmind/bert-large-portuguese-cased')
# model = model.to(device)

Some weights of the model checkpoint at neuralmind/bert-large-portuguese-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,    
    file_path="./corpus.txt",
    block_size=128,
)



In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./diabert",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)



In [None]:
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss
500,0.505189
1000,0.375761
1500,0.3695


TrainOutput(global_step=1671, training_loss=0.4133181340818559)

In [None]:
trainer.save_model("./diabert")

In [None]:
from models.bert import BERTQA

model = BERTQA(corpora_path, pretrained='diabert')

MAP = []
for q1 in testset:
  y_real, y_pred = [], []
  for row in testset[q1]:
    q2 = row['question']
    label = row['label']
    score = model.score_pair(q1, q2)
    y_real.append(1 if label in ['S', 'R'] else 0)
    y_pred.append(score)
    if len(list(set(y_real))) > 1:
      MAP.append(average_precision_score(y_real, y_pred))
print("BERT Finetunned Score:", round(np.mean(MAP), 2))

Some weights of BertModel were not initialized from the model checkpoint at diabert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Finetunned Score: 0.81


In [None]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./diabert",
    tokenizer=tokenizer
)

Some weights of BertModel were not initialized from the model checkpoint at ./diabert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
fill_mask("Retinopatia [MASK]")

[{'score': 0.2595193684101105,
  'sequence': '[CLS] Retinopatia : [SEP]',
  'token': 131,
  'token_str': ':'},
 {'score': 0.1802317500114441,
  'sequence': '[CLS] Retinopatia? [SEP]',
  'token': 136,
  'token_str': '?'},
 {'score': 0.1395806521177292,
  'sequence': '[CLS] Retinopatia. [SEP]',
  'token': 119,
  'token_str': '.'},
 {'score': 0.056920938193798065,
  'sequence': '[CLS] Retinopatia crônica [SEP]',
  'token': 16785,
  'token_str': 'crônica'},
 {'score': 0.055065009742975235,
  'sequence': '[CLS] Retinopatias [SEP]',
  'token': 22281,
  'token_str': '##s'}]