In [1]:
import csv
import gzip
import logging
import math
import os
import sys
from datetime import datetime

import pandas as pd
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader

In [2]:
# Root folder of the NLI data; each corpus lives in its own sub-folder below it.
mednli_path = "./nli_task"
mednli_bionlp, mednli_clinical = (
    os.path.join(mednli_path, corpus_dir)
    for corpus_dir in ("mednli_bionlp", "mednli_clinical")
)

In [3]:
# BIONLP
# Sentence pairs of the shared-task file, stored as one JSON object per line.
nli_obj = pd.read_json(
    path_or_buf=os.path.join(mednli_bionlp, "mednli_bionlp19_shared_task.jsonl"),
    lines=True,
)
# The ground-truth labels are read from a separate CSV file.
nli_labels = pd.read_csv(
    os.path.join(mednli_bionlp, "mednli_bionlp19_shared_task_ground_truth.csv"),
    index_col=None,
)

# Merging with labels
# The left join keeps every sentence pair; a pair whose `pairID` has no
# matching `pair_id` row ends up with a missing (NaN) label.
# `.assign` adds the source tag on a fresh frame instead of writing into a
# column-subset slice, which can trigger pandas' SettingWithCopyWarning.
bionlp_nli = (
    pd.merge(nli_obj, nli_labels, left_on="pairID", right_on="pair_id", how="left")
    .loc[:, ["sentence1", "sentence2", "label"]]
    .assign(data="bionlp")
)

In [4]:
# clinical
# Dev and train splits, each stored as one JSON object per line.
nli_dev_obj = pd.read_json(
    path_or_buf=os.path.join(mednli_clinical, "mli_dev_v1.jsonl"),
    lines=True,
)
nli_train_obj = pd.read_json(
    path_or_buf=os.path.join(mednli_clinical, "mli_train_v1.jsonl"),
    lines=True,
)

# merging training and dev set
# Train rows come first, then dev rows. `gold_label` is renamed to `label` so
# the frame has the same columns as the BioNLP one, and the source tag is
# added with `.assign` rather than by writing into a column-subset slice
# (which can trigger pandas' SettingWithCopyWarning).
clinical_nli = (
    pd.concat([nli_train_obj, nli_dev_obj], axis=0)
    .loc[:, ["sentence1", "sentence2", "gold_label"]]
    .rename(columns={"gold_label": "label"})
    .assign(data="clinical")
)

In [5]:
# Final dataset
# Stack the BioNLP rows on top of the clinical rows and renumber them 0..n-1,
# discarding the per-source indices.
med_nli = (
    pd.concat([bionlp_nli, clinical_nli], axis=0)
    .reset_index(drop=True)
)

### Fine-Tuning
##### Code Adapted from https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/nli/training_nli.py

In [22]:
# Name of the pretrained checkpoint the sentence encoder is built from.
model_name = "gsarti/covidbert-nli"
# Mini-batch size for the training DataLoader.
train_batch_size = 32
# Output folder for the trained model. "/" in the model name is replaced so it
# cannot create nested folders, and the timestamp gives each run its own folder.
model_save_path = (
    'output/training_nli_'
    + model_name.replace("/", "-")
    + '-'
    + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# exist_ok=True creates the folder (and any missing parents) in a single call,
# without the separate exists() check that could race with another process.
os.makedirs(model_save_path, exist_ok=True)

In [23]:
# Token-level encoder: a Huggingface/transformers model (BERT, RoBERTa, XLNet,
# XLM-R, ...) that maps every token to an embedding.
word_embedding_model = models.Transformer(model_name)

# Sentence-level pooling: only mean pooling over the token embeddings is
# enabled (CLS-token and max pooling are switched off), which yields one
# fixed-size vector per sentence.
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Full sentence encoder: transformer followed by pooling.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [27]:
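# Optional: uncomment to force training on the CPU. This needs `import torch`
# first -- the import cell at the top only pulls in `DataLoader` from torch.
# model._target_device = torch.device("cpu")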

In [28]:
# Integer class id for each NLI label string.
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}

# One InputExample per sentence pair, in the row order of `med_nli`.
# A label that is not a key of `label2int` raises KeyError.
train_samples = [
    InputExample(texts=[premise, hypothesis], label=label2int[gold_label])
    for premise, hypothesis, gold_label in zip(
        med_nli["sentence1"], med_nli["sentence2"], med_nli["label"]
    )
]

In [29]:
# Batches of training examples, reshuffled by the DataLoader.
train_dataloader = DataLoader(
    train_samples,
    batch_size=train_batch_size,
    shuffle=True,
)

# Softmax classification loss over the label set, built on top of the
# sentence embeddings produced by `model`.
train_loss = losses.SoftmaxLoss(
    model=model,
    num_labels=len(label2int),
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
)

In [32]:
# Configure the training
num_epochs = 5

# Warm-up length: 10% of the total number of training batches
# (batches per epoch x number of epochs), rounded up.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
# NOTE(review): no logging configuration is visible in this notebook; with
# Python's default root-logger level (WARNING) this INFO message is not shown.
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    output_path=model_save_path,
    warmup_steps=warmup_steps,
    epochs=num_epochs,
)


In [38]:
# Load a saved sentence encoder from a fixed folder on disk.
# NOTE(review): this folder name is not the timestamped `model_save_path` the
# training cell writes to -- presumably the output folder was renamed by hand;
# confirm it exists before running this cell on a fresh kernel.
model_pre_trained = SentenceTransformer(
    "./output/training_nli_covidbert-mednli"
)

## Med-Marco Fine-Tuning

The medical subset of the MS-MARCO dataset was constructed for SLEDGE by keeping only the queries that contain terms found in MedSyn, a lexicon of layman and expert medical terminology. The full list of MS-MARCO training IDs that match the filter is available at the Med-MSMARCO link below.

MedSyn :- http://ir.cs.georgetown.edu/downloads/ECIR2013-ADRTrace.pdf

Med-MSMARCO :- https://github.com/Georgetown-IR-Lab/covid-neural-ir/blob/master/med-msmarco-train.txt

In [40]:
import gzip

In [61]:
# Folder (relative to the notebook) that holds the MS-MARCO input files read below.
msmarco_filepath = "./msmarco_task/"

In [62]:
# Medical Q-Ids
# The file holds one integer id per line. Blank lines (for example a trailing
# empty line at the end of the file) are skipped instead of crashing int();
# int() itself ignores the surrounding whitespace and the newline.
with open(os.path.join(msmarco_filepath, "med-msmarco.txt")) as f:
    med_qids = [int(line) for line in f if line.strip()]

In [63]:
# Number of rows read from the triples file per chunk.
n_rows = 100000

# create the iterator
# The tab-separated file has no header row and is read lazily, `n_rows` rows
# at a time, rather than being loaded in one go.
msmarco_iterator = pd.read_csv(
    os.path.join(msmarco_filepath, "qidpidtriples.rnd-shuf.train.tsv"),
    sep="\t",
    header=None,
    chunksize=n_rows,
    iterator=True,
)

In [64]:
# Keep, chunk by chunk, only the rows whose first column is one of the medical
# query ids, then stitch the surviving rows together.
# Note: this consumes `msmarco_iterator`; re-create it before re-running.
med_msmarco = pd.concat(
    [
        chunk.loc[chunk.iloc[:, 0].isin(med_qids)]
        for chunk in msmarco_iterator
    ]
)

In [66]:
# Name the three columns of the filtered triples in place -- presumably query
# id, positive passage id and negative passage id (TODO confirm against the
# MS-MARCO triples file format).
med_msmarco.columns = ["qid", "pos_id", "neg_id"]

In [81]:
# Disabled sanity check: load the train-eval triples as well and count the
# medical query ids that appear in neither triples file.
# med_msmarco2 = pd.read_csv(
#     os.path.join(msmarco_filepath,"qidpidtriples.rnd-shuf.train-eval.tsv"),
#     sep = "\t", header = None)

# med_msmarco2.columns = ["qid", "pos_id", "neg_id"]

# len(set(med_qids) - set(med_msmarco["qid"].tolist() + med_msmarco2["qid"].tolist()) )

In [77]:
# Export at most the first 4 rows of every (qid, pos_id) group to CSV.
# reset_index() turns the previous row index into an ordinary column, and
# to_csv then also writes the new 0..n-1 index as the first column.
(
    med_msmarco
    .groupby(["qid", "pos_id"])
    .head(4)
    .reset_index()
    .to_csv("medmarco.csv")
)