!pip install -U sentence-transformers
!pip install biobert-embedding

## Pipeline
* | Biobert |-> SBERT
* trained on  MultiNLI, MEDNLI, ManConCorp, Our Annotation
* maybe replacing multinli with stanford nli, or using both

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('Data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


import os
import sys
import tensorflow as tf
from biobert_embedding import downloader
from biobert_embedding.embedding import BiobertEmbedding
from sentence_transformers import SentenceTransformer,models

import torch
from torch.utils.data import DataLoader


from tqdm import tqdm

import math

Data/snli_1.0_train.csv
Data/MultiNLI_cleaned.csv
Data/manconcorpus_sent_pairs.tsv


In [2]:
model_path = downloader.get_BioBert("google drive")
## downloading biobert

In [3]:
text="the recombinant protein reported here, together with the detailed structural information, might also be useful to others developing sars-cov-2 diagnostics and/or therapeutics."

biobert = BiobertEmbedding(model_path)

word_embeddings = biobert.word_vector(text)
sentence_embedding = biobert.sentence_vector(text)

print("Text Tokens: ", biobert.tokens)
#Text Tokens:  ['the', 'recombinant', 'protein', 'reported', 'here', ',', 'together', 'with', 'the', 'detailed', 'structural', 'information', ',', 'might', 'also', 'be', 'useful', 'to', 'others', 'developing', 'sars', '-', 'cov', '-', '2', 'diagnostics', 'and', '/', 'or', 'therapeutics', '.']

print ('Shape of Word Embeddings: %d x %d' % (len(word_embeddings), len(word_embeddings[0])))
# Shape of Word Embeddings: 31 x 768

print("Shape of Sentence Embedding = ",len(sentence_embedding))
# Shape of Sentence Embedding =  768

Text Tokens:  ['the', 'recombinant', 'protein', 'reported', 'here', ',', 'together', 'with', 'the', 'detailed', 'structural', 'information', ',', 'might', 'also', 'be', 'useful', 'to', 'others', 'developing', 'sars', '-', 'cov', '-', '2', 'diagnostics', 'and', '/', 'or', 'therapeutics', '.']
Shape of Word Embeddings: 31 x 768
Shape of Sentence Embedding =  768


In [4]:
# Use BERT for mapping tokens to embeddings
from sentence_transformers import models,losses
from sentence_transformers import SentenceTransformer
word_embedding_model = models.BERT('./'+model_path.name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=True,
                               pooling_mode_max_tokens=True)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) ## generating biobert sentence embeddings (mean pooling of sentence embedding vectors)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sentence_transformers import SentenceTransformer,SentencesDataset

sentence_embeddings = model.encode([text])
print("Shape of Sentence Embedding = ",len(sentence_embedding))

Batches: 100%|██████████| 1/1 [00:00<00:00, 49.79it/s]

Shape of Sentence Embedding =  768





In [7]:
df_mancon=pd.read_csv("Data/manconcorpus_sent_pairs.tsv",sep="\t").rename(columns={"guid":"pairID","text_a":"sentence1",
                                                                                "text_b":"sentence2"}) ## manconcorp

df_snli=pd.read_csv("Data/snli_1.0_train.csv") ## stanford nli

df_multinli=pd.read_csv("Data/MultiNLI_cleaned.csv").drop("Unnamed: 0",axis=1)


In [8]:
df_mancon.columns

Index(['label', 'pairID', 'sentence1', 'sentence2'], dtype='object')

In [9]:
df_nli=pd.concat([df_multinli[['gold_label','sentence1','sentence2','pairID']],
                    df_snli[['gold_label','sentence1','sentence2','pairID']]]).rename(columns={"gold_label":"label"})
## this has snli+multinli
df_nli=df_nli[df_nli['label']!="-"]
df_nli=df_nli.dropna(how="any").reset_index(drop=True) ## removing rows with null values

In [10]:
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

In [11]:
class NLIDataReader(object):
    def __init__(self,dataframe):
        self.df=dataframe.copy()
    def get_examples(self,max_examples=0):
        s1=self.df["sentence1"].values
        s2=self.df["sentence2"].values
        labels=self.df["label"].values
        guid=self.df["pairID"].values
        examples = []
        for sentence_a, sentence_b, label, guid_id in zip(s1, s2, labels, guid):

            examples.append(InputExample(guid=guid_id, texts=[sentence_a, sentence_b], label=self.map_label(label)))

            if 0 < max_examples <= len(examples):
                break

        return examples
    
    
    
    @staticmethod
    def get_labels():
        return {"contradiction": 0, "entailment": 1, "neutral": 2}

    def get_num_labels(self):
        return len(self.get_labels())

    def map_label(self, label):
        return self.get_labels()[label.strip().lower()]

In [12]:
df_nlitrain,df_nlitest=train_test_split(df_nli,test_size=0.2)
df_nlitest,df_nlival=train_test_split(df_nlitest,test_size=0.5)

df_mancontrain,df_mancontest=train_test_split(df_mancon,test_size=0.2)
df_mancontest,df_manconval=train_test_split(df_mancontest,test_size=0.5)

In [13]:
## https://github.com/UKPLab/sentence-transformers specified here NLI training

nli_reader=NLIDataReader(df_nlitrain)
train_num_labels = nli_reader.get_num_labels()
batch_size=32

train_data = SentencesDataset(nli_reader.get_examples(), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)


val_nli_reader=NLIDataReader(df_nlival)

dev_data = SentencesDataset(val_nli_reader.get_examples(), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

Convert dataset: 100%|██████████| 753618/753618 [05:42<00:00, 2202.19it/s]
Convert dataset: 100%|██████████| 94203/94203 [00:43<00:00, 2164.08it/s]


In [14]:
model_save_path="model_mnli/"
num_epochs=1
warmup_steps = math.ceil(len(train_dataloader) * num_epochs / batch_size * 0.1) #10% of train data for warm-up

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )

ValueError: Output directory (model_mnli/) already exists and is not empty.

In [15]:
model = SentenceTransformer(model_save_path)

test_nli_reader=NLIDataReader(df_nlitest)
test_data = SentencesDataset(test_nli_reader.get_examples(), model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

Convert dataset: 100%|██████████| 94202/94202 [00:44<00:00, 2124.02it/s]


In [16]:
model.evaluate(evaluator)

Convert Evaluating: 100%|██████████| 2944/2944 [03:06<00:00, 15.77it/s]


0.27359250939614416

#### testing on mancon

In [17]:
test_mancon_reader=NLIDataReader(df_mancontest)
test_data = SentencesDataset(test_mancon_reader.get_examples(), model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

Convert dataset: 100%|██████████| 1791/1791 [00:01<00:00, 935.19it/s]


In [18]:
model.evaluate(evaluator)

Convert Evaluating: 100%|██████████| 56/56 [00:05<00:00,  9.59it/s]


-0.19415268509835623

#### training on mancon, train set

In [19]:
model = SentenceTransformer(model_save_path)
mancon_reader=NLIDataReader(df_mancontrain)
train_num_labels = mancon_reader.get_num_labels()
batch_size=32

train_data = SentencesDataset(mancon_reader.get_examples(), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)


val_mancon_reader=NLIDataReader(df_manconval)

dev_data = SentencesDataset(val_mancon_reader.get_examples(), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

Convert dataset: 100%|██████████| 14328/14328 [00:15<00:00, 939.99it/s]
Convert dataset: 100%|██████████| 1792/1792 [00:01<00:00, 953.15it/s]


In [20]:
model_save_path="model_mnli_mancon/"
num_epochs=1
warmup_steps = math.ceil(len(train_dataloader) * num_epochs / batch_size * 0.1) #10% of train data for warm-up

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )

ValueError: Output directory (model_mnli_mancon/) already exists and is not empty.

In [21]:
model = SentenceTransformer(model_save_path)

test_mancon_reader=NLIDataReader(df_mancontest)
test_data = SentencesDataset(test_mancon_reader.get_examples(), model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

Convert dataset: 100%|██████████| 1791/1791 [00:01<00:00, 948.93it/s]


In [22]:
model.evaluate(evaluator)

Convert Evaluating: 100%|██████████| 56/56 [00:05<00:00,  9.61it/s]


-0.3187323327030114

In [23]:
from datetime import datetime

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 05:09:14
