In [1]:
# Clone the repo you want to start from
# (git creates a `BioBERT-DAGsHub` directory inside the current working directory)
!git clone https://dagshub.com/Dean/BioBERT-DAGsHub.git

Cloning into 'BioBERT-DAGsHub'...
remote: Enumerating objects: 366, done.[K
remote: Counting objects: 100% (366/366), done.[K
remote: Compressing objects: 100% (308/308), done.[K
remote: Total 366 (delta 110), reused 198 (delta 37)[K
Receiving objects: 100% (366/366), 1.98 MiB | 17.35 MiB/s, done.
Resolving deltas: 100% (110/110), done.


# Imports

In [2]:
import os
from getpass import getpass

# Set up your DAGsHub & Git Credentials

In [3]:
# Ask for each DAGsHub / Git setting in turn and export it as an environment
# variable, so that the shell cells further down can reference it as $NAME.
credential_prompts = {
    'DAGS_REPO_OWNER': 'Enter the name of the repo owner: ',
    'DAGS_REPO': 'Enter the repository name for the project you will be working on: ',
    'DAGS_USER': 'Enter the username of your DAGsHub account: ',
    'EMAIL': 'Enter the email for your DAGsHub account: ',
}
for env_name, prompt_text in credential_prompts.items():
    os.environ[env_name] = input(prompt_text)

Enter the name of the repo owner: Dean
Enter the repository name for the project you will be working on: BioBERT-DAGsHub
Enter the username of your DAGsHub account: Dean
Enter the email for your DAGsHub account: dean@dagshub.com


In [4]:
# Set the global git identity from the EMAIL and DAGS_USER environment
# variables; both must already be present in os.environ when this cell runs.
!git config --global user.email "$EMAIL"
!git config --global user.name "$DAGS_USER"

# Project Setup
## Installing Requirements

In [5]:
# Move into the cloned repository; later cells use paths relative to it
# (requirements.txt, params.yaml, named-entity-recognition/...).
%cd BioBERT-DAGsHub/

/content/BioBERT-DAGsHub


In [None]:
!pip install -r requirements.txt

# Playground

## Prepare branch
Check out the branch you want to work on

In [7]:
# Keep the working branch in the GIT_BRANCH environment variable; the
# `git checkout` and `git push` cells reference it as $GIT_BRANCH.
os.environ["GIT_BRANCH"] = input("insert the branch you are working on: ")

insert the branch you are working on: master


In [None]:
!git fetch
!git checkout $GIT_BRANCH
!git pull
!pip install -r requirements.txt
!dvc pull -r origin

In [None]:
# Inspect the working tree after the checkout / pull above
!git status

Set the parameters for the experiment and save them to `params.yaml` for traceability and reproducibility.

In [19]:
# Experiment parameters: dataset location, base checkpoint and training flags
import yaml

os.environ['DATA_DIR'] = "preprocessed_datasets/NER"
os.environ['ENTITY'] = "NCBI-disease"

# Every dataset-specific input lives under <DATA_DIR>/<ENTITY>
entity_dir = os.path.join(os.environ["DATA_DIR"], os.environ["ENTITY"])

params = dict(
    model_name_or_path="dmis-lab/biobert-base-cased-v1.1",
    data_dir=entity_dir,
    labels=os.path.join(entity_dir, "labels.txt"),
    output_dir=os.path.join("output", os.environ["ENTITY"]),
    max_seq_length=128,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=1000,
    seed=2,
    do_train=True,
    do_eval=True,
    do_predict=True,
    overwrite_output_dir=True,
)

# Persist the parameters next to the code so the run can be traced and reproduced
with open("params.yaml", "w") as params_file:
    yaml.dump(params, params_file, default_flow_style=False)


## Finetune (Train) the Model for the NER task

In [None]:
# Launch the NER fine-tuning script, passing it the params.yaml written above
!python named-entity-recognition/run_ner.py params.yaml

## Try out the model after finetuning

In [11]:
import torch
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoModel,
    AutoTokenizer,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
)
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
import importlib  
# The package directory is called "named-entity-recognition"; hyphens are not
# valid in a Python identifier, so a plain `import` statement cannot name it.
# importlib.import_module takes the dotted path as a string instead.
ner = importlib.import_module("named-entity-recognition.utils_ner")

In [22]:
# Load the fine-tuned token-classification model produced by the training run.
#
# The checkpoint path is derived from params['output_dir'] (the same place the
# config is read from) instead of hard-coding "NCBI-disease", so changing
# ENTITY cannot pair one entity's config with another entity's weights.
params['fine_tuned_model_path'] = os.path.join(params['output_dir'], "pytorch_model.bin")

# Label list for the task, read via the repo's utils_ner helper
labels = ner.get_labels(params['labels'])
label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
num_labels = len(labels)

# Model config saved alongside the fine-tuned weights, with the label mappings attached
config = AutoConfig.from_pretrained(
        os.path.join(params['output_dir'], "config.json"),
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
)

# Tokenizer comes from the base checkpoint named in params
tokenizer = AutoTokenizer.from_pretrained(
    params['model_name_or_path'],
    use_fast=False,
)

model = AutoModelForTokenClassification.from_pretrained(
        params['fine_tuned_model_path'],
        config=config,
)

In [15]:
# The sequence to test on
sequence = "with B - cell non Hodgkins lymphomas"

# Encode once (special tokens included) and read the tokens straight from the
# resulting ids, so tokens and predictions line up one-to-one. A decode ->
# re-tokenize round trip is not guaranteed to reproduce the same pieces.
inputs = tokenizer.encode(sequence, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(inputs[0].tolist())

# Inference only: no need to track gradients for the forward pass
with torch.no_grad():
    outputs = model(inputs)[0]
predictions = torch.argmax(outputs, dim=2)

# Pair every token with the label predicted for it
doc = [(token, labels[prediction]) for token, prediction in zip(tokens, predictions[0].tolist())]
print(doc)

[('[CLS]', 'O'), ('with', 'O'), ('b', 'B-bio'), ('-', 'I-bio'), ('cell', 'I-bio'), ('non', 'I-bio'), ('ho', 'I-bio'), ('##d', 'I-bio'), ('##g', 'I-bio'), ('##kins', 'I-bio'), ('l', 'I-bio'), ('##ymph', 'I-bio'), ('##oma', 'I-bio'), ('##s', 'O'), ('[SEP]', 'O')]


## If you have new data folders/files to add

In [None]:
# List the files git sees as new or modified
!git status

In [None]:
# Show what DVC reports as changed
!dvc status

In [None]:
# Adding a new file to be tracked 
!dvc add <data folder / file name>

# Committing Changes to Git and DVC

In [None]:
# Record the current state of DVC-tracked outputs (dvc may ask for confirmation
# when a stage's outputs changed), then stage everything in the working tree for git.
!dvc commit
!git add .

outputs ['output'] of stage: 'output.dvc' changed. Are you sure you want to commit it? [y/n] y
[0m

In [None]:
# The message is stored in the COMMIT_MSG environment variable and referenced
# as a quoted "$COMMIT_MSG" in the git command below.
os.environ['COMMIT_MSG'] = input('Enter a commit message: ')
!git commit -m "$COMMIT_MSG"

## Pushing Code & Data to DAGsHub

In [None]:
# If this stage fails, make sure to remove outputs as it will show your password to whoever it is shared with.
os.environ['PASSWORD'] = getpass('Enter the password of your DAGsHub account: ')
os.environ['DAGSHUB_AUTH'] = os.environ['DAGS_USER'] + ':' + os.environ['PASSWORD']

!git push https://$DAGSHUB_AUTH@dagshub.com/$DAGS_REPO_OWNER/$DAGS_REPO.git $GIT_BRANCH

os.environ['PASSWORD'] = os.environ['DAGSHUB_AUTH'] = ""

Enter the password of your DAGsHub account: ··········
Counting objects: 6, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (6/6), done.
Writing objects: 100% (6/6), 615 bytes | 615.00 KiB/s, done.
Total 6 (delta 4), reused 0 (delta 0)
To https://dagshub.com/dean/biobert-dagshub.git
   cacedef..c5153ab  add-params -> add-params


In [None]:
# Configure the `origin` DVC remote in the local (--local) config: basic auth
# with DAGS_USER as the user name and ask_password switched on.
!dvc remote modify --local origin auth basic
!dvc remote modify --local origin user $DAGS_USER
!dvc remote modify --local origin ask_password true

# Upload the DVC-tracked data to the `origin` remote
!dvc push -r origin