In [7]:
import pandas as pd

# Relative paths of the OHSUMED parquet shards, keyed by split name.
splits = {
    'train': 'ohsumed/train-00000-of-00001.parquet',
    'test': 'ohsumed/test-00000-of-00001.parquet',
}

# Read both splits through the hf:// filesystem into pandas DataFrames (train first, then test).
training, test = (
    pd.read_parquet("hf://datasets/community-datasets/ohsumed/" + splits[split_name])
    for split_name in ("train", "test")
)

'(ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 47f8d271-5f3a-4dd7-aa71-8fe5caa6623d)')' thrown while requesting GET https://huggingface.co/datasets/community-datasets/ohsumed/resolve/main/ohsumed/test-00000-of-00001.parquet
Retrying in 1s [Retry 1/5].


Training DistilBERT

In [8]:
# Build the model input text by joining each record's title and abstract with a single space.
training['title_abstract'] = training['title'] + ' ' + training['abstract']
test['title_abstract'] = test['title'] + ' ' + test['abstract']

# A notebook cell only renders its final expression, so a bare `training.head()`
# followed by `test.head()` silently drops the training preview. `display` (provided
# by the IPython kernel) renders both frames.
display(training.head(), test.head())

Unnamed: 0,seq_id,medline_ui,mesh_terms,title,publication_type,abstract,author,source,title_abstract
0,54711,88000001,Acetaldehyde/*ME; Buffers; Catalysis; HEPES/PD...,The binding of acetaldehyde to the active site...,JOURNAL ARTICLE.,"Ribonuclease A was reacted with [1-13C,1,2-14C...",Mauch TJ; Tuma DJ; Sorrell MF.,Alcohol Alcohol 8801; 22(2):103-12,The binding of acetaldehyde to the active site...
1,54711,88000002,"Adult; Alcohol, Ethyl/*AN; Breath Tests/*; Hum...",Reductions in breath ethanol readings in norma...,JOURNAL ARTICLE.,Blood ethanol concentrations were measured seq...,Gaylarde PM; Stambuk D; Morgan MY.,Alcohol Alcohol 8801; 22(2):113-6,Reductions in breath ethanol readings in norma...
2,54711,88000003,Alcoholism/*PP; Animal; Diprenorphine/PD; Fema...,Does the blockade of opioid receptors influenc...,JOURNAL ARTICLE.,We have tested whether the opioid antagonists ...,Kotlinska J; Langwinski R.,Alcohol Alcohol 8801; 22(2):117-9,Does the blockade of opioid receptors influenc...
3,54711,88000006,Adult; Alcohol Drinking/*PH; Alcoholism/*BL/CO...,Drinkwatchers--description of subjects and eva...,JOURNAL ARTICLE.,Clinical examination and measurement of MCV an...,Barrison IG; Ruzek J; Murray-Lyon IM.,Alcohol Alcohol 8801; 22(2):147-54,Drinkwatchers--description of subjects and eva...
4,54711,88000007,Adult; Alcoholism/*BL; Blood Platelets/*ME; Er...,Platelet affinity for serotonin is increased i...,JOURNAL ARTICLE.,The kinetics of 3H serotonin platelet uptake w...,Boismare F; Lhuintre JP; Daoust M; Moore N; Sa...,Alcohol Alcohol 8801; 22(2):155-9,Platelet affinity for serotonin is increased i...


In [9]:
def is_sensitive(mesh_terms):
    """Return 1 if the MeSH term string mentions a sensitive topic, otherwise 0.

    A value is flagged when it contains 'urogenital' or 'pregnancy complications',
    compared case-insensitively. Any non-string value (for example a missing
    entry) is labelled 0.
    """
    if not isinstance(mesh_terms, str):
        return 0

    lowered = mesh_terms.lower()
    keywords = ('urogenital', 'pregnancy complications')
    return int(any(keyword in lowered for keyword in keywords))

# Derive the binary target for both splits from each record's MeSH terms.
training['label'] = training['mesh_terms'].apply(is_sensitive)
test['label'] = test['mesh_terms'].apply(is_sensitive)

# A notebook cell only renders its final expression, so a bare `training.head()`
# followed by `test.head()` silently drops the training preview. `display` (provided
# by the IPython kernel) renders both frames.
display(training.head(), test.head())

In [10]:
from datasets import Dataset
# Wrap both pandas frames as `Dataset` objects (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (training, test)
)

Tokenizing sentences

In [11]:
from transformers import DistilBertTokenizer

# Load the pretrained tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenization callback used to prepare the datasets for DistilBERT training
def tokenize_function(examples):
    """Tokenize the combined title/abstract text found in ``examples``.

    Reads ``examples["title_abstract"]`` and hands it to the module-level
    ``tokenizer`` with truncation enabled and padding to a fixed length of
    512. The tokenizer's result is returned unchanged.
    """
    texts = examples["title_abstract"]
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    return tokenizer(texts, **encoding_options)

# Tokenize each split in batches, then drop the raw text column that was just encoded.
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True).remove_columns(["title_abstract"])
    for split in (train_dataset, test_dataset)
)

# Switch both datasets to the "torch" output format.
train_dataset.set_format("torch")
test_dataset.set_format("torch")

Map: 100%|██████████| 54709/54709 [03:48<00:00, 239.17 examples/s]
Map: 100%|██████████| 293855/293855 [18:16<00:00, 268.04 examples/s]


Loading DistilBERT

In [12]:
# Sanity check: list the training frame's columns (the derived 'title_abstract' and 'label' should be present).
print(training.columns)

Index(['seq_id', 'medline_ui', 'mesh_terms', 'title', 'publication_type',
       'abstract', 'author', 'source', 'title_abstract', 'label'],
      dtype='object')


In [13]:
from transformers import DistilBertForSequenceClassification

# Size the classification head by the number of distinct labels in the training split,
# then load the pretrained DistilBERT checkpoint with that many output classes.
num_labels = training["label"].nunique()
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Defining the Trainer and fine-tuning DistilBERT

In [15]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score

def compute_metrics(eval_pred):
    """Compute binary classification metrics from evaluation predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            example is the arg-max of ``logits`` over its last axis.

    Returns:
        A dict with the keys "accuracy", "precision", "recall", "f1" and "f2".
    """
    # Extract logits and labels from the evaluation predictions
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    # Metrics used for comparison with the paper
    precision = precision_score(labels, predictions, average="binary")
    recall = recall_score(labels, predictions, average="binary")
    f1 = f1_score(labels, predictions, average="binary")
    # f1_score() has no `beta` parameter, so requesting F2 through it raises a
    # TypeError at the first evaluation; fbeta_score with beta=2 is the F2 metric.
    f2 = fbeta_score(labels, predictions, beta=2, average="binary")
    accuracy = accuracy_score(labels, predictions)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "f2": f2,
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` is the deprecated name of this option and is rejected by
    # newer transformers releases; `eval_strategy` is the current spelling.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Checkpoint on the same schedule as evaluation (once per epoch).
    save_strategy="epoch"
)

# Defining Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated on Trainer (it emits a FutureWarning) in favour of
    # `processing_class=`; the tokenizer object passed is the same.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


Training and evaluating

In [None]:
# Fine-tune the model with the Trainer configured above
trainer.train()

# Evaluate on the test set; as the last expression, its result is what the cell displays
trainer.evaluate()

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: sergejs (sergejs-tu-wien). Use `wandb login --relogin` to force relogin


Epoch,Training Loss,Validation Loss
