In [1]:
! pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (1

In [2]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score

# Load training and testing datasets
df = pd.read_csv("/content/symptom_train.csv")
df_test = pd.read_csv("/content/symptom_test.csv")

# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Data Preprocessing: prepare Huggingface Dataset objects
# Rename columns to 'text' and 'label' for Huggingface compatibility
train_dataset = Dataset.from_pandas(df[["Abstract", "MentionsSymptom"]].rename(columns={"Abstract": "text", "MentionsSymptom": "label"}))
test_dataset = Dataset.from_pandas(df_test[["Abstract", "MentionsSymptom"]].rename(columns={"Abstract": "text", "MentionsSymptom": "label"}))

# Tokenize the datasets
train_dataset = train_dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length"), batched=True)
test_dataset = test_dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length"), batched=True)

# Define label map and load model
id2label = {0: "No Symptom", 1: "Mentions Symptom"}
label2id = {"No Symptom": 0, "Mentions Symptom": 1}

# Load DistilBERT model for sequence classification
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2, # Binary classification (0 or 1)
    id2label=id2label,
    label2id=label2id
)


# Set training arguments
from transformers import TrainingArguments


from transformers import (AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification,
    TrainingArguments, Trainer)

training_args = TrainingArguments(
    output_dir="text_classification_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_strategy="epoch",                      # run eval at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
    fp16=True  # # Use FP16 (faster on GPUs) with Colab
)


# Define custom evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    auc = roc_auc_score(labels, pred.predictions[:, 1])
    report = classification_report(labels, preds, output_dict=True)
    return {
        "accuracy": report["accuracy"],        # Overall accuracy
        "precision": report["1"]["precision"], # Precision for class '1' (Mentions Symptom)
        "recall": report["1"]["recall"],       # Recall for class '1'
        "f1": report["1"]["f1-score"],         # F1-score for class '1'
        "roc_auc": auc                         # ROC-AUC score
    }


# Initialize Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train() # Start training
trainer.evaluate() # Final evaluation on the test set


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/6727 [00:00<?, ? examples/s]

Map:   0%|          | 0/1442 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,No log,0.071302,0.98613,1.0,0.936508,0.967213,0.979079
2,0.150400,0.09271,0.978502,0.964052,0.936508,0.950081,0.984147
3,0.057500,0.068035,0.98613,0.993311,0.942857,0.967427,0.987872
4,0.049000,0.072607,0.986824,0.977419,0.961905,0.9696,0.987334


{'eval_loss': 0.06803501397371292,
 'eval_accuracy': 0.986130374479889,
 'eval_precision': 0.9933110367892977,
 'eval_recall': 0.9428571428571428,
 'eval_f1': 0.9674267100977199,
 'eval_roc_auc': 0.9878720018027916,
 'eval_runtime': 5.8504,
 'eval_samples_per_second': 246.48,
 'eval_steps_per_second': 15.555,
 'epoch': 4.0}

In [3]:
trainer.save_model("./text_classification_model")
tokenizer.save_pretrained("./text_classification_model")  # store tokenizer


('./text_classification_model/tokenizer_config.json',
 './text_classification_model/special_tokens_map.json',
 './text_classification_model/vocab.txt',
 './text_classification_model/added_tokens.json',
 './text_classification_model/tokenizer.json')

In [4]:
import os
print(os.listdir("./text_classification_model"))


['checkpoint-1684', 'model.safetensors', 'vocab.txt', 'checkpoint-842', 'tokenizer_config.json', 'special_tokens_map.json', 'checkpoint-1263', 'config.json', 'training_args.bin', 'checkpoint-421', 'tokenizer.json']


In [6]:
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Load the symptom_test.csv dataset
df = pd.read_csv("/content/symptom_test.csv")
# Randomly sample 20 abstracts from the dataset
# Setting random_state ensures reproducibility
sample_df = df.sample(n=20, random_state=42)
sample_texts = sample_df["Abstract"].tolist()
sample_labels = sample_df["MentionsSymptom"].tolist()

# Load tokenizer and model from the saved checkpoint folder
tokenizer = DistilBertTokenizerFast.from_pretrained("./text_classification_model")
model = DistilBertForSequenceClassification.from_pretrained("./text_classification_model")

model.eval()

# Apply tokenizer with padding and truncation, return PyTorch tensors
inputs = tokenizer(sample_texts, padding=True, truncation=True, return_tensors="pt")

# Disable gradient computation for faster inference and reduced memory usage
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1).cpu().numpy()

# Create a DataFrame combining abstracts, true labels, and DistilBERT predictions
results_df = pd.DataFrame({
    "Abstract Snippet": sample_texts,
    "True Label (MentionsSymptom)": sample_labels,
    "DistilBERT Prediction": preds
})

# Set pandas option to display long text fields without truncation
pd.set_option('display.max_colwidth', 200)
print(results_df)



                                                                                                                                                                                           Abstract Snippet  \
0   Amyloid precursor protein (APP) plays a central role in the pathophysiology of Alzheimer's disease (AD). The accumulation of beta-amyloid protein is believed to be a crucial step in the developmen...   
1   Aging and Alzheimer's disease (AD) exhibit sex differences in several biological processes, including demyelination. In a recent study, Lopez-Lee et al. uncover the contributions of sex chromosome...   
2   Shift work, the proven circadian rhythm-disrupting behavior, has been linked to the increased risk of Alzheimer's disease (AD). However, the putative causal effect and potential mechanisms of shif...   
3   The gut-brain axis has emerged as a key player in the regulation of brain function and cognitive health. Gut microbiota dysbiosis has been observed in preclinical model