<a href="https://colab.research.google.com/github/GAYATRI-SIVANI-SUSARLA/GenAI_Beginner-Guide/blob/main/Copy_of_MedicalBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem so files stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install transformers datasets scikit-learn


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import os

import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Step 1: Load the Dataset
# Nothing is downloaded here: train.txt and test.txt are read from this folder,
# which lives under the Google Drive mount point (/content/drive).
DATA_PATH = "/content/drive/MyDrive/PubMed_20k_RCT"

def load_data(file_path):
    """Parse a tab-separated "<label>\\t<text>" file into a DataFrame.

    Lines starting with "###" and blank lines are skipped. Every other line is
    split on its first tab into a label and a text; both are stripped of
    surrounding whitespace.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        pandas.DataFrame with string columns "text" and "label", one row per
        parsed line, in file order.

    Raises:
        ValueError: If a non-blank, non-"###" line contains no tab. The message
            names the file and the 1-based line number.
    """
    texts, labels = [], []
    # Decode explicitly as UTF-8 so parsing does not depend on the locale default.
    with open(file_path, "r", encoding="utf-8") as file:
        for line_no, line in enumerate(file, start=1):
            if line.startswith("###") or not line.strip():
                continue
            label, sep, text = line.partition("\t")
            if not sep:
                # Same exception type as the bare unpacking error, but it says where.
                raise ValueError(
                    f"{file_path}:{line_no}: expected '<label>\\t<text>', got {line.rstrip()!r}"
                )
            texts.append(text.strip())
            labels.append(label.strip())
    return pd.DataFrame({"text": texts, "label": labels})

# Read both splits with the same parser.
train_df = load_data(os.path.join(DATA_PATH, "train.txt"))
test_df = load_data(os.path.join(DATA_PATH, "test.txt"))

# Integer ids are assigned in order of first appearance in the training file.
label_map = {label: i for i, label in enumerate(train_df["label"].unique())}

# Fail fast if the test file uses a label the training file never showed:
# Series.map would otherwise turn it into NaN and the label column into floats.
unknown_labels = sorted(set(test_df["label"]) - set(label_map))
if unknown_labels:
    raise ValueError(f"test.txt contains labels not present in train.txt: {unknown_labels}")

train_df["label"] = train_df["label"].map(label_map)
test_df["label"] = test_df["label"].map(label_map)

# Hold out 10% of the training rows for validation; the fixed seed keeps the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df["text"], train_df["label"], test_size=0.1, random_state=42
)

train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
val_dataset = Dataset.from_dict({"text": val_texts.tolist(), "label": val_labels.tolist()})
test_dataset = Dataset.from_dict({"text": test_df["text"].tolist(), "label": test_df["label"].tolist()})

datasets = DatasetDict({"train": train_dataset, "validation": val_dataset, "test": test_dataset})

# Step 2: Tokenization
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """Tokenize the "text" field of `examples`.

    Every sequence is truncated and padded to exactly 128 tokens.
    Returns whatever the tokenizer call returns.
    """
    tokenizer_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize every split in batches, then drop the raw text column,
# which is no longer needed once the token ids exist.
tokenized_datasets = (
    datasets
    .map(preprocess_function, batched=True)
    .remove_columns(["text"])
)
# Expose only these three columns, as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "label"]
tokenized_datasets.set_format("torch", columns=tensor_columns)

# Step 3: Load Pre-trained Model
# Record the label names in the model config so that the saved checkpoint
# carries its own id -> label mapping; label_map itself is label -> id.
id2label = {idx: name for name, idx in label_map.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_map),
    id2label=id2label,
    label2id=dict(label_map),
)

# Step 4: Define Training Arguments (Optimized for Colab)
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # Smaller batch size for Colab
    per_device_eval_batch_size=4,
    num_train_epochs=2,  # Reduce epochs for faster training
    weight_decay=0.01,
    save_strategy="epoch",  # Same cadence as evaluation, needed to reload the best checkpoint
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # The "accuracy" key returned by compute_metrics
    # Mixed precision needs a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# Step 5: Metrics
def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy and weighted precision/recall/F1.

    The predicted class is the arg-max over the last axis of the logits.
    Returns a dict with keys "accuracy", "precision", "recall" and "f1".
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=-1)
    # Elements 0-2 are precision, recall and F1; the fourth element is unused here.
    weighted = precision_recall_fscore_support(references, predicted, average="weighted")
    return {
        "accuracy": accuracy_score(references, predicted),
        "precision": weighted[0],
        "recall": weighted[1],
        "f1": weighted[2],
    }

# Step 6: Train Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # NOTE(review): recent transformers releases appear to deprecate `tokenizer`
    # here in favour of `processing_class` - confirm before upgrading.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Step 7: Evaluate and Save the Model
# With no arguments, evaluate() scores the eval_dataset passed above, i.e. the validation split.
results = trainer.evaluate()
print("Evaluation results:", results)

# The validation split also drives best-checkpoint selection, so report the
# held-out test split separately; its metrics are prefixed with "test_".
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"], metric_key_prefix="test")
print("Test results:", test_results)

model.save_pretrained("./pubmed_distilbert_model")
tokenizer.save_pretrained("./pubmed_distilbert_model")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/26543 [00:00<?, ? examples/s]

Map:   0%|          | 0/2950 [00:00<?, ? examples/s]

Map:   0%|          | 0/29493 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.582,0.510077,0.856949,0.856383,0.856949,0.855986
2,0.4204,0.537505,0.868814,0.868934,0.868814,0.867903


Evaluation results: {'eval_loss': 0.5375049710273743, 'eval_accuracy': 0.8688135593220339, 'eval_precision': 0.8689344098520851, 'eval_recall': 0.8688135593220339, 'eval_f1': 0.8679026200723976, 'eval_runtime': 5.9338, 'eval_samples_per_second': 497.152, 'eval_steps_per_second': 124.372, 'epoch': 2.0}


('./pubmed_distilbert_model/tokenizer_config.json',
 './pubmed_distilbert_model/special_tokens_map.json',
 './pubmed_distilbert_model/vocab.txt',
 './pubmed_distilbert_model/added_tokens.json',
 './pubmed_distilbert_model/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
# NOTE: the training cell saves to "./pubmed_distilbert_model", so this absolute
# path assumes the notebook's working directory was /content at save time.
model_path = "/content/pubmed_distilbert_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Take the id -> label mapping from the checkpoint's own config instead of
# hard-coding an order: training assigns ids by order of first appearance in
# train.txt, which a fixed list here cannot be guaranteed to match.
# If the checkpoint was saved without label names, the config holds generic
# "LABEL_<n>" placeholders, which at least never mislabel a prediction.
label_map = dict(model.config.id2label)

# Function to classify a single input text
def classify_text(text):
    """Return the label that the loaded model predicts for `text`.

    The text is tokenized (truncated/padded to 128 tokens), run through the
    module-level `model` on the GPU when CUDA is available (CPU otherwise),
    and the arg-max class id is looked up in the module-level `label_map`.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)

    # Model and inputs must sit on the same device.
    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(run_device)
    batch = {name: tensor.to(run_device) for name, tensor in encoded.items()}

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        class_scores = model(**batch).logits

    best_id = class_scores.argmax(dim=1).item()
    return label_map[best_id]

# Quick check: classify one example sentence and show the input next to the prediction.
sample_text = "The purpose of this study was to evaluate the effectiveness of the new drug."
predicted_label = classify_text(sample_text)

print(
    f"Input Text: {sample_text}",
    f"Predicted Label: {predicted_label}",
    sep="\n",
)


Input Text: The purpose of this study was to evaluate the effectiveness of the new drug.
Predicted Label: CONCLUSIONS
