In [None]:
# Import necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import torch.nn.functional as F

In [1]:
# Import necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import torch.nn.functional as F

# Load dataset
ds = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")

# Select the first 15000 rows of the dataset (taken in stored order, no shuffle).
full_dataset = ds['train'].select(range(15000))

# Split the data 80 / 10 / 10: first hold out 20%, then halve that hold-out
# into validation and test. Both splits are seeded so they are reproducible.
train_testvalid = full_dataset.train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = train_testvalid['train']
val_dataset = test_valid['train']
test_dataset = test_valid['test']

# Extract unique outputs to use as labels.
# The three splits partition `full_dataset`, so its `output` column already
# holds every label. Sorting makes the label -> id assignment deterministic:
# plain set iteration order over strings changes from one Python process to
# the next (hash randomisation), which would make ids from a saved checkpoint
# undecodable in a later session.
# NOTE(review): every distinct answer text becomes its own class; presumably
# most answers are unique, leaving about one example per class - confirm this
# is the intended task formulation.
unique_labels = sorted(set(full_dataset['output']))

# Create a label mapping (label text -> integer class id)
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}

# Function to map text labels to numeric labels
def map_labels(example):
    """Attach the integer class id for this example's ``output`` text.

    Looks the ``output`` string up in the module-level ``label_mapping`` and
    stores the result under the ``label`` key of the same example, which is
    then returned. Raises ``KeyError`` if the text is not in the mapping.
    """
    class_id = label_mapping[example['output']]
    example['label'] = class_id
    return example

# Add the integer `label` column to each split (train, then val, then test).
train_dataset, val_dataset, test_dataset = [
    split.map(map_labels)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Tokenizer that belongs to the Bio_ClinicalBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of examples and carry their class ids along.

    Args:
        examples: a batch (column name -> list of values) providing the
            ``instruction``, ``input`` and ``label`` columns.

    Returns:
        The tokenizer output for "<instruction> <input>" of every row,
        truncated/padded to exactly 512 tokens, with an extra ``labels``
        entry copied from the ``label`` column.
    """
    row_pairs = zip(examples['instruction'], examples['input'])
    texts = [f"{instruction} {patient_text}" for instruction, patient_text in row_pairs]
    encoded = tokenizer(texts, max_length=512, padding="max_length", truncation=True)
    encoded["labels"] = examples["label"]
    return encoded

# Run the batched tokenizer over every split.
tokenized_train, tokenized_val, tokenized_test = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Expose only the tensors the model consumes, as PyTorch tensors.
torch_columns = ['input_ids', 'attention_mask', 'labels']
for tokenized_split in (tokenized_train, tokenized_val, tokenized_test):
    # A fresh list per call so the splits never share one mutable object.
    tokenized_split.set_format(type='torch', columns=list(torch_columns))

# Classification head with one output per entry of `label_mapping`.
num_labels = len(label_mapping)
model = BertForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=num_labels,
)

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir="./results",
    logging_dir="./logs",
    # schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # optimisation
    warmup_steps=500,
    weight_decay=0.01,
    # evaluation / checkpointing cadence
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Define a custom Trainer to calculate loss
class CustomTrainer(Trainer):
    """Trainer whose loss is plain mean cross-entropy over the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: the model being trained or evaluated.
            inputs: batch mapping passed to the model; must contain ``labels``.
            return_outputs: when true, also return the raw model outputs.
            num_items_in_batch: accepted for compatibility with newer
                ``transformers`` releases, which pass this keyword to
                ``compute_loss``; without it the override raises ``TypeError``
                there. It is intentionally unused because the loss below is
                already averaged over the batch.

        Returns:
            ``loss`` alone, or ``(loss, outputs)`` when ``return_outputs`` is true.

        Raises:
            ValueError: if ``inputs`` carries no ``labels`` entry.
        """
        labels = inputs.get("labels")
        if labels is None:
            raise ValueError("Labels should not be None")
        # `labels` stays in `inputs`, so the model call receives it as well.
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = F.cross_entropy(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Wire the model, its configuration and the train/validation splits together.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_val,
    train_dataset=tokenized_train,
)

# Fine-tune on the training split.
trainer.train()

# Score the resulting model on the validation split and report the metrics.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Function to generate response category
def predict_category(instruction, input_text):
    """Return the label text of the class the model scores highest.

    The query is built as "<instruction> <input_text>" and tokenized with the
    same settings as training (truncated/padded to 512 tokens).

    Args:
        instruction: instruction part of the prompt.
        input_text: patient description part of the prompt.

    Returns:
        The key of ``label_mapping`` whose id equals the arg-max class.

    Raises:
        ValueError: if the predicted id is not a value of ``label_mapping``.
    """
    inputs = tokenizer(
        f"{instruction} {input_text}",
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # The Trainer may have moved the model to an accelerator; the tokenizer
    # always produces CPU tensors, so put them on the model's device.
    inputs = inputs.to(model.device)

    # Predict in eval mode (dropout off) and restore the previous mode after.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    predicted_class_id = torch.argmax(logits, dim=-1).item()

    # Reverse lookup id -> label without materialising key/value lists.
    for label, class_id in label_mapping.items():
        if class_id == predicted_class_id:
            return label
    raise ValueError(f"{predicted_class_id} is not in label_mapping")

# Smoke-test the model on one hand-written query and print the outcome.
sample_instruction = "If you are a doctor, please answer the medical questions based on the patient's description."
sample_input = "I have a headache and fever. What should I do?"
predicted_label = predict_category(sample_instruction, sample_input)
for report_line in (
    f"Instruction: {sample_instruction}",
    f"Input: {sample_input}",
    f"Predicted Response: {predicted_label}",
):
    print(report_line)


Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1100 [00:00<?, ? examples/s]



Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1100 [00:00<?, ? examples/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism ha

Epoch,Training Loss,Validation Loss
1,9.3034,9.551639
2,9.302,9.971188
3,9.1596,11.001689


Evaluation results: {'eval_loss': 9.55163860321045, 'eval_runtime': 58.0593, 'eval_samples_per_second': 18.946, 'eval_steps_per_second': 2.377, 'epoch': 3.0}
Instruction: If you are a doctor, please answer the medical questions based on the patient's description.
Input: I have a headache and fever. What should I do?
Predicted Response: Hi... Thank you for consulting in Chat Doctor. Skin conditions are best diagnosed only after seeing directly. I suggest you to upload photographs of the same on this website, so that I can guide you scientifically.  Hope my answer was helpful for you.  I am happy to help any time. Further clarifications and consultations on Chat Doctor are welcome. If you do not have any clarifications, you can close the discussion and rate the answer. Wish your kid good health.
