In [None]:
# Import necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import torch.nn.functional as F

# Load dataset
ds = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")

# Shuffle and select a subset of the data
full_dataset = ds['train'].shuffle(seed=42).select(range(11000))

# Split the data
train_testvalid = full_dataset.train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = train_testvalid['train']
val_dataset = test_valid['train']
test_dataset = test_valid['test']

# Load the Bio_ClinicalBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Tokenization function
def tokenize_function(examples):
    inputs = [f"{inst} {inp}" for inst, inp in zip(examples['instruction'], examples['input'])]
    tokenized_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)
    # Here we assume 'output' contains class labels for sequence classification
    tokenized_inputs["labels"] = examples["output"]
    return tokenized_inputs

# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Format for PyTorch
tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define model
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=len(set(tokenized_train['labels'])))

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Define a custom Trainer to calculate loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = F.cross_entropy(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Initialize Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Function to generate response
def generate_response(instruction, input_text):
    inputs = tokenizer(instruction + " " + input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = torch.argmax(logits, dim=-1).item()
    response = tokenizer.decode([predicted_class_id], skip_special_tokens=True)
    return response

# Test the model with a sample input
sample_instruction = "If you are a doctor, please answer the medical questions based on the patient's description."
sample_input = "I have a headache and fever. What should I do?"
response = generate_response(sample_instruction, sample_input)
print(f"Instruction: {sample_instruction}")
print(f"Input: {sample_input}")
print(f"Response: {response}")


In [1]:
# Import necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

# Load dataset
ds = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")

# Shuffle and select a subset of the data
full_dataset = ds['train'].shuffle(seed=42).select(range(11000))

# Split the data
train_testvalid = full_dataset.train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = train_testvalid['train']
val_dataset = test_valid['train']
test_dataset = test_valid['test']

# Load the T5 tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Tokenization function
def tokenize_function(examples):
    inputs = [f"{inst} {inp}" for inst, inp in zip(examples['instruction'], examples['input'])]
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['output'], truncation=True, padding="max_length", max_length=512)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Format for PyTorch
tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Function to generate response
def generate_response(instruction, input_text):
    inputs = tokenizer(f"{instruction} {input_text}", return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Test the model with a sample input
sample_instruction = "If you are a doctor, please answer the medical questions based on the patient's description."
sample_input = "I have a headache and fever. What should I do?"
response = generate_response(sample_instruction, sample_input)
print(f"Instruction: {sample_instruction}")
print(f"Input: {sample_input}")
print(f"Response: {response}")


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]



Map:   0%|          | 0/1100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1100 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss
1,1.1069,1.006738
2,1.0528,0.972825
3,1.0342,0.96428


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Evaluation results: {'eval_loss': 0.9642802476882935, 'eval_runtime': 59.5533, 'eval_samples_per_second': 18.471, 'eval_steps_per_second': 2.317, 'epoch': 3.0}
Instruction: If you are a doctor, please answer the medical questions based on the patient's description.
Input: I have a headache and fever. What should I do?
Response: Hi, Welcome to Chat Doctor. I have gone through your query. I can understand your concern. I have gone through your query. I have gone through your query. I can understand your concern. I can understand your concern. I can understand your concern. I can understand your concern. I can understand your concern. I can understand your concern. I can understand your concern. I can understand your concern. I can understand your concern. I will be happy to answer your query. I will be happy to help you further.


In [None]:
# Import necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

# Load dataset
ds = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")

# Shuffle and select a subset of the data
full_dataset = ds['train'].shuffle(seed=42).select(range(11000))

# Split the data
train_testvalid = full_dataset.train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = train_testvalid['train']
val_dataset = test_valid['train']
test_dataset = test_valid['test']

# Load the T5 tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# Tokenization function
def tokenize_function(examples):
    inputs = [f"{inst} {inp}" for inst, inp in zip(examples['instruction'], examples['input'])]
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['output'], truncation=True, padding="max_length", max_length=512)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Format for PyTorch
tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define model
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,  # Increased epochs for better training
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=50,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Function to generate response
def generate_response(instruction, input_text):
    inputs = tokenizer(f"{instruction} {input_text}", return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)  # Reduced max_length for response generation
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Test the model with a sample input
sample_instruction = "If you are a doctor, please answer the medical questions based on the patient's description."
sample_input = "I have a headache and fever. What should I do?"
response = generate_response(sample_instruction, sample_input)
print(f"Instruction: {sample_instruction}")
print(f"Input: {sample_input}")
print(f"Response: {response}")




config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1100 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
