In [None]:
pip install transformers datasets torchaudio librosa jiwer accelerate

In [None]:
import torch
import librosa
import torchaudio
from datasets import load_dataset, load_metric
from transformers import WhisperForConditionalGeneration, WhisperProcessor, TrainingArguments, Trainer

# Checkpoint name shared by the model weights and the processor
model_name = "openai/whisper-small"
model = WhisperForConditionalGeneration.from_pretrained(model_name)
processor = WhisperProcessor.from_pretrained(model_name)

# Custom audio/transcription dataset -- replace the placeholder path with the real one
dataset = load_dataset("path_to_medical_audio_dataset")

# Preprocess the dataset
def preprocess_data(batch):
    """Turn one dataset example into Whisper training inputs.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["transcription"]``. Stores the first ``input_features`` entry
    returned by the processor under ``batch["input_features"]`` and the
    tokenized transcription ids under ``batch["labels"]``.

    The waveform is resampled with librosa whenever its sampling rate differs
    from the one the feature extractor is configured for, because the feature
    extractor refuses audio recorded at any other rate.

    Args:
        batch: A single example (the function indexes ``input_features[0]``,
            so it is not written for batched mapping).

    Returns:
        The same mapping, with ``"input_features"`` and ``"labels"`` added.
    """
    audio = batch["audio"]
    waveform = audio["array"]
    source_rate = audio["sampling_rate"]
    target_rate = processor.feature_extractor.sampling_rate

    if source_rate != target_rate:
        # assumes audio["array"] is a float waveform librosa can resample -- TODO confirm for this dataset
        waveform = librosa.resample(waveform, orig_sr=source_rate, target_sr=target_rate)

    batch["input_features"] = processor(waveform, sampling_rate=target_rate).input_features[0]
    batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids
    return batch

# Convert every example to model inputs and drop the raw columns
dataset = dataset.map(preprocess_data, remove_columns=["audio", "transcription"])


def collate_speech_batch(features):
    """Assemble preprocessed examples into one padded training batch.

    The audio features and the label ids are padded separately: the feature
    extractor pads ``input_features`` and the tokenizer pads the label ids,
    whose length varies from example to example.

    Args:
        features: List of examples, each holding ``"input_features"`` and
            ``"labels"`` as produced by ``preprocess_data``.

    Returns:
        The padded feature batch with a ``"labels"`` tensor added.
    """
    audio_inputs = [{"input_features": example["input_features"]} for example in features]
    batch = processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

    label_inputs = [{"input_ids": example["labels"]} for example in features]
    padded_labels = processor.tokenizer.pad(label_inputs, return_tensors="pt")
    # -100 marks padded positions so that the loss ignores them
    labels = padded_labels["input_ids"].masked_fill(padded_labels["attention_mask"].ne(1), -100)

    # Drop a leading decoder-start token: the model adds it again when it
    # derives the decoder inputs from the labels.
    if (labels[:, 0] == model.config.decoder_start_token_id).all().item():
        labels = labels[:, 1:]

    batch["labels"] = labels
    return batch


# Training arguments
# `evaluation_strategy` was renamed `eval_strategy` in newer transformers
# releases; use whichever name the installed TrainingArguments declares.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./whisper-medical",
    per_device_train_batch_size=8,
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=3,
    report_to="none",
    **{eval_strategy_arg: "epoch"},
)

# Trainer setup
# NOTE(review): recent transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` -- confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=collate_speech_batch,
    tokenizer=processor.feature_extractor,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model together with its processor
model.save_pretrained("./whisper-medical")
processor.save_pretrained("./whisper-medical")

In [None]:
pip install transformers datasets accelerate

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer

# Load a pre-trained causal language model (other causal LM checkpoints such as
# Llama-2 can be substituted by changing the name)
model_name = "EleutherAI/gpt-j-6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# GPT-style tokenizers commonly ship without a padding token, and padding is
# requested during tokenization below; fall back to the end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the medical text dataset -- replace the placeholder path with the real one
dataset = load_dataset("path_to_medical_text_dataset")

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize transcriptions and attach causal language-modelling labels.

    The text in ``examples["transcription"]`` is tokenized with padding to the
    maximum length and truncation enabled. ``labels`` are a copy of
    ``input_ids`` in which every position whose attention mask is 0 (padding)
    is replaced by -100, so that padded positions are ignored by the loss.
    Without a ``labels`` entry the model has nothing to compute a loss from.

    Args:
        examples: Mapping with a ``"transcription"`` entry holding either a
            list of strings (batched mapping) or a single string.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    texts = examples["transcription"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)

    def mask_padding(token_ids, attention_mask):
        # Keep real tokens, blank out padded positions
        return [tok if keep else -100 for tok, keep in zip(token_ids, attention_mask)]

    if isinstance(texts, str):
        # Single example: input_ids is one flat list
        encoded["labels"] = mask_padding(encoded["input_ids"], encoded["attention_mask"])
    else:
        encoded["labels"] = [
            mask_padding(token_ids, attention_mask)
            for token_ids, attention_mask in zip(encoded["input_ids"], encoded["attention_mask"])
        ]
    return encoded

# Tokenize every split in batches
dataset = dataset.map(tokenize_function, batched=True)

# Training arguments
# `evaluation_strategy` was renamed `eval_strategy` in newer transformers
# releases; use whichever name the installed TrainingArguments declares.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./gpt-medical",
    per_device_train_batch_size=4,
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=3,
    report_to="none",
    **{eval_strategy_arg: "epoch"},
)

# Trainer setup
# NOTE(review): recent transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` -- confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model together with its tokenizer
model.save_pretrained("./gpt-medical")
tokenizer.save_pretrained("./gpt-medical")