In [None]:
!pip install transformers datasets evaluate sentencepiece sacremoses --quiet --upgrade

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does n

In [None]:
import re
import gc
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer


KeyboardInterrupt: 

In [None]:

# `pd` is used below but is not provided by the notebook's import cell, so it
# is imported here to keep this cell runnable on a fresh kernel.
import pandas as pd

# Parquet shard under the "Anatomy_Gray" folder of the MedicalTextbook dataset repo.
textbook_url = "https://huggingface.co/datasets/zxvix/MedicalTextbook/resolve/main/Anatomy_Gray/train-00000-of-00001.parquet"
df_textbook = pd.read_parquet(textbook_url)

# CSV file from the disease_symtoms dataset repo.
disease_url = "https://huggingface.co/datasets/ares1123/disease_symtoms/resolve/main/disease_sympts_prec_full.csv"
df_disease = pd.read_csv(disease_url)

# Wrap both frames as Hugging Face datasets so later cells can use `.map`.
dataset_textbook = Dataset.from_pandas(df_textbook)
dataset_disease = Dataset.from_pandas(df_disease)


In [None]:
# 🔹 Clean textbook entries
def clean_textbook_entry(example):
    """Strip bracketed, parenthesised and tag-like spans from one record's text.

    Args:
        example: Mapping for a single record; its "text" value is cleaned.
            A missing or ``None`` value is treated as an empty string.

    Returns:
        dict: ``{"text": cleaned}`` where whitespace runs are collapsed to a
        single space and leading/trailing whitespace is removed.
    """
    # `or ""` also covers a key that is present but None, which re.sub rejects.
    text = example.get("text") or ""
    # Removed in this order: [...], (...), <...>. Each pattern is non-greedy
    # and, since "." does not match a newline, never spans more than one line.
    for pattern in (r"\[.*?\]", r"\(.*?\)", r"<.*?>"):
        text = re.sub(pattern, "", text)
    return {"text": re.sub(r"\s+", " ", text).strip()}

# Run the cleaner over every textbook record; all original columns are dropped,
# so the result carries only the "text" column returned by the cleaner.
textbook_cleaned = dataset_textbook.map(
    clean_textbook_entry,
    remove_columns=dataset_textbook.column_names,
)



In [None]:
# 🔹 Format disease entries
def _field_to_text(value):
    """Render one field as text: None -> "", list -> comma-joined items, else str(value)."""
    if value is None:
        return ""
    if isinstance(value, list):
        # str() each item so a list holding non-string values cannot break join().
        return ", ".join(map(str, value))
    return str(value)


def format_disease_entry(example):
    """Build one training sentence from a disease record.

    Args:
        example: Mapping for a single record. The "Disease", "Symptoms" and
            "Precautions" values are read; each may be a string, a list, or
            missing/``None`` (rendered as an empty string).

    Returns:
        dict: ``{"text": sentence}`` combining the three fields.
    """
    disease = _field_to_text(example.get("Disease")).strip()
    symptoms = _field_to_text(example.get("Symptoms"))
    precautions = _field_to_text(example.get("Precautions"))
    return {
        "text": f"Disease: {disease}. Symptoms include: {symptoms}. Precautions include: {precautions}."
    }

# Turn every disease record into a single sentence; all original columns are
# dropped, so the result carries only the generated "text" column.
disease_formatted = dataset_disease.map(
    format_disease_entry,
    remove_columns=dataset_disease.column_names,
)



In [None]:
# Combine datasets
# Indexing a Dataset by column name gives a plain list on older `datasets`
# releases, but recent releases may return a column object instead. list()
# normalises both cases so the `+` below is always list concatenation.
combined_texts = list(textbook_cleaned["text"]) + list(disease_formatted["text"])
combined_dataset = Dataset.from_list([{"text": t} for t in combined_texts])


In [None]:
# Tokenization
model_name = "microsoft/BioGPT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(example):
    """Tokenize text into fixed-length inputs with causal-LM labels.

    Args:
        example: Mapping with a "text" entry; either a single string or, when
            used with ``map(..., batched=True)``, a list of strings.

    Returns:
        The tokenizer output (sequences truncated/padded to 256 tokens) with an
        added "labels" entry: a copy of "input_ids" in which every padded
        position is replaced by -100.
    """
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

    def mask_padding(ids, mask):
        # -100 is the label value the loss skips, so padding added by
        # padding="max_length" does not contribute to training.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    if isinstance(example["text"], str):
        # Single example: input_ids / attention_mask are flat lists.
        tokens["labels"] = mask_padding(tokens["input_ids"], tokens["attention_mask"])
    else:
        # Batch: one list of ids (and one mask) per input text.
        tokens["labels"] = [
            mask_padding(ids, mask)
            for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
        ]
    return tokens

# Tokenize the combined corpus batch by batch and drop the raw "text" column,
# leaving only the fields produced by `tokenize`.
tokenized_dataset = combined_dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)


# Load the pretrained causal language model that will be fine-tuned.
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# 🔹 Training Arguments
# torch is needed only to detect whether a CUDA device is available.
import torch

training_args = TrainingArguments(
    output_dir="./biogpt-medical-autocomplete",
    per_device_train_batch_size=1,
    # One optimizer step every 4 batches.
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_dir="./logs",
    # Write a checkpoint at the end of every epoch.
    save_strategy="epoch",
    # Half-precision training needs a CUDA device; requesting it on a CPU-only
    # runtime raises, so it is enabled only when a GPU is present.
    fp16=torch.cuda.is_available(),
    report_to="none"
)


In [None]:
# 🔹 Trainer Setup
import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the old `tokenizer` keyword. Pick whichever name the installed
# Trainer accepts so this cell works on both.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    **{_tokenizer_kwarg: tokenizer},
)


trainer.train()


In [None]:
# 🔹 Save
# Write the model via the trainer first, then the tokenizer files, into the
# same output folder.
trainer.save_model("./biogpt-medical-autocomplete")
tokenizer.save_pretrained("./biogpt-medical-autocomplete")

# Cleanup: drop this notebook's names for the largest objects, then run a
# garbage-collection pass.
del model
del tokenized_dataset
del combined_dataset
gc.collect()


In [None]:
!pip install huggingface_hub
from huggingface_hub import login
login('')  # paste your HF token

from transformers import AutoModelForCausalLM

AutoModelForCausalLM.from_pretrained("biogpt-medical-autocomplete").push_to_hub("ezhdeha/biogpt-medical-autocomplete")


In [None]:
from transformers import AutoTokenizer

# Reload the tokenizer from the local output folder, then upload it to the
# same Hub repository the model was pushed to.
model_dir = "./biogpt-medical-autocomplete"
saved_tokenizer = AutoTokenizer.from_pretrained(model_dir)
saved_tokenizer.push_to_hub("ezhdeha/biogpt-medical-autocomplete")
