In [None]:
!pip install -q -U transformers accelerate bitsandbytes peft datasets sentencepiece huggingface_hub
import os
os.environ["WANDB_DISABLED"] = "true"

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset
from peft import get_peft_model, LoraConfig
from huggingface_hub import login
HF_TOKEN = "YOUR_TOKEN"
if HF_TOKEN:
    login(token=HF_TOKEN)
model_id = "google/gemma-3-270m-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train")
print(f"Loaded dataset with {len(dataset)} examples")
def tokenize_function(examples):
    inputs = [
        q + "\nReasoning: " + cot
        for q, cot in zip(examples['Question'], examples['Complex_CoT'])
    ]
    outputs = examples['Response']
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            outputs,
            max_length=512,
            truncation=True,
            padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
tokenized_dataset = dataset.map(tokenize_function, batched=True)
print("Dataset tokenized!")
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
training_args = TrainingArguments(
    output_dir="./gemma-uncensored-lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    learning_rate=3e-4,
    num_train_epochs=3,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    optim="paged_adamw_32bit",
    fp16=True,
    logging_steps=50,
    save_steps=1500,
    save_strategy="steps",
    logging_dir="./logs",
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)
trainer.train()
model.save_pretrained("./gemma--lora")
tokenizer.save_pretrained("./gemma--lora")
print("Fine-tuning complete! Model saved to ./gemma--lora")


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m60.1/60.1 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m506.3/506.3 kB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m42.8/42.8 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x8

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19704 [00:00<?, ? examples/s]

‚úÖ Loaded dataset with 19704 examples


Map:   0%|          | 0/19704 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 2, 'pad_token_id': 0}.


‚úÖ Dataset tokenized!


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0
400,0.0
450,0.0
500,0.0


üéâ Fine-tuning complete! Model saved to ./gemma-uncensored-lora


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch, os

base_model_id = "google/gemma-3-270m-it"
lora_dir = "./gemma--lora"
checkpoints = [os.path.join(lora_dir, d) for d in os.listdir(lora_dir) if d.startswith("checkpoint-")]
latest_checkpoint = sorted(checkpoints, key=lambda x: int(x.split("-")[-1]))[-1] if checkpoints else lora_dir

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.float32, device_map={"": "cpu"})

lora_model = PeftModel.from_pretrained(base_model, latest_checkpoint, device_map={"": "cpu"})
merged_model = lora_model.merge_and_unload()
merged_model.eval()
merged_model.save_pretrained("./gemma-merged-final")
tokenizer.save_pretrained("./gemma-merged-final")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


('./gemma-merged-final/tokenizer_config.json',
 './gemma-merged-final/special_tokens_map.json',
 './gemma-merged-final/chat_template.jinja',
 './gemma-merged-final/tokenizer.model',
 './gemma-merged-final/added_tokens.json',
 './gemma-merged-final/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

merged_dir = "/content/gemma-merged-final"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(merged_dir)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
model = AutoModelForCausalLM.from_pretrained(
    merged_dir,
    device_map="auto" if device=="cuda" else {"": "cpu"},
    torch_dtype=torch.float32
)
model.eval()
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto" if device=="cuda" else None
)

prompt = "Question: How can I diagnose a patient with chest pain?\nReasoning:"
generation_kwargs = {
    "max_new_tokens": 400,
    "pad_token_id": tokenizer.pad_token_id,
    "do_sample": True if device=="cuda" else False,
    "temperature": 0.7 if device=="cuda" else None,
    "top_p": 0.9 if device=="cuda" else None,
    "repetition_penalty": 1.1 if device=="cuda" else None,
}

outputs = pipe(prompt, **{k:v for k,v in generation_kwargs.items() if v is not None})

print("\nüß† Model Output:\n")
print(outputs[0]["generated_text"])


Device set to use cpu



üß† Model Output:

Question: How can I diagnose a patient with chest pain?
Reasoning:
Chest pain is a major symptom that can affect quality of life. It can be caused by various factors, including heart disease, respiratory problems, cancer, or other conditions.

Disclaimer: This information is for educational purposes only and should not be considered medical advice. Always consult with a healthcare professional for any health concerns or before making any decisions related to your health or treatment.
In summary, diagnosing chest pain requires a thorough examination of the patient's symptoms, history, physical exam, and possibly imaging tests.

The answer is:
1.  Perform a complete physical examination.
2.  Order blood tests to check for infection.
3.  Consider ordering an ECG to assess cardiac rhythm.
4.  Evaluate the patient's vital signs.
5.  Ask about their medical history and previous treatments.
6.  Perform a chest X-ray if appropriate.
7.  Discuss the diagnosis with the physi

In [None]:
!pip install -q transformers datasets evaluate

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset
import evaluate
merged_dir = "/content/gemma-merged-final"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(merged_dir)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

model = AutoModelForCausalLM.from_pretrained(
    merged_dir,
    device_map="auto" if device=="cuda" else {"": "cpu"},
    torch_dtype=torch.float32
)
model.eval()

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto" if device=="cuda" else None
)
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train")
dataset = dataset.shuffle(seed=42)
test_dataset = dataset.select(range(50))  # only 50 examples for fast evaluation

prompts = [q + "\nReasoning:" for q in test_dataset["Question"]]
references = test_dataset["Response"]
generated_answers = []
for prompt in prompts:
    output = pipe(
        prompt,
        max_new_tokens=150,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id
    )
    answer = output[0]["generated_text"].replace(prompt, "").strip()
    generated_answers.append(answer)
rouge = evaluate.load("rouge")
results = rouge.compute(predictions=generated_answers, references=references)
print("ROUGE Scores:\n")
for key, value in results.items():
    print(f"{key}: {value:.4f}")
print("\nSample outputs:\n")
for i in range(min(5, len(prompts))):
    print(f"Question:\n{test_dataset['Question'][i]}")
    print(f"Reference Answer:\n{references[i]}")
    print(f"Generated Answer:\n{generated_answers[i]}")
    print("-"*80)


Device set to use cpu


Downloading builder script: 0.00B [00:00, ?B/s]

ROUGE Scores:

rouge1: 0.2453
rouge2: 0.0701
rougeL: 0.1792
rougeLsum: 0.1630

Sample outputs:

Question:
In the instrument formula for a Gingival Margin Trimmer (GMT) used during cavity preparation, what is the second number representing the angle of the cutting edge when access to the distal gingival margin is achieved?
Reference Answer:
In the instrument formula for a Gingival Margin Trimmer (GMT) used during cavity preparation, the second number, which represents the angle of the cutting edge, is typically 95 degrees when it is intended for accessing the distal gingival margin. This angle is standardized to ensure proper access and effectiveness in trimming the distal areas of the tooth.
Generated Answer:
1. The cutting edge is a sharp, defined point.
2. The cutting edge is a sharp, defined point.
3. The cutting edge is a sharp, defined point.
4. The cutting edge is a sharp, defined point.

The correct answer is 3.

Final Answer: 3
--------------------------------------------------

In [None]:
!pip install -q -U transformers accelerate datasets sentencepiece huggingface_hub

import os, torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    TrainerCallback
)
from datasets import load_dataset
from huggingface_hub import login

os.environ["WANDB_DISABLED"] = "true"
HF_TOKEN = ""
if HF_TOKEN:
    login(token=HF_TOKEN)
model_id = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=dtype, device_map="auto")
model.gradient_checkpointing_enable()
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train[:2000]")

def tokenize_function(examples):
    inputs = [q + "\nReasoning: " + cot for q, cot in zip(examples['Question'], examples['Complex_CoT'])]
    outputs = examples['Response']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(text_target=outputs, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)
class StopOnLowLossCallback(TrainerCallback):
    def __init__(self, threshold=1.0):
        self.threshold = threshold
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            loss = logs["loss"]
            if loss < self.threshold:
                print(f"\nStopping early: loss {loss:.4f} < {self.threshold}\n")
                control.should_training_stop = True
        return control
training_args = TrainingArguments(
    output_dir="./qwen-finetuned-fast",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    learning_rate=3e-4,
    num_train_epochs=1,
    warmup_steps=50,
    lr_scheduler_type="cosine",
    optim="adamw_torch",
    bf16=torch.cuda.is_bf16_supported(),
    fp16=False,
    logging_steps=25,
    save_steps=500,
    save_strategy="steps",
    logging_dir="./logs",
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
    callbacks=[StopOnLowLossCallback(threshold=1.0)],
)
trainer.train()
model.save_pretrained("./qwen-finetuned-fast")
tokenizer.save_pretrained("./qwen-finetuned-fast")
print("Training complete and model saved!")


[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.0/12.0 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m506.3/506.3 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m42.8/42.8 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pylibcudf-cu12 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/681 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19704 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
25,5.13
50,2.6089
75,2.5974
100,2.3058
125,2.2361
150,2.1347
175,2.1056
200,2.2658
225,2.2073
250,2.2781


Training complete and model saved!


In [None]:
!pip install -q -U transformers datasets sentencepiece sentence-transformers accelerate

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util

model_path = "./qwen-finetuned-fast"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
model.eval()

dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train[-500:]")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def evaluate_model(model, tokenizer, dataset, num_samples=30):
    total_sim = 0
    for i in range(num_samples):
        q = dataset[i]["Question"]
        cot = dataset[i]["Complex_CoT"]
        expected = dataset[i]["Response"]
        prompt = q + "\nReasoning: " + cot + "\nAnswer:"
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
        with torch.no_grad():
            output = model.generate(**inputs, max_new_tokens=200)
        generated = tokenizer.decode(output[0], skip_special_tokens=True)
        emb_gen = embedder.encode(generated, convert_to_tensor=True)
        emb_exp = embedder.encode(expected, convert_to_tensor=True)
        sim = util.cos_sim(emb_gen, emb_exp).item()
        total_sim += sim
        print(f"\n[{i+1}/{num_samples}]")
        print(f"Question: {q}")
        print(f"Expected: {expected[:200]}...")
        print(f"Generated: {generated[:200]}...")
        print(f"Similarity: {sim:.4f}")
    avg_sim = total_sim / num_samples
    print(f"\nAverage Semantic Similarity: {avg_sim:.4f}")
    return avg_sim

avg_score = evaluate_model(model, tokenizer, dataset, num_samples=30)

prompts = [
    "How can I diagnose a patient with chest pain?",
    "List steps in treating bacterial pneumonia.",
    "How do I assess dehydration in a child?",
]

for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=200)
    print(f"\nQuestion: {prompt}")
    print(f"Answer:\n{tokenizer.decode(output[0], skip_special_tokens=True)}\n")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


[1/30]
Question: During a laparoscopic inguinal hernia repair, what structures bound the 'Triangle of Doom' and which structure is not included as a boundary?
Expected: In the context of a laparoscopic inguinal hernia repair, the 'Triangle of Doom' is a critical anatomical region that surgeons need to navigate carefully due to the presence of vital structures. The bo...
Generated: During a laparoscopic inguinal hernia repair, what structures bound the 'Triangle of Doom' and which structure is not included as a boundary?
Reasoning: Okay, so I'm trying to get a handle on this 'Tr...
Similarity: 0.7882

[2/30]
Question: A 73-year-old man with a history of strange behavior changes, urinary incontinence, and past stroke has an examination showing disorientation in time, inattention, short-stepped gait, and slowed movements. Based on these clinical features, what specific pathological finding would most likely be seen on a brain biopsy?
Expected: Based on the clinical features described, th

In [None]:
!pip install -q -U transformers datasets sentencepiece sentence-transformers accelerate

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util

model_path = "./qwen-finetuned-fast"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
model.eval()

dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train[-200:]")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def evaluate_model(model, tokenizer, dataset, num_samples=20):
    total_sim = 0
    for i in range(num_samples):
        q = dataset[i]["Question"]
        cot = dataset[i]["Complex_CoT"]
        expected = dataset[i]["Response"]
        prompt = q + "\nReasoning: " + cot + "\nAnswer:"
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
        with torch.no_grad():
            output = model.generate(**inputs, max_new_tokens=200)
        generated = tokenizer.decode(output[0], skip_special_tokens=True)
        emb_gen = embedder.encode(generated, convert_to_tensor=True)
        emb_exp = embedder.encode(expected, convert_to_tensor=True)
        sim = util.cos_sim(emb_gen, emb_exp).item()
        total_sim += sim
        print(f"[{i+1}/{num_samples}] Accuracy: {sim*100:.2f}%")
    avg_sim = total_sim / num_samples
    print(f"\nAverage Accuracy: {avg_sim*100:.2f}%")
    return avg_sim

evaluate_model(model, tokenizer, dataset, num_samples=20)


[1/20] Accuracy: 74.50%
[2/20] Accuracy: 81.53%
[3/20] Accuracy: 87.69%
[4/20] Accuracy: 75.61%
[5/20] Accuracy: 75.40%
[6/20] Accuracy: 82.86%
[7/20] Accuracy: 61.03%
[8/20] Accuracy: 73.89%
[9/20] Accuracy: 83.76%
[10/20] Accuracy: 75.80%
[11/20] Accuracy: 89.39%
[12/20] Accuracy: 72.83%
[13/20] Accuracy: 79.22%
[14/20] Accuracy: 81.38%
[15/20] Accuracy: 75.25%
[16/20] Accuracy: 69.40%
[17/20] Accuracy: 77.18%
[18/20] Accuracy: 79.74%
[19/20] Accuracy: 89.58%
[20/20] Accuracy: 73.00%

Average Accuracy: 77.95%


0.7795255273580551

In [None]:
!pip install -q -U transformers datasets sentencepiece sentence-transformers accelerate

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util

model_path = "./qwen-finetuned-fast"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
model.eval()

dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def find_relevant_cot(question, dataset, embedder, top_k=1):
    question_emb = embedder.encode(question, convert_to_tensor=True)
    cots = [x["Complex_CoT"] for x in dataset]
    cot_embs = embedder.encode(cots, convert_to_tensor=True)
    sims = util.cos_sim(question_emb, cot_embs)[0]
    top_idx = sims.topk(top_k).indices[0].item()
    return cots[top_idx]

def generate_answer(question, model, tokenizer, dataset, embedder, max_tokens=10000):
    reasoning = find_relevant_cot(question, dataset, embedder)
    prompt = f"{question}\nReasoning: {reasoning}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=False).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.05,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            early_stopping=False
        )

    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer

# Example usage
question = "How can I identify and manage early signs of sepsis in adults?"
answer = generate_answer(question, model, tokenizer, dataset, embedder, max_tokens=100000)
print("\nGenerated Answer:\n")
print(answer)



Generated Answer:

How can I identify and manage early signs of sepsis in adults?
Reasoning: Okay, so we have Mona, a 35-year-old woman who's got septicemia. That sounds really serious, and she's going into shock. Let's break this down a bit. Shock probably means her blood pressure is really low, and she's not peeing much, which could mean her kidneys aren‚Äôt getting enough blood flow. That's definitely not good.

Hmm, this really makes me think she's in septic shock. Septic shock happens when an infection causes the body to react in such a severe way that the blood pressure drops dangerously low, and organs start to struggle. With the infection and all these signs, it's like her body's having a massive, overwhelming response.

So, what should we do first? I remember that managing septic shock involves trying to stabilize things quickly. If her blood pressure is down, we need to do something to bring it back up and help her organs, especially the kidneys, get enough blood. That's cru

In [None]:
!pip install -q huggingface_hub transformers

from huggingface_hub import login, HfApi
from transformers import AutoTokenizer, AutoModelForCausalLM

HF_TOKEN = ""
login(token=HF_TOKEN)

local_model_path = "./qwen-finetuned-fast"
repo_name = "qwen-medical-reasoning"
username = "Ghost2513"
full_repo_name = f"{username}/{repo_name}"

tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForCausalLM.from_pretrained(local_model_path)

api = HfApi()
api.create_repo(repo_id=full_repo_name, private=False, exist_ok=True)

model.push_to_hub(full_repo_name, use_auth_token=HF_TOKEN, commit_message="Initial upload of fine-tuned model")
tokenizer.push_to_hub(full_repo_name, use_auth_token=HF_TOKEN, commit_message="Initial upload of tokenizer")





Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...dvxiar2/model.safetensors:   0%|          | 1.19MB / 1.98GB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mp28wigwqh/tokenizer.json: 100%|##########| 11.4MB / 11.4MB            

CommitInfo(commit_url='https://huggingface.co/Ghost2513/qwen-medical-reasoning/commit/0e4a68893a7c636c086d96bb18b1f3675ef7447e', commit_message='Initial upload of tokenizer', commit_description='', oid='0e4a68893a7c636c086d96bb18b1f3675ef7447e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Ghost2513/qwen-medical-reasoning', endpoint='https://huggingface.co', repo_type='model', repo_id='Ghost2513/qwen-medical-reasoning'), pr_revision=None, pr_num=None)