In [None]:
import os
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

# Read the Hugging Face access token from the environment so the secret is not
# stored in the notebook. The literal placeholder is kept only as a fallback
# for setups that still paste the token here; prefer `export HF_TOKEN=...`.
HF_TOKEN = os.environ.get("HF_TOKEN") or "hf_...."
# Hub repository of the model evaluated as the baseline.
MODEL_REPO = "meta-llama/Llama-3.1-8B-Instruct"
# When True, prompts are rendered through the tokenizer's chat template.
USE_CHAT = True

# Authenticate this session against the Hugging Face Hub.
login(HF_TOKEN)

In [2]:
# Read only a small slice of the dataset so the baseline run stays quick.
print("Loading the first 10 cases.")
df = pd.read_csv(
    "/content/Rad_filtered_data_final_v8.csv",
    nrows=10,
)
print("Loaded {} cases.".format(len(df)))

Loading the first 10 cases.
Loaded 10 cases.


In [3]:
# Load the tokenizer and model.
# NOTE: `logging` here is the transformers logging utility, not the stdlib module.
from transformers.utils import logging
import time
from pathlib import Path
# Emit INFO-level messages from transformers (cache paths, config dumps) while loading.
logging.set_verbosity_info()

def download_with_retry(repo_id, token, max_retries=5, *, loader=None, **kwargs):
    """Load a Hub artifact, retrying failed attempts with exponential backoff.

    Args:
        repo_id: Repository identifier passed as the first argument to ``loader``.
        token: Access token forwarded to ``loader`` as ``token=``.
        max_retries: Total number of attempts; must be at least 1.
        loader: Callable invoked as ``loader(repo_id, token=token, **kwargs)``.
            Defaults to ``AutoTokenizer.from_pretrained`` so existing callers
            keep loading a tokenizer.
        **kwargs: Extra keyword arguments forwarded unchanged to ``loader``.

    Returns:
        Whatever ``loader`` returns on the first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1 (previously this case
            silently returned ``None``).
        Exception: The error from the final attempt, re-raised unchanged.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")
    if loader is None:
        # Resolved lazily so callers supplying their own loader do not need it.
        loader = AutoTokenizer.from_pretrained

    for attempt in range(max_retries):
        try:
            return loader(repo_id, token=token, **kwargs)
        except Exception:
            if attempt == max_retries - 1:
                # Bare raise keeps the original traceback intact.
                raise
            delay = 2 ** attempt  # 1s, 2s, 4s, ...
            print(f"Attempt {attempt + 1} failed, retrying in {delay} seconds.")
            time.sleep(delay)

print("\nSetting up the model.")

print("Loading tokenizer.")
tokenizer = download_with_retry(MODEL_REPO, token=HF_TOKEN, use_fast=True)
# Pad on the left so generated tokens directly follow the prompt text.
tokenizer.padding_side = "left"
# Fall back to the EOS token when the tokenizer ships without a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model; up to five attempts with exponential backoff in between.
print("Loading base model.")
for attempt in range(5):
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            MODEL_REPO,
            token=HF_TOKEN,
            torch_dtype=torch.float32,
            device_map="auto",
            low_cpu_mem_usage=True,
            resume_download=True,
            local_files_only=False,
        )
    except Exception as e:
        # Give up after the fifth failed attempt.
        if attempt == 4:
            raise e
        print(f"Attempt {attempt + 1} failed, retrying in {2 ** attempt} seconds.")
        time.sleep(2 ** attempt)
    else:
        # Loaded successfully; no further attempts needed.
        break

# Inference only: switch to eval mode and align the model's pad id with the tokenizer's.
base_model.eval()
base_model.config.pad_token_id = tokenizer.pad_token_id

print("Model loaded.")


Setting up the model.
Loading tokenizer.


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer.json
loading file tokenizer.model from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer_config.json
loading file chat_template.jinja from cache at None
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading base model.


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/config.json
`torch_dtype` is deprecated! Use `dtype` instead!
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "dtype": "float32",
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_the

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/model.safetensors.index.json


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Instantiating LlamaForCausalLM model under default dtype torch.float32.
Generate config GenerationConfig {
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ]
}



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}

Could not locate the custom_generate/generate.py inside meta-llama/Llama-3.1-8B-Instruct.


Model loaded.


In [4]:
def generate_answer(model, prompt: str, max_new_tokens: int = 256) -> str:
    """Greedily generate a continuation of ``prompt`` and return only the new text.

    Uses the notebook-level ``tokenizer`` to encode the prompt and decode the output.

    Args:
        model: Causal language model exposing ``parameters()`` and ``generate()``.
        prompt: Fully rendered prompt text (plain or chat-templated).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is never part of the result.
    """
    # A chat-templated prompt already begins with the BOS token; letting the
    # tokenizer add special tokens again would produce a duplicated BOS.
    bos = tokenizer.bos_token
    prompt_has_bos = bool(bos) and prompt.startswith(bos)

    # NOTE(review): prompts longer than 512 tokens are truncated; depending on the
    # tokenizer's truncation side this may drop the question — confirm inputs fit.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
        add_special_tokens=not prompt_has_bos,
    )
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )

    # The returned sequence starts with the prompt tokens; slice them off instead
    # of searching the decoded text for "Answer:", which breaks when the answer
    # itself contains that marker or when truncation removed it from the prompt.
    new_tokens = outputs[0][prompt_len:]
    text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Drop a leading "Answer:" label if the model repeats it at the start.
    if text.startswith("Answer:"):
        text = text[len("Answer:"):].strip()
    return text

print("\nGenerating the answers.")
base_answers = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    # The same question text is used in both prompting modes; build it once.
    user_prompt = f"Context:\n{row['impression']}\n\nQuestion: What are the key findings in this case?\n\nAnswer:"
    if USE_CHAT:
        prompt = tokenizer.apply_chat_template(
            [
                {"role": "system", "content": "You are an expert radiologist. Provide accurate, evidence-based answers using the provided medical context."},
                {"role": "user", "content": user_prompt},
            ],
            tokenize=False,
            # Append the assistant header so the model answers as the assistant
            # rather than continuing from the end of the user turn.
            add_generation_prompt=True,
        )
    else:
        prompt = user_prompt

    answer = generate_answer(base_model, prompt)
    base_answers.append(answer)

# Store the generated answers next to the input rows and write them to disk.
results_df = df.copy()
results_df['base_answer'] = base_answers
results_df.to_csv('baseline_results_10examples.csv', index=False)
print("\nResults saved to baseline_results_10examples.csv")


Generating the answers.


  0%|          | 0/10 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p'].
- `temperature`: `do_sample` is set to `False`. However, `temperature` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
- `top_p`: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
If you're using a pretrained model, note that some of these attributes may be set through the model's `generation_config.json` file.
100%|██████████| 10/10 [01:32<00:00,  9.22s/it]


Results saved to baseline_results_10examples.csv



