In [None]:
# Install transformers, peft and accelerate from their GitHub repositories
# (default branch, upgraded with -U) and bitsandbytes via pip.
# NOTE(review): nothing here is version-pinned, so the later cells run against
# whatever these installs resolve to at execution time.
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install -U git+https://github.com/huggingface/peft.git
!pip install -U git+https://github.com/huggingface/accelerate.git
!pip install bitsandbytes

In [None]:
# Download the dataset archive from Google Drive as data.zip and unpack it
# into the current directory.
# NOTE(review): the loading cell reads data/original and data/simplified —
# presumably the layout inside this archive; verify.
!curl -L 'https://drive.google.com/uc?export=download&id=1gdmIrgETs42Mb2MyQnjo0KAPrZyKGLsY' -o data.zip
!unzip data.zip

In [None]:
import os

def load_dataset(data_dir="data"):
    """Build the training texts by pairing each report with its simplified version.

    Every regular file in ``<data_dir>/original`` is matched with the file of
    the same name in ``<data_dir>/simplified`` and rendered as
    ``"REPORT:\\n" + original + "SUMMARY:\\n" + simplified``.

    Args:
        data_dir: Directory that contains the ``original`` and ``simplified``
            sub-directories. Defaults to ``"data"``.

    Returns:
        list[str]: One combined text per report, ordered by file name.

    Raises:
        FileNotFoundError: If a report has no counterpart in ``simplified``.
    """
    original_dir = os.path.join(data_dir, "original")
    simplified_dir = os.path.join(data_dir, "simplified")

    dataset = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # positional train/validation split reproducible across runs and machines.
    for file in sorted(os.listdir(original_dir)):
        path_original = os.path.join(original_dir, file)
        path_simplified = os.path.join(simplified_dir, file)

        # Skip directories (e.g. ".ipynb_checkpoints") instead of failing in open().
        if not os.path.isfile(path_original):
            continue

        # Explicit encoding so decoding does not depend on the platform default.
        with open(path_original, "r", encoding="utf-8") as f:
            report = f.read()
        with open(path_simplified, "r", encoding="utf-8") as f:
            summary = f.read()

        dataset.append("REPORT:\n" + report + "SUMMARY:\n" + summary)

    return dataset
dataset = load_dataset()

# Positional 80/20 split: the first 80% of the texts are used for training,
# the remainder for validation.
split_index = int(len(dataset) * 0.8)
dataset_train = dataset[:split_index]
dataset_val = dataset[split_index:]

In [None]:
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer

base_model_id = "mistralai/Mistral-7B-v0.1"

# Load the base model in 4-bit (NF4, double quantization) with bfloat16 as the
# bitsandbytes compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

# BOS and EOS are added to every tokenized text; padding goes on the left.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
# Pad with <unk> instead of EOS. DataCollatorForLanguageModeling(mlm=False),
# which the training cell uses, sets every label equal to pad_token_id to -100.
# With pad == eos the EOS appended by add_eos_token=True would be masked as
# well, so the model would never be trained to end its summary.
tokenizer.pad_token = tokenizer.unk_token

def generate_and_tokenize_prompt(prompt, max_length=1500):
    """Tokenize one training text with the notebook-level ``tokenizer``.

    Args:
        prompt: Text to tokenize.
        max_length: Upper bound passed to the tokenizer; truncation is enabled,
            so longer inputs are cut to this length. Defaults to 1500, the
            value previously hard-coded here.

    Returns:
        The object returned by ``tokenizer(...)`` for ``prompt``.
    """
    return tokenizer(prompt, truncation=True, max_length=max_length)

In [None]:
# Tokenize every text of both splits up front.
data_tokenized_train = [generate_and_tokenize_prompt(text) for text in dataset_train]
data_tokenized_val = [generate_and_tokenize_prompt(text) for text in dataset_val]

In [None]:
# Display the first tokenized training example as a quick sanity check.
data_tokenized_train[0]

In [None]:
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then run PEFT's k-bit training preparation
# on the quantized model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Modules that receive LoRA adapters: the listed projection layers plus lm_head.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
    "lm_head",
]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=8,
    lora_dropout=0.1,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the prepared model so the LoRA adapters are attached to it.
model = get_peft_model(model, peft_config)

In [None]:
import transformers

# Fine-tune the LoRA-wrapped model on the tokenized training texts and
# evaluate on the validation texts.
trainer = transformers.Trainer(
    model=model,
    train_dataset=data_tokenized_train,
    eval_dataset=data_tokenized_val,
    args=transformers.TrainingArguments(
        output_dir="results",
        warmup_steps=1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_steps=1000,
        learning_rate=2.5e-5,
        bf16=False,
        optim="paged_adamw_8bit",
        # Log, checkpoint and evaluate every 50 steps.
        logging_steps=50,
        logging_dir="./logs",
        save_strategy="steps",
        save_steps=50,
        # `evaluation_strategy` is deprecated upstream in favour of
        # `eval_strategy`; this notebook installs transformers from the git
        # default branch, so the current keyword is used.
        eval_strategy="steps",
        eval_steps=50,
        do_eval=True,
    ),
    # mlm=False selects causal-LM collation (labels derived from input_ids).
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Turn off the generation cache on the model config before training starts.
model.config.use_cache = False
trainer.train()

Da das Training sehr lange dauert, wird es hier abgebrochen. Stattdessen verwenden wir einen Checkpoint, den wir vorher gespeichert haben.

In [None]:
# Fetch a previously saved checkpoint archive and extract it into results/.
# NOTE(review): the merge cell loads "results/checkpoint-500", so the archive
# presumably contains a top-level checkpoint-500/ directory — confirm. Also
# confirm that the file.io link is still downloadable.
!curl -L "https://file.io/ez5XKjM0PtFm" -o checkpoint-500.zip
!unzip checkpoint-500.zip -d results/

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Reload the base model in float16 without a quantization config so the LoRA
# weights can be merged into it.
base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    return_dict=True,
)

# Attach the checkpoint-500 adapter and fold it into the base weights.
model = PeftModel.from_pretrained(base_model, "results/checkpoint-500").merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.pad_token = tokenizer.eos_token

# Write the merged model (safetensors, shards of at most 2GB) and the
# tokenizer into the same directory.
final_model_dir = "./results/final_model/"
model.save_pretrained(final_model_dir, safe_serialization=True, max_shard_size="2GB")
tokenizer.save_pretrained(final_model_dir)

In [None]:
# Clone llama.cpp, build it with make, and install its Python requirements.
# NOTE(review): the clone is not pinned to a commit; the following cells assume
# the checkout provides convert.py and a ./quantize binary — confirm against
# the current upstream layout.
!git clone https://github.com/ggerganov/llama.cpp.git
!make -C llama.cpp
!pip install -r llama.cpp/requirements.txt

In [None]:
# Convert the merged model directory (./results/final_model/) into a GGUF file.
!python llama.cpp/convert.py ./results/final_model/ --outfile ./results/final_model_fp16.gguf

In [None]:
# Quantize the converted GGUF file to Q4_0 and write it to final_model_Q4.gguf.
!./llama.cpp/quantize ./results/final_model_fp16.gguf ./results/final_model_Q4.gguf Q4_0

In [ ]:
# Install llama-cpp-python, passing -DLLAMA_CUBLAS=on to its CMake build.
# NOTE(review): presumably this enables the GPU backend that n_gpu_layers in
# the next cell relies on — confirm the flag name for the installed version.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python

In [None]:
from llama_cpp import Llama

# Load the quantized GGUF model for inference through llama-cpp-python.
llama = Llama(
    model_path="./results/final_model_Q4.gguf",
    n_ctx=2048,
    n_gpu_layers=-1,  # Moves the computation onto the GPU
    verbose=False,  # Suppresses the log output
)

In [None]:
example = """
Chief Complaint: Chronic abdominal pain and unintentional weight loss.

Patient History and Clinical Presentation: The patient, a 45-year-old female with a past medical history significant for type 2 diabetes mellitus and hypertensive cardiovascular disease, presents with a complaint of progressive and persistent abdominal pain over the past 6 months, accompanied by a marked unintentional weight loss of approximately 15% of her body weight within the same period. The abdominal pain is described as diffusely located, dull, and poorly localized, with intermittent exacerbations of moderate intensity, particularly post-prandial. The patient reports significant anorexia but denies any correlation with dietary intake, dysphagia, or odynophagia. No changes in bowel habits, blood in stools, or jaundice observed. The patient mentions fatigue and occasional night sweats but denies fever, changes in urine color, or family history of gastrointestinal cancers.

Diagnostic Workup and Findings:
1. Comprehensive blood work including CBC, CMP, inflammatory markers (CRP, ESR), and tumor markers (CA 19-9, CEA) was conducted, showing mild anemia and elevated inflammatory markers; tumor markers were within normal ranges.
2. Abdominal ultrasound revealed no significant hepatic, gallbladder, or kidney abnormalities; however, it indicated pancreatic head enlargement.
3. A subsequent CT scan of the abdomen and pelvis with contrast confirmed the presence of a hypoattenuating mass in the head of the pancreas, measuring approximately 3.5 cm, with no definitive evidence of metastasis. There were also signs of mild intra- and extrahepatic biliary ductal dilatation.
4. Endoscopic ultrasound (EUS) guided biopsy of the pancreatic mass was performed, confirming the presence of adenocarcinoma.

Differential Diagnosis:
1. Pancreatic Adenocarcinoma (primary diagnosis based on imaging and biopsy).
2. Chronic pancreatitis: Considered due to the presentation of abdominal pain and weight loss; however, the presence of a distinct mass and the biopsy result favor malignancy.
3. Gastric or duodenal ulcers: Typically present with epigastric pain often alleviated by eating, which is not consistent with this patient's symptoms.
4. Celiac disease: Could explain symptoms but lacks the specificity of imaging findings associated with intestinal villi atrophy, and serological markers were negative.
5. IBD (Inflammatory Bowel Disease): Location and nature of the pain, along with a lack of diarrheal symptoms, make this less likely.

Diagnosis: Stage IIA (T3N0M0 based on the TNM classification) pancreatic adenocarcinoma.

Plan and Recommendations:
Given the patient’s diabetes and cardiovascular status, multidisciplinary evaluation including gastroenterology, oncology, endocrinology, and cardiology is imperative for a comprehensive treatment plan.

1. Medical management: Initiate glucose and hypertension control optimization in collaboration with endocrinology and cardiology.
2. Oncological management: Considering the localized nature of the pancreatic cancer without evidence of distant metastasis, the patient is a candidate for surgical resection likely followed by adjuvant chemotherapy. The surgical approach would initially involve a Whipple procedure (pancreaticoduodenectomy) given the tumor's location.
3. Referral to a dietician for nutritional support, focusing on managing weight loss and optimizing nutritional status both pre-and post-operatively.
4. Regular follow-ups for monitoring tumor response and managing the side effects of subsequent therapies.

Importantly, discussions around the prognosis, expected outcomes, and the importance of advance care planning should be initiated early in the treatment process, considering the aggressive nature of pancreatic cancer and the potential for significant treatment-associated morbidity. The care team should also ensure to engage in empathetic communication, providing the patient with the necessary psychological support throughout her cancer journey.
"""

# Wrap the example in the REPORT/SUMMARY layout and let the model continue it.
prompt = f"REPORT:\n{example}SUMMARY:\n"
response = llama(prompt, max_tokens=1000)

# Print only the generated text of the first returned choice.
completion = response["choices"][0]["text"]
print(completion)