In [None]:
! pip install huggingface-hub

In [None]:
! pip install datasets

In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
import os
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer  # trainer for supervised fine-tuning (SFT)
from unsloth import is_bfloat16_supported  # checks if the hardware supports bfloat16 operations

from huggingface_hub import login
from transformers import TrainingArguments
from datasets import load_dataset  # Lets you load fine-tuning datasets in huggingface


# Never store a real access token in the notebook source: reuse HF_TOKEN when the
# runtime already provides it, otherwise ask for it interactively without echoing.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass  # local import keeps this cell self-contained

    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")
hugging_face_token = os.environ["HF_TOKEN"]
login(hugging_face_token)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
# Parameters used to load the pre-trained model
max_seq_length = 1024  # Define the maximum sequence length a model can handle (i.e. how many tokens can be processed at once)
dtype = None  # set to default
load_in_4bit = True  # Enables 4-bit quantization - a memory saving optimization


# Pass the settings declared above instead of repeating the literals, so that
# editing one of them actually changes how the model is loaded.
model_og, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model_og,
    r = 64,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 128,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    max_seq_length = max_seq_length,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)


# Load the R1 model and tokenizer using unsloth - imported using FastLanguageModel
# model_og, tokenizer = FastLanguageModel.from_pretrained(
#     model_name="unsloth/DeepSeek-R1-Distill-Qwen-1.5B",  # ref: https://huggingface.co/unsloth/DeepSeek-R1
#     max_seq_length=max_seq_length,
#     dtype=dtype,
#     load_in_4bit=load_in_4bit,
#     token=hugging_face_token
# )


==((====))==  Unsloth 2025.3.1: Fast Qwen2 patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.81G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/6.78k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

Unsloth 2025.3.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [8]:
# Inference prompt template with two positional `{}` slots:
#   1. the patient's meal log,
#   2. text placed right after the opening <think> tag (the generation cells in
#      this notebook pass an empty string so the model writes its own reasoning).
# The literal "### Response:" header is also the marker the generation cells
# split on to extract the model's answer, so keep the two in sync.
prompt_style = """Given a end stage Heart Failue patient's on Mineralocorticoid receptor antagonists (MRAs) meal log
provide dietary feedback to the patients think and try to provide the question,
- Patients on MRAs need to control their potassium
- Patients with end stage heart failure need to control their sodium (salt), fluid, minerals, and fat.
- Be concise and to the point the recommendations should come from patient's meal log.
- Cant drink a lot of water because end stage Heart Failue need to control their fluid.

### Meal Log:
{}


### Response:
<think>{}"""

In [None]:
# Example meal log used for a quick qualitative check before fine-tuning.
question = "9:00 AM - Grits with butter, 1:00 PM - Fried chicken with collard greens, 7:00 PM - Sweet potato pie."
# Put the model into Unsloth's inference mode before calling generate().
FastLanguageModel.for_inference(model_og)
# Fill the template (meal log + empty reasoning slot), tokenize it as a batch of
# one and move the tensors to the GPU.
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")
# generate response using the model
# NOTE(review): no max_new_tokens is passed here, unlike the later generation
# cells — the output length therefore depends on the library defaults.
outputs = model_og.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    use_cache=True
)
# decode the generated output tokens into human-readable text
response = tokenizer.batch_decode(outputs)
# Show only what follows the "### Response:" header of the prompt.
print(response[0].split("### Response:")[1])

In [None]:
# Training prompt template for supervised fine-tuning. It has three positional
# `{}` slots that formatting_prompts_func fills, in order, with:
#   1. the meal log,
#   2. the nutrition diagnosis (placed between <think> and </think>),
#   3. the follow-up recommendations (placed after </think>).
# NOTE(review): the response header here is "### Response (Dietician Recommendation):"
# while the inference template uses "### Response:" — confirm the mismatch is intended.
train_prompt_style = """Given a end stage Heart Failue patient's on Mineralocorticoid receptor antagonists (MRAs) meal log
provide dietary feedback to the patients think and try to provide the question,
- Patients on MRAs need to control their potassium
- Patients with end stage heart failure need to control their sodium (salt), fluid, minerals, and fat.
- Be concise and to the point the recommendations should come from patient's meal log.
- Cant drink a lot of water because end stage Heart Failue need to control their fluid.

### Meal Log:
{}

### Response (Dietician Recommendation):
<think>
{}
</think>
{}"""

EOS_TOKEN = tokenizer.eos_token  # Appended to every training text by formatting_prompts_func
def formatting_prompts_func(examples):
    """Render a batch of examples into full training texts.

    Args:
        examples: Batched mapping with the columns ``meal_log``,
            ``nutrition_diagnosis`` and ``follow_up_recommendations``, each a
            sequence of equal length.

    Returns:
        ``{"text": [...]}`` with one string per example: ``train_prompt_style``
        filled with (meal log, diagnosis, recommendation) followed by ``EOS_TOKEN``.
    """
    rows = zip(
        examples["meal_log"],
        examples["nutrition_diagnosis"],
        examples["follow_up_recommendations"],
    )
    return {
        "text": [
            train_prompt_style.format(meal_log, diagnosis, recommendation) + EOS_TOKEN
            for meal_log, diagnosis, recommendation in rows
        ],
    }


In [None]:
from datasets import load_dataset
# Load dataset from a local JSONL file
# dataset = load_dataset("json", data_files="dummy_coldstart.jsonl")

import json
from datasets import Dataset

def load_custom_json(file_path):
    """Load the synthetic meal-log JSON file into a ``datasets.Dataset``.

    Args:
        file_path: Path to a JSON file whose top level is either a mapping of
            arbitrary ids to sample objects or a plain list of sample objects.
            Every sample must provide the keys ``meal_log``,
            ``nutrition_diagnosis`` and ``follow_up_recommendations``.

    Returns:
        A ``Dataset`` with those three columns, one row per sample, in file order.

    Raises:
        KeyError: If a sample lacks one of the required keys.
    """
    # Explicit UTF-8 so non-ASCII text decodes the same way on every platform
    # instead of depending on the locale's default encoding.
    with open(file_path, 'r', encoding="utf-8") as f:
        data = json.load(f)

    # The mapping keys are only ids; a top-level list is accepted as well.
    samples = data.values() if isinstance(data, dict) else data

    columns = {
        "meal_log": [],
        "nutrition_diagnosis": [],
        "follow_up_recommendations": [],
    }
    for sample in samples:
        for key, values in columns.items():
            values.append(sample[key])
    return Dataset.from_dict(columns)

# Load your custom JSON file
file_path = "gmdt_synthetic_data.json" #change to your file name
dataset = load_custom_json(file_path)

# load_custom_json returns a single Dataset (no splits yet); show its schema,
# then add the rendered "text" column that the trainer consumes.
print(dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)

# NOTE(review): model_og already received LoRA adapters (r=64) in the model
# loading cell; this requests a second, different configuration (r=8) on the
# same object. The training cell reloads the base model and re-creates the
# adapters, so confirm whether this call is still needed.
model = FastLanguageModel.get_peft_model(
    model_og,
    r=8,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=2025,
    use_rslora=False,
    loftq_config=None,
)


Dataset({
    features: ['meal_log', 'nutrition_diagnosis', 'follow_up_recommendations'],
    num_rows: 1026
})


Map:   0%|          | 0/1026 [00:00<?, ? examples/s]

In [None]:
# Fixed seed so the same rows land in the training split on every run
# (same value as the seed used by the training arguments).
dataset = dataset.train_test_split(test_size=0.1, seed=3407)

In [None]:
dataset["train"][0]

{'meal_log': '11:30 AM - 1 cup Greek yogurt, 1/4 cup strawberries. 2:30 PM - 4 oz grilled chicken salad (mixed greens, cucumber, olive oil dressing). 8:30 PM - 4 oz baked cod, 1 cup steamed asparagus.',
 'nutrition_diagnosis': 'Potential for excessive potassium from strawberries.',
 'follow_up_recommendations': '1. Limit strawberries to 1/4 cup 3 times weekly. 2. Monitor potassium levels. 3. Use olive oil moderately. 4. Read nutrition labels.',
 'text': "Given a end stage Heart Failue patient's on Mineralocorticoid receptor antagonists (MRAs) meal log \nprovide dietary feedback to the patients think and try to provide the question,\n- Patients on MRAs need to control their potassium\n- Patients with end stage heart failure need to control their sodium (salt), fluid, minerals, and fat.\n- Be concise and to the point the recommendations should come from patient's meal log.\n- Cant drink a lot of water because end stage Heart Failue need to control their fluid.\n\n### Meal Log:\n11:30 AM 

In [None]:
# pip install --upgrade transformers peft

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

In [None]:
import transformers
import peft

print(f"Transformers version: {transformers.__version__}")
print(f"PEFT version: {peft.__version__}")

Transformers version: 4.49.0
PEFT version: 0.14.0


In [None]:
# Reload the 4-bit base model so training starts from a freshly loaded model
# object, using the loading parameters declared at the top of the notebook.
model_og, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model_og,
    r = 64,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 128,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    max_seq_length = max_seq_length,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

# Supervised fine-tuning on the rendered "text" column of the training split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # effective batch size = 2 * 4 = 8
        # Use num_train_epochs = 1, warmup_ratio for full training runs!
        warmup_steps=5,
        # NOTE(review): the recorded training log reports 22 epochs over 923
        # examples for these 2500 steps — check for overfitting.
        max_steps=2500,
        learning_rate=2e-4,
        # Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=100,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        # `report_to` is a TrainingArguments option, not an SFTTrainer argument;
        # "none" disables external experiment trackers.
        report_to="none",
    ),
)

trainer_stats = trainer.train()


==((====))==  Unsloth 2025.3.1: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Converting train dataset to ChatML (num_proc=2):   0%|          | 0/923 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/923 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/923 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=2):   0%|          | 0/923 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 923 | Num Epochs = 22
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 2,500
 "-____-"     Number of trainable parameters = 73,859,072


Step,Training Loss
100,0.4326
200,0.1753
300,0.1333
400,0.102
500,0.086
600,0.0743
700,0.0641
800,0.0558
900,0.0471
1000,0.0427


In [None]:
question = "B: coffee x2 with cream and sugar cranberry almond crunch cereal (1 cup) w 1 cup 2% milk or instant apple cinnamon porridge made with water and milk added or toast with pb and jam and 100 g source yogurt no lunch but snacks on jelly beans, jell0, plain dads oatmeal cookies, lots of grapes or potato chips D: pork chops, sausages, chx, with sweet or white potato and frozen vegetables-variety; frozen fish and chips; grill cheese with tomato soup 9 pm snack-ice cream +/- jello or brownies or coconut bowl or pb cookies or cupcakes or oatmeal banana cookies"


# Qualitative check of the fine-tuned model on the meal log stored in `question`.
FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!
# NOTE(review): this uses the inference template ("### Response:" header), while
# training used "### Response (Dietician Recommendation):" — confirm intended.
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

# Cap the answer at 1200 newly generated tokens.
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
# Decode the token ids and show only the text after the "### Response:" header.
response = tokenizer.batch_decode(outputs)
print(response[0].split("### Response:")[1])


<think>
High sodium, saturated fat, potassium, and sugar intake.
</think>
1. Use low-sodium soy sauce or eliminate it from meals. 2. Use lean pork and trim visible fat. 3. Limit high-potassium fruits like bananas and replace with apples or berries. 4. Ensure sodium intake stays under 2,000 mg per day. 5. Avoid processed foods high in sodium. 6. Keep fluid intake to 2.5 liters or less daily.<｜end▁of▁sentence｜>


In [None]:
# Local output directory for the fine-tuned artifacts.
new_model_local = "DeepSeek-R1-GMDT-Qwen-COT"
# Save the model via save_pretrained and the tokenizer files.
model.save_pretrained(new_model_local)
tokenizer.save_pretrained(new_model_local)

# Additionally write the LoRA weights merged into the base model as 16-bit weights.
# NOTE(review): both saves target the same directory, which ends up holding
# adapter files (adapter_config.json, adapter_model.safetensors) next to the
# merged pytorch_model.bin — confirm that downstream loaders pick the intended one.
model.save_pretrained_merged(new_model_local, tokenizer, save_method = "merged_16bit",)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 1.8G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.47 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:00<00:00, 30.99it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving DeepSeek-R1-GMDT-Qwen-COT/pytorch_model.bin...
Done.


In [None]:
#### -------- PUSH TO HUB -------- ####
# new_model_online = "***/DeepSeek-R1-Test-COT"
# model.push_to_hub(new_model_online)
# tokenizer.push_to_hub(new_model_online)

# model.push_to_hub_merged(new_model_online, tokenizer, save_method = "merged_16bit")

In [None]:
! zip -r DeepSeek-R1-GMDT-Qwen-COT.zip DeepSeek-R1-GMDT-Qwen-COT/

  adding: DeepSeek-R1-GMDT-Qwen-COT/ (stored 0%)
  adding: DeepSeek-R1-GMDT-Qwen-COT/adapter_config.json (deflated 55%)
  adding: DeepSeek-R1-GMDT-Qwen-COT/tokenizer.json (deflated 81%)
  adding: DeepSeek-R1-GMDT-Qwen-COT/special_tokens_map.json (deflated 70%)
  adding: DeepSeek-R1-GMDT-Qwen-COT/config.json (deflated 50%)
  adding: DeepSeek-R1-GMDT-Qwen-COT/tokenizer_config.json (deflated 84%)
  adding: DeepSeek-R1-GMDT-Qwen-COT/pytorch_model.bin (deflated 12%)
  adding: DeepSeek-R1-GMDT-Qwen-COT/README.md (deflated 66%)
  adding: DeepSeek-R1-GMDT-Qwen-COT/adapter_model.safetensors (deflated 8%)
  adding: DeepSeek-R1-GMDT-Qwen-COT/generation_config.json (deflated 37%)


In [None]:
import pandas as pd


# Real diet histories with their reference recommendations; rows missing either
# field cannot be generated for or scored, so they are dropped.
df = pd.read_excel("GDMT data stacked KRYMAI.xlsx").dropna(
    subset=["diet_history", "follow_up_recommendations"]
)
questions = df["diet_history"].values.tolist()
followup_recommendations = df["follow_up_recommendations"].values.tolist()

# One generated answer per diet history, in the same order as `questions`.
recommendations = []
for q in questions:
    inputs = tokenizer([prompt_style.format(q, "")], return_tensors="pt").to("cuda")
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=1200,
        use_cache=True,
    )
    response = tokenizer.batch_decode(outputs)

    # Keep only the text that follows the "### Response:" header of the prompt.
    generated = response[0].split("### Response:")[1]
    print(f"Diet History: {q}")
    print(f"Recommendation: {generated}")
    recommendations.append(generated)
    print("\n")


# Persist inputs, references and generations side by side for later inspection.
df_resp = pd.DataFrame(
    {
        "diet_history": questions,
        "actual_recommendations": followup_recommendations,
        "generated_recommendations": recommendations,
    }
)
df_resp.to_csv("trained_model_recommendations.csv", index=False)

Diet History: 9 am-gluten free bagel with peanut butter or salted butter or plain or frozen waffles/pancake-mix with real maple syrup or gluten-free oatmeal made with water plus brown sugar (1 TBSP) and berries 1300 hrs- cheese (2 slices of cheddar) or tuna sandwich plus dry apricots and 1 apple or gluten-free crackers with cheese or leftovers 5:50 pm-Fish with 2 cups rice with asparagus (6) and green beans (1 cup) or hamburger patty no bun and salad with prepared or homemade dressing or chicken breast (5 oz) with rice and vegs-1.5 cups (peppers, mushrooms, carrot, turnip, cabbage, tomatoes
Recommendation: 
<think>
Excessive sodium intake from processed or canned foods.
</think>
1. Avoid processed meats and canned vegetables. 2. Limit high-potassium fruits like bananas or replace with apples or berries. 3. Ensure sodium intake stays under 2,000 mg per day.<｜end▁of▁sentence｜>


Diet History: Breakfast at 9:30 am: 1 cup oatmeal made with water or milk with brown sugar with 1/4 cup milk o

In [None]:
df_resp.head()

In [None]:
len(df)

47

In [None]:
! pip install textstat rouge-score

Collecting textstat
  Downloading textstat-0.7.5-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.0.32-py3-none-any.whl.metadata (3.6 kB)
Downloading textstat-0.7.5-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.3/105.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cmudict-1.0.32-py3-none-any.whl (939 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, cmudict, textstat
Successfully installed cmudict-1.0.32 pyphen-0.17.2 textstat-0.7.5


In [None]:
import math
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from textstat.textstat import textstat
from sklearn.metrics import precision_recall_fscore_support
from transformers import pipeline

# Ensure nltk is properly downloaded
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# 1. Precision, Recall, F1-Score (Token-level calculation)
def precision_recall_f1(generated_text, reference_text):
    """Token-overlap precision, recall and F1 between two texts.

    Both texts are lower-cased and tokenized with ``nltk.word_tokenize``.
    Counting is per token occurrence:

    * true positives  - generated tokens that occur in the reference,
    * false positives - generated tokens that do not occur in the reference,
    * false negatives - reference tokens that do not occur in the generation.

    Args:
        generated_text: Model output to score.
        reference_text: Ground-truth text.

    Returns:
        ``(precision, recall, f1)``; each value is ``0`` when its denominator
        would be zero.
    """
    generated_tokens = nltk.word_tokenize(generated_text.lower())
    reference_tokens = nltk.word_tokenize(reference_text.lower())

    # Set lookups keep the matching linear in the text length; the counts are
    # identical to testing membership against the token lists.
    reference_vocab = set(reference_tokens)
    generated_vocab = set(generated_tokens)

    true_positives = sum(1 for token in generated_tokens if token in reference_vocab)
    # Every generated token is either a true or a false positive.
    false_positives = len(generated_tokens) - true_positives
    false_negatives = sum(1 for token in reference_tokens if token not in generated_vocab)

    predicted_total = true_positives + false_positives
    precision = true_positives / predicted_total if predicted_total else 0

    relevant_total = true_positives + false_negatives
    recall = true_positives / relevant_total if relevant_total else 0

    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)

    return precision, recall, f1


# 2. BLEU Score
def bleu_score(reference_text, generated_text):
    """Sentence-level BLEU of ``generated_text`` against a single reference.

    Both texts are lower-cased and word-tokenized. ``SmoothingFunction().method1``
    is passed as the smoothing function to avoid zero BLEU scores.

    Args:
        reference_text: Ground-truth text.
        generated_text: Model output to score.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    references = [nltk.word_tokenize(reference_text.lower())]
    hypothesis = nltk.word_tokenize(generated_text.lower())
    return sentence_bleu(
        references,
        hypothesis,
        smoothing_function=SmoothingFunction().method1,
    )


# 3. ROUGE Score
def rouge_score_fn(reference_text, generated_text):
    """Score ``generated_text`` against ``reference_text`` with ROUGE.

    A ``RougeScorer`` for ``rouge1``, ``rouge2`` and ``rougeL`` is created with
    stemming enabled on every call.

    Args:
        reference_text: Ground-truth text (passed first to ``score``).
        generated_text: Model output (passed second to ``score``).

    Returns:
        Whatever ``RougeScorer.score`` returns; callers in this notebook index
        it by metric name and read ``.fmeasure``.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(
        reference_text, generated_text
    )


# 4. Perplexity Score (for Language Model)
def perplexity(generated_text):
    """Unigram "perplexity" of a text measured against its own token frequencies.

    The text is lower-cased and word-tokenized, each distinct token gets the
    probability ``count / total``, and the result is ``2 ** H`` where ``H`` is
    the Shannon entropy (base 2) of that distribution. No language model is
    involved: the number reflects the lexical diversity of the text itself.

    Args:
        generated_text: Text to analyse.

    Returns:
        ``2 ** H`` as a float; ``1.0`` for an empty text.
    """
    from collections import Counter  # local import keeps the function self-contained

    tokens = nltk.word_tokenize(generated_text.lower())
    token_count = len(tokens)

    # Count every token once instead of calling tokens.count() per unique token,
    # which rescans the whole list each time.
    counts = Counter(tokens)
    probabilities = [counts[token] / token_count for token in set(tokens)]

    unigram_entropy = -sum(p * math.log2(p) for p in probabilities)
    return math.pow(2, unigram_entropy)


# 5. Flesch-Kincaid Readability Score
def flesch_kincaid_readability(generated_text):
    """Return the Flesch-Kincaid grade of ``generated_text`` as computed by textstat.

    Args:
        generated_text: Text to analyse.

    Returns:
        The value of ``textstat.flesch_kincaid_grade(generated_text)``.
    """
    grade_level = textstat.flesch_kincaid_grade(generated_text)
    return grade_level


# 6. Entropy
def entropy(generated_text):
    """Shannon entropy (base 2) of the token distribution of a text.

    The text is lower-cased and word-tokenized; each distinct token gets the
    probability ``count / total``.

    Args:
        generated_text: Text to analyse.

    Returns:
        ``-sum(p * log2(p))`` over the distinct tokens; ``0`` for an empty text.
    """
    from collections import Counter  # local import keeps the function self-contained

    tokens = nltk.word_tokenize(generated_text.lower())
    token_count = len(tokens)

    # Count every token once instead of calling tokens.count() per unique token,
    # which rescans the whole list each time.
    counts = Counter(tokens)
    token_frequencies = {token: counts[token] / token_count for token in set(tokens)}

    entropy_value = -sum(p * math.log2(p) for p in token_frequencies.values())
    return entropy_value

In [None]:
# Per-sample metric accumulators for the fine-tuned model (read by the report cell).
p, r, f = [], [], []
bleu, rogue1, rogue2, rogueL = [], [], [], []
pp, fk, ent = [], [], []

# NOTE(review): both lists are sliced with [1:], so the first sample is never
# scored — confirm that skipping it is intentional.
for reference_text, generated_text in zip(followup_recommendations[1:], recommendations[1:]):
    # Token-overlap metrics
    precision, recall, f1 = precision_recall_f1(generated_text, reference_text)
    p.append(precision)
    r.append(recall)
    f.append(f1)

    # N-gram overlap metrics (the F-measure of each ROUGE variant is kept)
    bleu.append(bleu_score(reference_text, generated_text))
    rouge = rouge_score_fn(reference_text, generated_text)
    for bucket, key in ((rogue1, 'rouge1'), (rogue2, 'rouge2'), (rogueL, 'rougeL')):
        bucket.append(rouge[key].fmeasure)

    # Reference-free statistics of the generated text
    pp.append(perplexity(generated_text))
    fk.append(flesch_kincaid_readability(generated_text))
    ent.append(entropy(generated_text))

In [None]:
# Mean of every per-sample metric for the fine-tuned model, rounded to 4 places.
# The labels name the metric correctly (ROUGE) and match the model that was
# actually loaded (DeepSeek-R1-Distill-Qwen-1.5B).
print(f"""
Trained Qwen-1.5B recommendation performance:
      \n Precision: {round(sum(p)/len(p), 4)}, Recall: {round(sum(r)/len(r), 4)}, F1-score: {round(sum(f)/len(f), 4)}
      \n BLEU Score: {round(sum(bleu)/len(bleu), 4)}, ROUGE-1: {round(sum(rogue1)/len(rogue1), 4)}, ROUGE-2: {round(sum(rogue2)/len(rogue2), 4)}, ROUGE-L: {round(sum(rogueL)/len(rogueL), 4)}
      \n Perplexity: {round(sum(pp)/len(pp), 4)}, Flesch-Kincaid Readability Grade: {round(sum(fk)/len(fk), 4)}, Entropy: {round(sum(ent)/len(ent), 4)}
""")


Trained Qwen-1B recommendation performance:
      
 Precision: 0.2727, Recall: 0.2484, F1-score: 0.2445
      
 BLEU Score: 0.0161, ROGUE-1: 0.2835, ROGUE-2: 0.0521, ROGUE-L: 0.1779
      
 Perplexity: 36.0484, Flesch-Kincaid Readability Grade: 8.7043, Entropy: 5.1414



# Comparing to the performance without SFT

In [None]:
question = "9:00 AM - Grits with butter, 1:00 PM - Fried chicken with collard greens, 7:00 PM - Sweet potato pie."
FastLanguageModel.for_inference(model)


# Baseline ("without SFT") generations for the same diet histories.
#
# get_peft_model() injects the LoRA layers into the base model object in place,
# so after training model_og.generate() would still run through the fine-tuned
# adapters. Generating inside model.disable_adapter() bypasses them and yields
# the behaviour of the original pre-trained weights.
wo_sft_recommendation = []
for q in questions:
    inputs = tokenizer([prompt_style.format(q, "")], return_tensors="pt").to("cuda")

    with model.disable_adapter():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=1200,  # same generation budget as the fine-tuned run
            use_cache=True,
        )
    response = tokenizer.batch_decode(outputs)

    # Keep only the text that follows the "### Response:" header of the prompt.
    generated = response[0].split("### Response:")[1]
    print(f"Diet History: {q}")
    print(f"Recommendation: {generated}")
    wo_sft_recommendation.append(generated)
    print("\n")

# Persist inputs, references and baseline generations side by side.
df_resp = pd.DataFrame()
df_resp["diet_history"] = questions
df_resp["actual_recommendations"] = followup_recommendations
df_resp["generated_recommendations_wo_sft"] = wo_sft_recommendation
df_resp.to_csv("wo_sft_model_recommendations.csv", index=False)

In [None]:
# Per-sample metric accumulators for the model without SFT (read by the report cell).
p, r, f = [], [], []
bleu, rogue1, rogue2, rogueL = [], [], [], []
pp, fk, ent = [], [], []

# NOTE(review): both lists are sliced with [1:], so the first sample is never
# scored — confirm that skipping it is intentional.
for reference_text, generated_text in zip(followup_recommendations[1:], wo_sft_recommendation[1:]):
    # Token-overlap metrics
    precision, recall, f1 = precision_recall_f1(generated_text, reference_text)
    p.append(precision)
    r.append(recall)
    f.append(f1)

    # N-gram overlap metrics (the F-measure of each ROUGE variant is kept)
    bleu.append(bleu_score(reference_text, generated_text))
    rouge = rouge_score_fn(reference_text, generated_text)
    for bucket, key in ((rogue1, 'rouge1'), (rogue2, 'rouge2'), (rogueL, 'rougeL')):
        bucket.append(rouge[key].fmeasure)

    # Reference-free statistics of the generated text
    pp.append(perplexity(generated_text))
    fk.append(flesch_kincaid_readability(generated_text))
    ent.append(entropy(generated_text))

In [None]:
# Mean of every per-sample metric for the out-of-the-box model, rounded to 4 places.
# The labels name the metric correctly (ROUGE) and match the model that was
# actually loaded (DeepSeek-R1-Distill-Qwen-1.5B).
print(f"""
OOB Qwen-1.5B recommendation performance:
      \n Precision: {round(sum(p)/len(p), 4)}, Recall: {round(sum(r)/len(r), 4)}, F1-score: {round(sum(f)/len(f), 4)}
      \n BLEU Score: {round(sum(bleu)/len(bleu), 4)}, ROUGE-1: {round(sum(rogue1)/len(rogue1), 4)}, ROUGE-2: {round(sum(rogue2)/len(rogue2), 4)}, ROUGE-L: {round(sum(rogueL)/len(rogueL), 4)}
      \n Perplexity: {round(sum(pp)/len(pp), 4)}, Flesch-Kincaid Readability Grade: {round(sum(fk)/len(fk), 4)}, Entropy: {round(sum(ent)/len(ent), 4)}
""")