<a href="https://colab.research.google.com/github/BharathChinthallapalli/prompttune/blob/main/finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Global Configurations

In [1]:
# --- Global Configurations ---
from peft import PromptTuningInit # Added here for clarity
import os
import random
import torch # For device check in interactive cell
import io # For reading uploaded files

# Model Configuration
base_model_name = "bigscience/bloomz-560m"  # Base model for fine-tuning

# PEFT Configuration
peft_num_virtual_tokens = 8
peft_prompt_tuning_init = PromptTuningInit.RANDOM

# Tokenizer Configuration
max_seq_length = 128  # Maximum sequence length for tokenizer

# Training Configuration
training_output_dir = "./prompt_tuned_model" # Used for PEFT adapter too
training_learning_rate = 5e-4
training_num_epochs = 2 # Keep low for quick demo; increase for better results
training_per_device_batch_size = 2
training_report_to = "none" # Set to "wandb" or "tensorboard" if needed

# Evaluation Configuration
evaluation_per_device_batch_size = 2
evaluation_limit_samples = 20 # Number of validation samples to evaluate on, set to None for all

print("Global configurations set.")

Global configurations set.


In [2]:
# Install necessary libraries and set environment variables.
!pip install --quiet "transformers>=4.38.0" "peft>=0.8.0" "datasets" "accelerate" "bert-score" "evaluate" "fsspec>=2023.5.0"

import os
# Set TOKENIZERS_PARALLELISM to false to avoid potential deadlocks with tokenizers when using fork.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

import transformers
print("Transformers version:", transformers.__version__)
print("Pip install cell complete.")

Transformers version: 4.52.3
Pip install cell complete.


In [3]:
# Import necessary libraries for file handling and data manipulation.
from google.colab import files
import pandas as pd
# import io # Moved to global config cell

# Prompt user to upload CSV files.
print("Upload ALL your .csv files (train/val/prompt/response/etc).")
uploads = files.upload()
print(f"Uploaded file keys: {list(uploads.keys())}")

csv_files = [fname for fname in uploads.keys() if fname.endswith('.csv')]
print(f"Detected CSV files: {csv_files}")

dfs = {}
for fname in csv_files:
    print(f"Processing {fname}...")
    dfs[fname] = pd.read_csv(io.BytesIO(uploads[fname]))

print(f"Loaded {len(dfs)} dataframes: {list(dfs.keys())}")
print("Columns for each loaded dataframe:")
for fname, df in dfs.items():
    print(f"{fname}: {list(df.columns)}")

Upload ALL your .csv files (train/val/prompt/response/etc).


Saving labeled_train_final.csv to labeled_train_final (1).csv
Saving labeled_validation_final.csv to labeled_validation_final (1).csv
Saving prompt_examples_dataset.csv to prompt_examples_dataset (1).csv
Saving Prompt_Examples.csv to Prompt_Examples (1).csv
Saving Response_Examples.csv to Response_Examples (1).csv
Uploaded file keys: ['labeled_train_final (1).csv', 'labeled_validation_final (1).csv', 'prompt_examples_dataset (1).csv', 'Prompt_Examples (1).csv', 'Response_Examples (1).csv']
Detected CSV files: ['labeled_train_final (1).csv', 'labeled_validation_final (1).csv', 'prompt_examples_dataset (1).csv', 'Prompt_Examples (1).csv', 'Response_Examples (1).csv']
Processing labeled_train_final (1).csv...
Processing labeled_validation_final (1).csv...
Processing prompt_examples_dataset (1).csv...
Processing Prompt_Examples (1).csv...
Processing Response_Examples (1).csv...
Loaded 5 dataframes: ['labeled_train_final (1).csv', 'labeled_validation_final (1).csv', 'prompt_examples_dataset

In [4]:
# Helper: Find best-matching column for a role
def auto_col(df, choices):
    """Automatically selects the best column name from a list of choices.
    Tries to find an exact match first, then a case-insensitive match.
    :param df: The DataFrame to search for columns. :type df: pandas.DataFrame
    :param choices: A list of column names to search for, in order of preference. :type choices: list[str]
    :return: The best matching column name, or None if no match is found. :rtype: str | None
    """
    for c in choices:
        if c in df.columns: return c
    for c in choices:
        for cc in df.columns:
            if cc.lower() == c.lower(): return cc
    return None

all_train = []
all_val = []
first_sft_processed = False

for fname, df in dfs.items():
    print(f"Extracting SFT pairs from {fname}...")
    if 'improved_instruction' in df.columns:
        orig = auto_col(df, ['original_prompt', 'prompt', 'input'])
        ctx  = auto_col(df, ['context', 'task_context', ''])
        instr = auto_col(df, ['instruction'])
        tgt = auto_col(df, ['improved_instruction', 'target'])
        for _, row in df.iterrows():
            input_str = f"Original Prompt: {str(row.get(orig,''))}"
            if ctx and str(row.get(ctx,'')) and str(row.get(ctx,''))!='nan':
                input_str += f"\nContext: {row[ctx]}"
            if instr and str(row.get(instr,'')) and str(row.get(instr,'')) != str(row.get(orig,'')):
                input_str += f"\nInstruction: {row[instr]}"
            output_str = str(row[tgt])
            all_train.append({'input':input_str.strip(), 'output':output_str.strip()})
            if not first_sft_processed:
                print(f"  Sample input_str for SFT: {input_str.strip()}")
                print(f"  Sample output_str for SFT: {output_str.strip()}")
                first_sft_processed = True
    if 'bad_prompt' in df.columns and 'good_prompt' in df.columns:
        tdesc = auto_col(df, ['task_description'])
        tech = auto_col(df, ['prompting_techniques'])
        for _, row in df.iterrows():
            input_ = f"Task: {row[tdesc]}\nBad Prompt: {row['bad_prompt']}\nTechniques: {row[tech]}"
            all_train.append({'input': input_, 'output': row['good_prompt']})
    if 'Base_Prompt' in df.columns and 'V1_Prompt' in df.columns and 'V2_instruction' in df.columns:
        for _, row in df.iterrows():
            if str(row['Base_Prompt']) and str(row['V1_Prompt']):
                all_train.append({'input': row['Base_Prompt'], 'output': row['V1_Prompt']})
            if str(row['V1_Prompt']) and str(row['V2_instruction']):
                all_train.append({'input': row['V1_Prompt'], 'output': row['V2_instruction']})
print(f"Total SFT pairs initially extracted: {len(all_train)}")

# import random # Moved to global config cell
random.shuffle(all_train)
print("Shuffled all_train list.")

split = int(0.9*len(all_train))
train_data = all_train[:split]
val_data = all_train[split:]

print(f"Number of training samples: {len(train_data)}")
print(f"Number of validation samples: {len(val_data)}")
print(f"Example training sample after processing: {train_data[0] if train_data else 'N/A'}")

Extracting SFT pairs from labeled_train_final (1).csv...
  Sample input_str for SFT: Original Prompt: I want you to generate a prompt for me identifying the key inputs you would need in order to generate extrememly specific OKRs for a person in  a b2b saas company.
  Sample output_str for SFT: Act as an expert AI specialized in crafting Objectives and Key Results (OKRs). Your task is to design the *ideal prompt* that would enable you to generate exceptionally specific, measurable, and relevant OKRs for any given individual working within a B2B SaaS company. This prompt must effectively solicit all necessary contextual information. Construct this prompt to explicitly request details regarding:
*   The individual's precise role, scope, and key responsibilities.
*   The overarching strategic goals of the company applicable to the OKR timeframe.
*   The specific objectives of the individual's team or department.
*   Quantifiable metrics or KPIs typically associated with the role or desired

In [5]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
val_dataset = Dataset.from_pandas(pd.DataFrame(val_data))

print(f"Hugging Face Train Dataset: {train_dataset}")
print(f"Hugging Face Validation Dataset: {val_dataset}")

Hugging Face Train Dataset: Dataset({
    features: ['input', 'output'],
    num_rows: 2451
})
Hugging Face Validation Dataset: Dataset({
    features: ['input', 'output'],
    num_rows: 273
})


In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# model_name is now base_model_name from global config
tokenizer = AutoTokenizer.from_pretrained(base_model_name) # Use global config
print(f"Tokenizer loaded for {base_model_name}.")
model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True)
print(f"Model {base_model_name} loaded.")

def preprocess(batch):
    """Tokenizes the input and output batches for model training.
    :param batch: A batch of data. :type batch: dict
    :return: Tokenized inputs with labels. :rtype: dict
    """
    if batch['input']:
        print(f"Original input to preprocess (first item): {batch['input'][0]}")
    if batch['output']:
        print(f"Original output to preprocess (first item): {batch['output'][0]}")

    inputs = tokenizer(
        batch['input'], truncation=True, padding='max_length', max_length=max_seq_length # Use global config
    )
    labels = tokenizer(
        batch['output'], truncation=True, padding='max_length', max_length=max_seq_length # Use global config
    )
    inputs['labels'] = labels['input_ids']
    return inputs

train_dataset = train_dataset.map(preprocess, batched=True)
val_dataset = val_dataset.map(preprocess, batched=True)
print(f"Train dataset after preprocessing: {train_dataset}")
if len(train_dataset) > 0:
    print(f"Sample processed train item: {train_dataset[0]}")

train_dataset.set_format(type='torch', columns=['input_ids','attention_mask','labels'])
val_dataset.set_format(type='torch', columns=['input_ids','attention_mask','labels'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Tokenizer loaded for bigscience/bloomz-560m.
Model bigscience/bloomz-560m loaded.


Map:   0%|          | 0/2451 [00:00<?, ? examples/s]

Original input to preprocess (first item): Original Prompt: Generate code for ESP8266 on Arduino IDE to do the following:

Store the following varialbles in a structure and provide default values for each. Save the structure in NVRAM.

char *ssid = "default_ssid";
char *ssidPW = "default_passphrase";
char *mqttBROKER = "192.168.2.6";
int HeatSP = 66;
int ExhaustSP = 80;
float deadBAND = 0.25;
bool circulatingfansEN = true;
bool circulatingfansINVERT = true;
bool circulatingfansStopOnHeat = true;
unsigned long updateMS = 2000;
unsigned long flashupdateMS = 60000;
unsigned long statusupdateMS = 10000;
unsigned long publishMS = 60000;

provide code to update all these variables in NVRAM from an MQTT command with a JSON string. 
Connect to the MQTT broker with non-blocking code.
Connect to the WiFi network with non-blocking code.
provide code to return the values with an MQTT command as a JSON message.
Provide code to update the ssid, ssidPW, and mqttBROKER from a JSON formatted MQTT comma

Map:   0%|          | 0/273 [00:00<?, ? examples/s]

Original input to preprocess (first item): Original Prompt: I want to make software which implements machine learning in order for determining what the person depicted in the video provided can do to improve in the sport they are playing. The goal for the software is to be provided live video footage, to analyze the footage via the trained model, and then to determine corrective actions in order for the player to improve.
Original output to preprocess (first item): Develop a real-time machine learning system for sports coaching.

**System Functionality:**

1.  **Input:** Process live video footage capturing an athlete performing actions specific to their sport.
2.  **Core Analysis:**
    *   Employ computer vision techniques (e.g., pose estimation, action recognition) to extract key biomechanical data and movement patterns from the athlete in the video stream.
    *   Utilize a trained machine learning model to compare the athlete's real-time movements and technique against a pre-defin

In [7]:
from peft import get_peft_model, PromptTuningConfig, TaskType # PromptTuningInit is in global_config cell
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

tuning_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=peft_prompt_tuning_init, # Use global config
    num_virtual_tokens=peft_num_virtual_tokens, # Use global config
    tokenizer_name_or_path=base_model_name # Use global config
)
peft_model = get_peft_model(model, tuning_config)
print(f"PEFT model created with {tuning_config.num_virtual_tokens} virtual tokens.")

training_args = TrainingArguments(
    output_dir=training_output_dir, # Use global config
    per_device_train_batch_size=training_per_device_batch_size, # Use global config
    per_device_eval_batch_size=evaluation_per_device_batch_size, # Use global config
    learning_rate=training_learning_rate, # Use global config
    num_train_epochs=training_num_epochs, # Use global config
    logging_steps=10,
    report_to=training_report_to # Use global config
)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
print("Starting model training...")
trainer.train()
print("Model training complete.")

PEFT model created with 8 virtual tokens.


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting model training...


Step,Training Loss
10,5.4738
20,5.4427
30,4.7145
40,5.2697
50,5.2239
60,5.9667
70,5.4507
80,4.8512
90,5.513
100,5.2261


Model training complete.


In [8]:
# --- Save PEFT Adapter and Tokenizer ---
print(f"Saving PEFT adapter and tokenizer to {training_output_dir}/final_adapter...")
peft_model_path = f"{training_output_dir}/final_adapter"
trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path) # Save tokenizer with the adapter
print(f"PEFT adapter and tokenizer saved to {peft_model_path}")

# Optional: Persist to Colab disk if needed for later sessions (requires mounting Google Drive)
# from google.colab import drive
# drive.mount('/content/drive')
# !mkdir -p /content/drive/My\ Drive/prompt_tuned_model_adapters/
# !cp -r {peft_model_path} /content/drive/My\ Drive/prompt_tuned_model_adapters/
# print(f"Adapter also copied to Google Drive: /content/drive/My Drive/prompt_tuned_model_adapters/{os.path.basename(peft_model_path)}")

Saving PEFT adapter and tokenizer to ./prompt_tuned_model/final_adapter...
PEFT adapter and tokenizer saved to ./prompt_tuned_model/final_adapter


## Load Fine-tuned PEFT Adapter and Run Inference

In [13]:
# --- Load Fine-tuned PEFT Adapter ---
from transformers import AutoModelForCausalLM, AutoTokenizer # Ensure these are imported
from peft import PeftModel # Ensure PeftModel is imported
# import random # Already in global config cell
# import os # Already in global config cell

print(f"Loading base model ({base_model_name}) for PEFT adapter...")

base_model_for_loading = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer_for_loading = AutoTokenizer.from_pretrained(base_model_name) # Use base_model_name for consistency

peft_adapter_path = f"{training_output_dir}/final_adapter"
print(f"Loading PEFT adapter from: {peft_adapter_path}")

loaded_peft_model = PeftModel.from_pretrained(base_model_for_loading, peft_adapter_path)
loaded_peft_model.to(base_model_for_loading.device)
loaded_peft_model.eval()

print("PEFT model with fine-tuned adapter loaded successfully.")

# --- Example Inference with Loaded Adapter ---
if 'train_data' in globals() and train_data:
    fewshot_examples_for_loaded = random.sample(train_data, 2)
    test_prompt_loaded = "Describe a futuristic city."

    # build_fewshot_prompt needs to be defined *before* this cell runs.
    # The definition is now included below this cell in the notebook.

    fewshot_input_loaded_str = build_fewshot_prompt(test_prompt_loaded, fewshot_examples=fewshot_examples_for_loaded)
    print(f"\nTest prompt for loaded model: {test_prompt_loaded}")
    print(f"Few-shot input string for loaded model:\n{fewshot_input_loaded_str}")

    inputs_loaded = tokenizer_for_loading(fewshot_input_loaded_str, return_tensors="pt", padding=True, truncation=True, max_length=max_seq_length)
    inputs_loaded = {k: v.to(loaded_peft_model.device) for k, v in inputs_loaded.items()}

    print("\nGenerating output with loaded model...")
    outputs_loaded = loaded_peft_model.generate(
        input_ids=inputs_loaded["input_ids"],
        attention_mask=inputs_loaded["attention_mask"],
        max_new_tokens=100,
        eos_token_id=tokenizer_for_loading.eos_token_id,
        repetition_penalty=1.2
    )
    decoded_output_loaded = tokenizer_for_loading.decode(outputs_loaded[0], skip_special_tokens=True)

    answer_start_index = decoded_output_loaded.rfind("A:") + 2
    final_answer_loaded = decoded_output_loaded[answer_start_index:].strip() if answer_start_index > 1 else decoded_output_loaded

    print(f"\nGenerated Answer (loaded model):\n{final_answer_loaded}")
else:
    print("Skipping inference with loaded model as train_data is not available to create few-shot examples.")

Loading base model (bigscience/bloomz-560m) for PEFT adapter...
Loading PEFT adapter from: ./prompt_tuned_model/final_adapter
PEFT model with fine-tuned adapter loaded successfully.
Building few-shot prompt for user_prompt: Describe a futuristic city....
Constructed few-shot prompt (first 200 chars): Q: Original Prompt: give me product requirements for a disability services product in higher education
A: Generate a detailed list of product requirements for a software platform designed to manage an...

Test prompt for loaded model: Describe a futuristic city.
Few-shot input string for loaded model:
Q: Original Prompt: give me product requirements for a disability services product in higher education
A: Generate a detailed list of product requirements for a software platform designed to manage and deliver disability support services within a higher education institution. Assume the role of a Lead Product Manager developing this system.

**Primary Goals:**
1.  Streamline the process for s

In [12]:
def build_fewshot_prompt(user_prompt, fewshot_examples=[]):
    """Builds a few-shot prompt string from examples and a user query.
    :param user_prompt: The user's query. :type user_prompt: str
    :param fewshot_examples: List of dicts, each with 'input'/'output'. :type fewshot_examples: list[dict]
    :return: The constructed few-shot prompt. :rtype: str
    """
    print(f"Building few-shot prompt for user_prompt: {user_prompt[:100]}...")
    s = ""
    for ex in fewshot_examples:
        s += f"Q: {ex['input']}\nA: {ex['output']}\n"
    s += f"Q: {user_prompt}\nA:"
    print(f"Constructed few-shot prompt (first 200 chars): {s[:200]}...")
    return s

# This cell is for testing the build_fewshot_prompt function and inference with the original peft_model.
# Ensure 'train_data' is available from data preparation steps.
if 'train_data' in globals() and train_data:
    fewshot_examples_test = random.sample(train_data, 2) # Re-sample or use existing 'fewshot_examples'
    test_prompt_build = "Make me a summary about Berlin nightlife"
    print(f"\nTesting build_fewshot_prompt with: '{test_prompt_build}'")
    fewshot_input_str = build_fewshot_prompt(test_prompt_build, fewshot_examples=fewshot_examples_test)

    inputs_test = tokenizer(fewshot_input_str, return_tensors="pt")
    inputs_test = {k: v.to(peft_model.device) for k, v in inputs_test.items()}

    print("\nGenerating output with original peft_model for test_prompt_build...")
    outputs_test = peft_model.generate(
        input_ids=inputs_test["input_ids"], attention_mask=inputs_test["attention_mask"],
        max_new_tokens=128,
        repetition_penalty=1.2,
        eos_token_id=tokenizer.eos_token_id
    )
    decoded_output_test = tokenizer.decode(outputs_test[0], skip_special_tokens=True)
    print(f"Decoded output (original peft_model):\n{decoded_output_test}")
else:
    print("Skipping build_fewshot_prompt test as train_data is not available.")


Testing build_fewshot_prompt with: 'Make me a summary about Berlin nightlife'
Building few-shot prompt for user_prompt: Make me a summary about Berlin nightlife...
Constructed few-shot prompt (first 200 chars): Q: Original Prompt: Hello chatGPT! Today we will be establishing possible niches for a food and beverage professional. Please note, my interests and passions include wine, champagne, matchmaking, inte...

Generating output with original peft_model for test_prompt_build...
Decoded output (original peft_model):
Q: Original Prompt: Hello chatGPT! Today we will be establishing possible niches for a food and beverage professional. Please note, my interests and passions include wine, champagne, matchmaking, international delivery, signapore market.
Combining my interests and passions, please create 5 potential niches for me according to the following instructions:
1. The potential niche should be one that requires the service or skill identified above.
2. Provide a brief explanation 

In [16]:
import evaluate
# rouge = evaluate.load("rouge") # Defined in global config cell
# bertscore = evaluate.load("bertscore") # Defined in global config cell

def eval_on_val(model_to_eval, tokenizer_to_use, val_data_subset, current_fewshot_examples):
    """Evaluates the model on the validation set using ROUGE and BERTScore.
    :param model_to_eval: The model to evaluate (e.g. peft_model or loaded_peft_model)
    :type model_to_eval: PeftModel | AutoModelForCausalLM
    :param tokenizer_to_use: The tokenizer for the model.
    :type tokenizer_to_use: AutoTokenizer
    :param val_data_subset: The validation data subset.
    :type val_data_subset: list[dict]
    :param current_fewshot_examples: Few-shot examples to use in prompt construction.
    :type current_fewshot_examples: list[dict]
    """
    print("Starting evaluation on validation set...")
    refs, preds = [], []
    # Use evaluation_limit_samples from global_config
    eval_samples = val_data_subset[:evaluation_limit_samples] if evaluation_limit_samples is not None else val_data_subset

    for idx, item in enumerate(eval_samples):
        if idx < 3: # Print details for the first 3 samples
            print(f"  Evaluating item {idx+1} - Input: {item['input'][:100]}...")

        fewshot_input_str = build_fewshot_prompt(item['input'], fewshot_examples=current_fewshot_examples)
        inp = tokenizer_to_use(fewshot_input_str, return_tensors="pt", padding=True, truncation=True, max_length=max_seq_length) # Use global max_seq_length
        inp = {k: v.to(model_to_eval.device) for k, v in inp.items()}

        out = model_to_eval.generate(
            input_ids=inp["input_ids"], attention_mask=inp["attention_mask"],
            max_new_tokens=max_seq_length, # Max new tokens can also be parameterized
            eos_token_id=tokenizer_to_use.eos_token_id,
            repetition_penalty=1.2 # Added from previous inference example
        )

        input_length = inp["input_ids"].shape[1]
        generated_tokens = out[0][input_length:]
        pred = tokenizer_to_use.decode(generated_tokens, skip_special_tokens=True).strip()

        if idx < 3:
            print(f"    Generated prediction for item {idx+1}: {pred[:100]}...")

        preds.append(pred)
        refs.append(item['output'].strip())

    # Load metrics if not already loaded (e.g. if cell is run independently)
    rouge_metric = evaluate.load("rouge")
    bertscore_metric = evaluate.load("bertscore")

    results_rouge = rouge_metric.compute(predictions=preds, references=refs)
    results_bertscore = bertscore_metric.compute(predictions=preds, references=refs, lang="en")

    print(f"Evaluation - ROUGE-L: {results_rouge['rougeL']}")
    avg_bertscore_f1 = sum(results_bertscore['f1']) / len(results_bertscore['f1']) if results_bertscore['f1'] else 0
    print(f"Evaluation - BERTScore F1 (avg): {avg_bertscore_f1}")

# Ensure fewshot_examples is defined (e.g., from cell 10's logic or re-run here)
if 'train_data' in globals() and train_data:
    if 'fewshot_examples' not in globals(): # If not defined by cell 10
        fewshot_examples = random.sample(train_data, 2)
    eval_on_val(peft_model, tokenizer, val_data, fewshot_examples)
else:
    print("Skipping eval_on_val as train_data for fewshot_examples is not available.")

Starting evaluation on validation set...
  Evaluating item 1 - Input: Original Prompt: I want to make software which implements machine learning in order for determining ...
Building few-shot prompt for user_prompt: Original Prompt: I want to make software which implements machine learning in order for determining ...
Constructed few-shot prompt (first 200 chars): Q: Original Prompt: ~The following is a conversation with Bing, not ChatGPT.~give me structure firebase firestore collection of chats inside that have chats(array field: connections(senderEmail,receiv...
    Generated prediction for item 1: - name (string) -- The user who sent this text.
    – date    (datetime)
    + time     (timestamp)
...
  Evaluating item 2 - Input: Task: Address common questions on fairy tales
Bad Prompt: Tell me about fairy tales.
Techniques: ['R...
Building few-shot prompt for user_prompt: Task: Address common questions on fairy tales
Bad Prompt: Tell me about fairy tales.
Techniques: ['R...
Construct

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation - ROUGE-L: 0.04448851249715114
Evaluation - BERTScore F1 (avg): 0.7848130226135254


In [17]:
def llm_analyze_flaws(prompt, model, tokenizer, max_tokens=128):
    """Analyzes a given prompt for flaws using the LLM.
    :param prompt: The user prompt to analyze. :type prompt: str
    :param model: The language model. :type model: PeftModel | AutoModelForCausalLM
    :param tokenizer: The tokenizer. :type tokenizer: AutoTokenizer
    :param max_tokens: Max new tokens for analysis. :type max_tokens: int
    :return: Analysis of flaws. :rtype: str
    """
    print(f"llm_analyze_flaws - Input prompt (first 100 chars): {prompt[:100]}...")
    query = (
        f"Analyze the following user prompt for weaknesses or areas for improvement. "
        f"Be specific (e.g., 'vague', 'missing role', 'no output format', 'ambiguous', etc.).\n"
        f"Prompt:\n{prompt}\nList the flaws as bullet points."
    )
    inputs = tokenizer(query, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"],
        max_new_tokens=max_tokens, eos_token_id=tokenizer.eos_token_id
    )
    input_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][input_length:]
    flaws_analysis = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    print(f"llm_analyze_flaws - LLM Analysis Result: {flaws_analysis}")
    return flaws_analysis

In [18]:
def llm_recommend_techniques(prompt, flaws, model, tokenizer, max_tokens=128):
    """Recommends prompt engineering techniques.
    :param prompt: Original user prompt. :type prompt: str
    :param flaws: Detected flaws. :type flaws: str
    :param model: Language model. :type model: PeftModel | AutoModelForCausalLM
    :param tokenizer: Tokenizer. :type tokenizer: AutoTokenizer
    :param max_tokens: Max new tokens for recommendations. :type max_tokens: int
    :return: Recommended techniques. :rtype: str
    """
    print(f"llm_recommend_techniques - Input prompt (first 100 chars): {prompt[:100]}...")
    print(f"llm_recommend_techniques - Detected flaws: {flaws}")
    query = (
        f"Given this user prompt:\n{prompt}\n"
        f"And these detected flaws:\n{flaws}\n"
        f"List 2-4 specific prompt engineering techniques (e.g., 'CHAIN_OF_THOUGHT', 'SPECIFY_OUTPUT_FORMAT', "
        f"'ROLE_PROMPTING', 'ADD_EXAMPLES', 'ADD_CONSTRAINTS', etc.) that would improve the prompt. "
        f"List only technique names as a bullet list."
    )
    inputs = tokenizer(query, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"],
        max_new_tokens=max_tokens, eos_token_id=tokenizer.eos_token_id
    )
    input_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][input_length:]
    techniques_recommendation = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    print(f"llm_recommend_techniques - LLM Recommended Techniques: {techniques_recommendation}")
    return techniques_recommendation

In [19]:
def llm_synthesize_prompt(prompt, flaws, techniques, model, tokenizer, max_tokens=128):
    """Synthesizes an improved prompt using LLM.
    :param prompt: Original user prompt. :type prompt: str
    :param flaws: Detected flaws. :type flaws: str
    :param techniques: Recommended techniques. :type techniques: str
    :param model: Language model. :type model: PeftModel | AutoModelForCausalLM
    :param tokenizer: Tokenizer. :type tokenizer: AutoTokenizer
    :param max_tokens: Max new tokens for the synthesized prompt. :type max_tokens: int
    :return: Improved prompt. :rtype: str
    """
    print(f"llm_synthesize_prompt - Input prompt (first 100 chars): {prompt[:100]}...")
    print(f"llm_synthesize_prompt - Flaws: {flaws}")
    print(f"llm_synthesize_prompt - Techniques: {techniques}")
    query = (
        f"You are an expert prompt engineer. "
        f"Improve the following user prompt by explicitly addressing the listed flaws and applying these techniques.\n"
        f"User prompt: {prompt}\n"
        f"Detected flaws:\n{flaws}\n"
        f"Techniques to use:\n{techniques}\n"
        f"Write an improved prompt."
    )
    inputs = tokenizer(query, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"],
        max_new_tokens=max_tokens, eos_token_id=tokenizer.eos_token_id
    )
    input_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][input_length:]
    improved_prompt_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    print(f"llm_synthesize_prompt - LLM Synthesized Prompt: {improved_prompt_text}")
    return improved_prompt_text

In [20]:
def llm_prompt_chain(user_prompt, chain_model, chain_tokenizer, verbose=True):
    """Runs a 3-step LLM chain to analyze and improve a prompt.
    :param user_prompt: The user prompt to improve. :type user_prompt: str
    :param chain_model: The model to use for the chain. :type chain_model: PeftModel | AutoModelForCausalLM
    :param chain_tokenizer: The tokenizer for the model. :type chain_tokenizer: AutoTokenizer
    :param verbose: Whether to print intermediate steps. :type verbose: bool
    :return: The improved prompt. :rtype: str
    """
    print(f"Executing LLM Prompt Chain for: {user_prompt}")

    print("Step 1: Analyzing flaws...")
    flaws = llm_analyze_flaws(user_prompt, chain_model, chain_tokenizer)
    if verbose:
        print("\nDetected Flaws:\n", flaws, "\n", "-"*40)

    print("Step 2: Recommending techniques...")
    techniques = llm_recommend_techniques(user_prompt, flaws, chain_model, chain_tokenizer)
    if verbose:
        print("\nRecommended Techniques:\n", techniques, "\n", "-"*40)

    print("Step 3: Synthesizing improved prompt...")
    improved = llm_synthesize_prompt(user_prompt, flaws, techniques, chain_model, chain_tokenizer)
    if verbose:
        print("\nImproved Prompt:\n", improved, "\n", "-"*60)
    return improved

# Example of using the chain with the trained peft_model
# Ensure the model is on the correct device before calling the chain
if 'peft_model' in globals() and 'tokenizer' in globals():
    print("\n--- Testing LLM Prompt Improvement Chain with peft_model ---")
    # Ensure model is on the correct device (e.g., 'cuda' if available, else 'cpu')
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    if next(peft_model.parameters()).device.type != target_device:
         print(f"Moving peft_model to {target_device} for LLM chain test.")
         peft_model.to(target_device)

    test_chain_prompt = "Make me a summary about Berlin nightlife"
    improved_test_prompt = llm_prompt_chain(test_chain_prompt, peft_model, tokenizer, verbose=True)
    print("\n--- End of LLM Chain Test ---")
    print(f"Original Test Prompt: {test_chain_prompt}")
    print(f"Chain's Improved Test Prompt: {improved_test_prompt}")
else:
    print("Skipping LLM Prompt Chain test as peft_model or tokenizer is not available.")


--- Testing LLM Prompt Improvement Chain with peft_model ---
Executing LLM Prompt Chain for: Make me a summary about Berlin nightlife
Step 1: Analyzing flaws...
llm_analyze_flaws - Input prompt (first 100 chars): Make me a summary about Berlin nightlife...
llm_analyze_flaws - LLM Analysis Result: 

Detected Flaws:
  
 ----------------------------------------
Step 2: Recommending techniques...
llm_recommend_techniques - Input prompt (first 100 chars): Make me a summary about Berlin nightlife...
llm_recommend_techniques - Detected flaws: 
llm_recommend_techniques - LLM Recommended Techniques: 

Recommended Techniques:
  
 ----------------------------------------
Step 3: Synthesizing improved prompt...
llm_synthesize_prompt - Input prompt (first 100 chars): Make me a summary about Berlin nightlife...
llm_synthesize_prompt - Flaws: 
llm_synthesize_prompt - Techniques: 
llm_synthesize_prompt - LLM Synthesized Prompt: 

Improved Prompt:
  
 --------------------------------------------------

## Interactive Prompt Improvement

In [23]:
# --- Interactive Prompt Improvement Cell ---
# import torch # Already imported in global config cell

print("Ensure 'peft_model' (from training) or 'loaded_peft_model' (if loaded) and 'tokenizer' are available.")

# Determine which model to use (prefer loaded, fallback to trained)
interactive_model = None
if 'loaded_peft_model' in globals():
    interactive_model = loaded_peft_model
    print("Using 'loaded_peft_model' for interactive session.")
elif 'peft_model' in globals():
    interactive_model = peft_model
    print("Using 'peft_model' from training for interactive session.")
else:
    print("Error: No suitable model (peft_model or loaded_peft_model) found for interactive session.")

if interactive_model and 'tokenizer' in globals():
    # Ensure model is on the correct device
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    if next(interactive_model.parameters()).device.type != target_device:
        print(f"Moving interactive_model to {target_device}.")
        interactive_model.to(target_device)

    user_input_prompt = input("Enter your prompt to improve: ")

    if user_input_prompt:
        print("\n--- Running Prompt Improvement Chain ---")
        # llm_prompt_chain and its helpers should be defined in preceding cells
        improved_prompt_interactive = llm_prompt_chain(user_input_prompt, interactive_model, tokenizer, verbose=True)
        print("\n--- End of Chain ---")
        print(f"\nOriginal User Prompt: {user_input_prompt}")
        print(f"Chain's Improved Prompt: {improved_prompt_interactive}")
    else:
        print("No prompt entered. Skipping interactive improvement.")
else:
    print("Interactive session cannot start. Model or tokenizer not available.")

Ensure 'peft_model' (from training) or 'loaded_peft_model' (if loaded) and 'tokenizer' are available.
Using 'loaded_peft_model' for interactive session.
Enter your prompt to improve: ive been using bad anatomy, signature, watermark, username, error, missing limbs, error and its been going decently well, but im still seeing weird shit like legs turning into surfboards from time to time.  how bout you

--- Running Prompt Improvement Chain ---
Executing LLM Prompt Chain for: ive been using bad anatomy, signature, watermark, username, error, missing limbs, error and its been going decently well, but im still seeing weird shit like legs turning into surfboards from time to time.  how bout you
Step 1: Analyzing flaws...
llm_analyze_flaws - Input prompt (first 100 chars): ive been using bad anatomy, signature, watermark, username, error, missing limbs, error and its been...
llm_analyze_flaws - LLM Analysis Result: Question: 
What is the purpose of this question?

Detected Flaws:
 Question: 
W