In [13]:
prompt = """ Prompt:  Instruction-Following
You are an advanced text processing agent. Follow each of the 15 instructions below exactly and in order. Apply them to the provided paragraph. Be precise, strict with conditions, and make no omissions or assumptions beyond the rules.

🔧 Instructions:
Replace all acronyms with their full forms (e.g., AI → Artificial Intelligence).


Remove any sentence that contains more than two numerical digits.


Capitalize the first word of every sentence.


Do not alter the third sentence in the original paragraph.


Change passive voice to active voice in even-numbered sentences only.


Remove any sentence that starts with "However" or "Although."


If a sentence includes a city name, replace it with "[REDACTED]".


In any sentence containing the word “data,” add the word “sensitive” before “data.”


Highlight all proper nouns by surrounding them with ** (e.g., Google).


If a sentence is longer than 20 words, split it into two sentences.


Merge any two consecutive sentences that are both under 8 words.


Leave the final sentence unchanged, even if other rules apply.


Keep only the first and last paragraph; delete all others.


Add a summary line at the beginning: “ Edited for clarity and compliance.”


Return the final output as plain text with line breaks between paragraphs.


In 2023, researchers at MIT developed an AI-driven tool to streamline logistics in urban environments like New York and Tokyo. The tool uses IoT sensors to collect data from thousands of delivery trucks, ensuring real-time visibility into routes.
Although the technology was in development for over two years, it gained commercial traction only after successful trials in Berlin. It was said that the system was designed to be scalable and efficient. However, analysts at Stanford warned of data privacy concerns.
The software was later integrated with ERP systems by a team based in Lahore. This allowed seamless automation across multiple distribution hubs. One key feature involved predictive maintenance, powered by machine learning and natural language processing models.
These models were trained on over 250,000 historical delivery records, offering unparalleled insights. Meanwhile, the company’s R&D division in Toronto continued improving model accuracy. Despite setbacks, the project demonstrated remarkable efficiency gains. The final release was announced at the Smart Cities Expo in Dubai.




"""

In [45]:
from typing import List
import re

Make the data set
laod the phi model
define the functions
one for judjde the outputs
one in which this function will be called


In [100]:
import math
from collections import defaultdict
from typing import List, Dict

import openai


from openai import OpenAI

In [None]:
# keys
openai.api_key = "Yor api key"  # ⚠️ Replace with your actual key

In [102]:
client = OpenAI(api_key=openai.api_key)

In [98]:
def judge_instructions_compliance(instructions: List[str], original_paragraph: str, completion: str) -> int:
    """
    Calls OpenAI GPT model to judge how many of the instructions were followed exactly.

    Args:
        instructions: List of N instructions (ideally 5 per call)
        original_paragraph: The raw input paragraph
        completion: The model's edited output

    Returns:
        Integer count of how many instructions (0-N) were followed exactly
    """
    system_prompt = (
        "You are an expert evaluator of instruction-following in text editing tasks. "
        "You will be given a set of editing instructions, the original paragraph, and a modified paragraph. "
        "You must return ONLY the number of instructions that were followed exactly. "
        "Do not explain or justify. Just return a single number (0 to N)."
    )

    user_prompt = f"""
Instructions:
{chr(10).join(f"{i+1}. {instr}" for i, instr in enumerate(instructions))}

Original Paragraph:
{original_paragraph}

Edited Output:
{completion}

How many of the above {len(instructions)} instructions were followed exactly?
Return just the number, nothing else.
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4",  # or "gpt-3.5-turbo", or "o3-mini" if you prefer
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0
        )

        reply = response.choices[0].message.content.strip()
        match = re.search(r'\d+', reply)
        if match:
            value = int(match.group(0))
            return max(0, min(value, len(instructions)))
        else:
            print(f"Unexpected format in OpenAI response: {reply}")
            return 0

    except Exception as e:
        print(f"Error in OpenAI call: {e}")
        return 0

In [43]:
# helper function to extract the parapgah and  instructions
def extract_instructions_and_paragraph(prompt: str) -> tuple[list[str], str]:
    start_marker = "Instructions:"
    end_marker = "Return the final output as plain text with line breaks between paragraphs."

    if start_marker in prompt and end_marker in prompt:
        # Find start and end positions
        start_index = prompt.index(start_marker) + len(start_marker)
        end_index = prompt.index(end_marker) + len(end_marker)

        # Extract instruction block (excluding the "Instructions:" line itself)
        instructions_block = prompt[start_index:prompt.index(end_marker)].strip()

        # Split instructions into list by lines and remove empty ones
        instructions_list = [line.strip() for line in instructions_block.splitlines() if line.strip()]

        # Extract paragraph after end marker
        paragraph = prompt[end_index:].strip()

        if not instructions_list:
            raise ValueError("No instructions found between markers.")
        if not paragraph:
            raise ValueError("No paragraph found after the instructions block.")

        return instructions_list, paragraph

    raise ValueError("Start or end marker not found in prompt.")

In [44]:
# Helper function to split the instructions into different
def chunk_instructions(instructions: List[str], n_parts: int = 2) -> List[List[str]]:
    total = len(instructions)
    base_size = total // n_parts
    remainder = total % n_parts

    chunks = []
    start = 0

    for i in range(n_parts):
        # First 'remainder' chunks get one extra item
        chunk_size = base_size + (1 if i < remainder else 0)
        end = start + chunk_size
        chunks.append(instructions[start:end])
        start = end

    return chunks


In [96]:
from typing import List, Dict
from collections import defaultdict

def score(prompts: List[str], completions: List[str], models: List[str]) -> Dict[str, float]:
    """
    Scores prompt-completion pairs for compliance with instruction sets.

    Args:
        prompts: List of full prompts (including instructions + paragraph)
        completions: List of completions corresponding to the prompts
        models: List of model identifiers for each completion

    Returns:
        Dictionary of model_id → average compliance score (0.0 to 1.0)
    """
    assert len(prompts) == len(completions) == len(models), "Input lists must be equal length"

    model_scores = defaultdict(list)

    # === Extract instructions and paragraph from the first prompt ===
    full_instructions, Shared_paragraph = extract_instructions_and_paragraph(prompts[0])
    instruction_chunks = chunk_instructions(full_instructions, n_parts=2)  # Or adjust n_parts as needed

    for prompt, completion, model_id in zip(prompts, completions, models):

        total_followed = 0
        for chunk in instruction_chunks:
            followed = judge_instructions_compliance(chunk, Shared_paragraph, completion)
            total_followed += followed

        compliance_score = total_followed / len(full_instructions)
        model_scores[model_id].append(compliance_score)

    # === Average the compliance scores per model ===
    avg_scores = {
        model_id: sum(scores) / len(scores)
        for model_id, scores in model_scores.items()
    }

    return avg_scores


In [36]:
def chunk_instructions(instructions: List[str], n_parts: int = 2) -> List[List[str]]:
    total = len(instructions)
    base_size = total // n_parts
    remainder = total % n_parts

    chunks = []
    start = 0

    for i in range(n_parts):
        # First 'remainder' chunks get one extra item
        chunk_size = base_size + (1 if i < remainder else 0)
        end = start + chunk_size
        chunks.append(instructions[start:end])
        start = end

    return chunks


Making the dataset

In [92]:
import re
from typing import List
def read_prompt_as_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:

     return f.read()



In [64]:


def load_completions(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    completions = re.findall(r"Completion \d+:\s*(.*?)\s*(?=Completion \d+:|$)", text, re.DOTALL)
    return [c.strip() for c in completions]

def load_models(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        models = [line.strip() for line in f.readlines() if line.strip()]
    return models


In [93]:
prompt = read_prompt_as_text("/content/Prompts.txt")
completions = load_completions("/content/Completions.txt")
model_ids = load_models("/content/Models.txt")

In [94]:
prompts = [prompt] * 3

In [95]:
prompts[0]

'Prompt:  Instruction-Following \nYou are an advanced text processing agent. Follow each of the 15 instructions below exactly and in order. Apply them to the provided paragraph. Be precise, strict with conditions, and make no omissions or assumptions beyond the rules.\n\n🔧 Instructions:\nReplace all acronyms with their full forms (e.g., AI → Artificial Intelligence).\n\n\nRemove any sentence that contains more than two numerical digits.\n\n\nCapitalize the first word of every sentence.\n\n\nDo not alter the third sentence in the original paragraph.\n\n\nChange passive voice to active voice in even-numbered sentences only.\n\n\nRemove any sentence that starts with "However" or "Although."\n\n\nIf a sentence includes a city name, replace it with "[REDACTED]".\n\n\nIn any sentence containing the word “data,” add the word “sensitive” before “data.”\n\n\nHighlight all proper nouns by surrounding them with ** (e.g., Google).\n\n\nIf a sentence is longer than 20 words, split it into two sente

In [79]:
print(prompt[0])

P


In [None]:
prompts = [""]

Evalauet t on data sets

In [103]:
results = score(prompts, completions, model_ids)

In [104]:
print(results)

{'Gemini': 0.5, 'Mistral': 0.5, 'OpenAi': 0.6428571428571429}
