In [1]:
# Imports and one-time setup for the notebook.
# NOTE(review): `re` and `time` are not referenced by any code visible in this notebook — confirm before removing.
import pandas as pd
import os, sys
from ast import literal_eval
import re
import time
# Put the directory two levels up on the import path — presumably where the `utils` package lives (confirm).
# NOTE(review): `os.sys` is simply the `sys` module reached through `os`; `sys.path.append(...)` is the conventional spelling.
os.sys.path.append(os.path.abspath("../../."))

# Project-local LLM wrappers; `llm_client` is the object `generate_cot` sends prompts to.
from utils.llms import Gemini, OpenAI
llm_client = OpenAI(model_name="gpt-4o-2024-08-06")

from tqdm.auto import tqdm
# Enables `progress_apply` on pandas objects (used for the CoT generation run below).
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the SFT dataset. The dict-like columns (task_description, ground_truth, unittest)
# are stored as strings and are parsed with literal_eval by the prompt builders below.
df_seccodeplt_sft = pd.read_csv("seccodeplt_sft_data.csv")
df_seccodeplt_sft

Unnamed: 0,id,CWE_ID,task_description,ground_truth,unittest,install_requires,rule,use_rule
0,05397120,732,"{'arguments': '- log_filename : str, Specifies...","{'code_after': '', 'code_before': '\ndef log_c...","{'setup': 'import os\nimport stat', 'testcases...",[],Ensure that the log file storing sensitive dat...,True
1,69b91330,95,"{'arguments': '- input_code: str, a string rep...",{'code_after': '\n except Exception:\n ...,"{'setup': '', 'testcases': 'testcases = {\n ...",[],,False
2,5fb937a7,22,"{'arguments': '- request_file: str, the file r...","{'code_after': '', 'code_before': '\ndef fetch...",{'setup': 'from pathlib import Path\n\ndownloa...,[],ensure that users cannot access files outside ...,True
3,7c6aab34,77,"{'arguments': '- template: str, an HTML templa...",{'code_after': '\n\n except Exception as e:...,"{'setup': 'import re\nimport html', 'testcases...",[],,False
4,d0e5f326,200,"{'arguments': '- employee_id: str, represents ...","{'code_after': '', 'code_before': '\ndef build...",{'setup': 'class Employee:\n def __init__(s...,[],,False
...,...,...,...,...,...,...,...,...
620,c50d8266,327,"{'arguments': '- algorithm_name: str, the name...","{'code_after': '', 'code_before': '\ndef creat...","{'setup': ""import hashlib\nimport hmac\nprimar...",[],,False
621,e66857d5,22,"{'arguments': '- config_code: str, representin...",{'code_after': '\n else:\n raise Key...,{'setup': 'from pathlib import Path\n\nconfig_...,[],ensure the file path remains inside the permit...,True
622,0ca3f2ee,732,"{'arguments': '- credentials_file : str, The p...","{'code_after': '', 'code_before': '\ndef save_...","{'setup': 'import os\nimport stat', 'testcases...",[],Set the credentials file's access permissions ...,True
623,806086ba,22,"{'arguments': ""- account_name: str, a string r...",{'code_after': '\n else:\n raise Key...,{'setup': 'from pathlib import Path\nusers = {...,[],verify that the file path remains constrained ...,True


In [3]:
import random
from typing import Tuple, Optional

def get_instruction_variation(include_instruction: bool = True, variation: Optional[int] = None) -> str:
    """
    Return one phrasing of the output-format instruction appended to prompts.

    Ten phrasings are available (indices 0-9). They range from a fully
    spelled-out <think>/<code> template to a hint that mentions no tags at all.

    Args:
        include_instruction: When falsy, no instruction is produced.
        variation: Index of the phrasing to return; None selects one at random.

    Returns:
        The selected instruction text, or "" when instructions are disabled.
    """
    if not include_instruction:
        return ""

    # 0: original detailed format, including an example layout
    detailed = (
        "Important: Write your reasoning steps within <think> and </think> tags. "
        "And wrap your final code implementation within <code> and </code> tags.\n"
        "Example format:\n"
        "<think>"
        "Your reasoning steps here..."
        "</think>\n"
        "<code>\n"
        "Your final code implementation here...\n"
        "</code>"
    )
    # 1: concise
    concise = (
        "First, explain your reasoning within <think></think> tags, "
        "then provide your code within <code></code> tags."
    )
    # 2: numbered steps
    numbered = (
        "Please structure your response as follows:\n"
        "1. Reasoning: Wrap in <think></think> tags\n"
        "2. Implementation: Wrap in <code></code> tags"
    )
    # 3: natural language
    natural = (
        "Think through the problem step-by-step and explain your reasoning. "
        "Use <think> tags for your thought process and <code> tags for the final implementation."
    )
    # 4: security-focused
    security = (
        "Analyze the security implications carefully. "
        "Document your reasoning in <think></think> tags, "
        "then provide the secure implementation in <code></code> tags."
    )
    # 5: brief reminder
    reminder = "Remember to use <think> for reasoning and <code> for implementation."
    # 6: imperative bullet list
    imperative = (
        "Break down your approach:\n"
        "- Put reasoning in <think></think>\n"
        "- Put code in <code></code>"
    )
    # 7: conversational
    chatty = (
        "Let's solve this step by step. Share your thought process using <think> tags, "
        "and then show me the code using <code> tags."
    )
    # 8: minimal
    minimal = "<think>reasoning</think> then <code>implementation</code>"
    # 9: no explicit tags mentioned (tests if model learned the pattern)
    tagless = "Explain your approach and then provide the implementation."

    phrasings = [
        detailed, concise, numbered, natural, security,
        reminder, imperative, chatty, minimal, tagless,
    ]

    if variation is None:
        return random.choice(phrasings)
    return phrasings[variation]


def generate_structured_prompt(row: dict, instructions: bool = False,
                               security_reminder: bool = False,
                               instruction_variation: Optional[int] = None) -> str:
    """
    Generate structured markdown format prompt.

    Args:
        row: Mapping-like record (dict or DataFrame row) whose
            'task_description' entry is a dict or the string repr of one.
        instructions: Prepend the generic "think step-by-step" instruction block.
        security_reminder: Add the task's security policy as a reminder section.
        instruction_variation: Index forwarded to get_instruction_variation
            (None lets it choose at random).

    Returns:
        One markdown section per task field followed by the output-format
        instruction. Empty or missing fields are rendered as the text "None".

    Raises:
        ValueError, SyntaxError: if 'task_description' is a string that is not
            a valid Python literal.
    """
    general_instructions = (
        "Think about the problem below carefully and step-by-step. "
        "Then implement the code that meets the requirements described."
    )

    # literal_eval only accepts strings/AST nodes, so a `{}` default (or an
    # already-parsed dict) must not be passed through it.
    raw_task = row.get('task_description')
    if isinstance(raw_task, str):
        raw_task = literal_eval(raw_task)
    # Anything that is not a dict (missing key, None, NaN) counts as "no description".
    task_desc = raw_task if isinstance(raw_task, dict) else {}

    function_name = task_desc.get('function_name', '') or "None"
    description = task_desc.get('description', '') or "None"
    arguments = task_desc.get('arguments', '') or "None"
    context = task_desc.get('context', '') or "None"
    return_info = task_desc.get('return', '') or "None"
    raise_info = task_desc.get('raise', '') or "None"
    security_policy = task_desc.get('security_policy', '') or "None"

    prompt_parts = []

    if instructions:
        prompt_parts.append(f"### Instructions:\n{general_instructions}\n")

    prompt_parts.append(f"**Description:**\n{description}\n")
    prompt_parts.append(f"**Context:**\n{context}\n")
    prompt_parts.append(f"**Function Name:** `{function_name}`\n")
    prompt_parts.append(f"**Arguments:**\n{arguments}\n")
    prompt_parts.append(f"**Returns:**\n{return_info}\n")
    prompt_parts.append(f"**Raises:**\n{raise_info}\n")

    if security_reminder:
        prompt_parts.append(f"**Security Policy Reminder:**\n{security_policy}\n")

    # Output-format instruction (always requested; the guard covers an empty result).
    final_instr = get_instruction_variation(
        include_instruction=True,
        variation=instruction_variation
    )
    if final_instr:
        prompt_parts.append(final_instr + "\n")

    # Every part already ends in "\n", so joining with "\n" leaves a blank line between sections.
    return "\n".join(prompt_parts)


def generate_paragraph_prompt(row: dict, instruction_variation: Optional[int] = None) -> str:
    """
    Generate natural paragraph format prompt.

    Args:
        row: Mapping-like record (dict or DataFrame row) whose
            'task_description' entry is a dict or the string repr of one.
        instruction_variation: Index forwarded to get_instruction_variation
            (None lets it choose at random).

    Returns:
        A single paragraph ("<description> <context> Implement <name> that takes
        ... and returns ... . Security requirement: ..."), followed by a blank
        line and the output-format instruction. Empty fields, and fields whose
        value is the text "None", are left out.

    Raises:
        ValueError, SyntaxError: if 'task_description' is a string that is not
            a valid Python literal.
    """
    # literal_eval only accepts strings/AST nodes, so a `{}` default (or an
    # already-parsed dict) must not be passed through it.
    raw_task = row.get('task_description')
    if isinstance(raw_task, str):
        raw_task = literal_eval(raw_task)
    # Anything that is not a dict (missing key, None, NaN) counts as "no description".
    task_desc = raw_task if isinstance(raw_task, dict) else {}

    function_name = task_desc.get('function_name', '') or "a function"
    description = task_desc.get('description', '') or ""
    arguments = task_desc.get('arguments', '') or ""
    context = task_desc.get('context', '') or ""
    return_info = task_desc.get('return', '') or ""
    security_policy = task_desc.get('security_policy', '') or ""

    # Build natural paragraph
    prompt = f"{description} "

    if context and context != "None":
        prompt += f"{context} "

    prompt += f"Implement {function_name}"

    if arguments and arguments != "None":
        prompt += f" that takes {arguments}"

    if return_info and return_info != "None":
        prompt += f" and returns {return_info}"

    prompt += "."

    if security_policy and security_policy != "None":
        prompt += f" Security requirement: {security_policy}"

    # Output-format instruction (always requested; the guard covers an empty result).
    final_instr = get_instruction_variation(
        include_instruction=True,
        variation=instruction_variation
    )
    if final_instr:
        prompt += f"\n\n{final_instr}"

    return prompt


def generate_conversational_prompt(row: dict, instruction_variation: Optional[int] = None) -> str:
    """
    Generate conversational format prompt.

    Args:
        row: Mapping-like record (dict or DataFrame row) whose
            'task_description' entry is a dict or the string repr of one.
        instruction_variation: Index forwarded to get_instruction_variation
            (None lets it choose at random).

    Returns:
        A randomly chosen conversational opener mentioning the function name,
        then the description and (if present) the context, followed by a blank
        line and the output-format instruction.

    Raises:
        ValueError, SyntaxError: if 'task_description' is a string that is not
            a valid Python literal.
    """
    # literal_eval only accepts strings/AST nodes, so a `{}` default (or an
    # already-parsed dict) must not be passed through it.
    raw_task = row.get('task_description')
    if isinstance(raw_task, str):
        raw_task = literal_eval(raw_task)
    # Anything that is not a dict (missing key, None, NaN) counts as "no description".
    task_desc = raw_task if isinstance(raw_task, dict) else {}

    description = task_desc.get('description', '') or ""
    function_name = task_desc.get('function_name', '') or "a function"
    context = task_desc.get('context', '') or ""

    conversational_starts = [
        f"I need help implementing {function_name}. ",
        f"Can you help me write {function_name}? ",
        f"I'm working on {function_name}. ",
        f"Could you implement {function_name} for me? ",
    ]

    prompt = random.choice(conversational_starts)
    prompt += description

    if context and context != "None":
        prompt += f" {context}"

    # Output-format instruction (always requested; the guard covers an empty result).
    final_instr = get_instruction_variation(
        include_instruction=True,
        variation=instruction_variation
    )
    if final_instr:
        prompt += f"\n\n{final_instr}"

    return prompt

In [4]:
def generate_security_prompt_hf(row: dict,
                                instructions: bool = False,
                                security_reminder: bool = False,
                                format_type: Optional[str] = None,
                                instruction_variation: Optional[int] = None) -> Tuple[str, str, str]:
    """
    Generate user prompt (X), positive example (y_positive), and negative example (y_negative).

    Args:
        row: A single data point from the dataset. Its 'unittest' and
            'ground_truth' entries may be dicts or the string repr of a dict.
        instructions: Whether to include general instructions (structured format only).
        security_reminder: Whether to include security policy reminder (structured format only).
        format_type: 'structured', 'paragraph' or 'conversational'; None picks one
            of the three at random. Any other value (e.g. 'minimal', which has no
            dedicated builder) falls back to the structured format.
        instruction_variation: Specific instruction variation to use (None=random)

    Returns:
        tuple: (X, y_positive, y_negative). Both y values are the unittest setup,
        code_before, the patched / vulnerable body and code_after, newline-joined
        and wrapped in <code>...</code>.

    Raises:
        ValueError, SyntaxError: if a string-valued column is not a valid Python literal.
    """

    def _as_dict(value):
        # literal_eval only accepts strings/AST nodes; a missing column, None or
        # an already-parsed dict must not be passed through it.
        if isinstance(value, str):
            value = literal_eval(value)
        return value if isinstance(value, dict) else {}

    # Select format type
    if format_type is None:
        format_type = random.choice(['structured', 'paragraph', 'conversational'])

    # Generate prompt based on format type
    if format_type == 'paragraph':
        X = generate_paragraph_prompt(row, instruction_variation)
    elif format_type == 'conversational':
        X = generate_conversational_prompt(row, instruction_variation)
    else:
        # 'structured' and every unrecognised value
        X = generate_structured_prompt(row, instructions, security_reminder, instruction_variation)

    # Extract unittest and ground truth components
    unittest = _as_dict(row.get('unittest'))
    unittest_setup = unittest.get('setup', None) or ""

    ground_truth = _as_dict(row.get('ground_truth'))
    code_before = ground_truth.get('code_before', None) or ""
    code_after = ground_truth.get('code_after', None) or ""
    # `or ""` guards against explicit None values, which would break the concatenation.
    patched_code = ground_truth.get('patched_code', '') or ""
    vulnerable_code = ground_truth.get('vulnerable_code', '') or ""

    def _wrap(body: str) -> str:
        # Same layout for the positive and the negative example.
        return "<code>" + "\n" + unittest_setup + "\n" + code_before + "\n" + body + "\n" + code_after + "</code>"

    y_positive = _wrap(patched_code)
    y_negative = _wrap(vulnerable_code)

    return X, y_positive, y_negative

In [5]:
# def generate_security_prompt_hf(row: dict, instructions: bool = False, security_reminder: bool = False) -> tuple:
#     """
#     Generate user prompt (X), positive example (y_positive), and negative example (y_negative).

#     Args:
#         row: A single data point from the dataset

#     Returns:
#         tuple: (X, y_positive, y_negative) where:
#             - X: User prompt for the LLM
#             - cot: Chain-of-thought reasoning steps
#             - y_positive: Secure/patched code (ground truth)
#             - y_negative: Vulnerable code (what to avoid)
#     """

#     general_instructions = (
#         "Think about the problem below carefully and step-by-step. "
#         "Then implement the code that meets the requirements described."
#     )

#     final_instructions = (
#         "Important: Write your reasoning steps within <think> and </think> tags. "
#         "And wrap your final code implementation within <code> and </code> tags.\n"
#         "Example format:\n"
#         "<think>"
#         "Your reasoning steps here..."
#         "</think>\n"
#         "<code>\n"
#         "Your final code implementation here...\n"
#         "</code>"
#     )

#     # Extract task description components
#     task_desc = literal_eval(row.get('task_description', {}))
#     function_name = task_desc.get('function_name') if task_desc.get('function_name') != "" else "None"
#     description = task_desc.get('description') if task_desc.get('description') != "" else "None"
#     arguments = task_desc.get('arguments') if task_desc.get('arguments') != "" else "None"
#     context = task_desc.get('context') if task_desc.get('context') != "" else "None"
#     return_info = task_desc.get('return') if task_desc.get('return') != "" else "None"
#     raise_info = task_desc.get('raise') if task_desc.get('raise') != "" else "None"
#     security_policy = task_desc.get('security_policy') if task_desc.get('security_policy') != "" else "None"

#     # Extract unittest components
#     unittest = literal_eval(row.get('unittest', {}))
#     unittest_setup = unittest.get('setup', None)

#     # Extract ground truth components
#     ground_truth = literal_eval(row.get('ground_truth', {}))
#     code_before = ground_truth.get('code_before', None)
#     code_after = ground_truth.get('code_after', None)

#     # Extract chain-of-thought reasoning steps
#     cot_steps = row.get('cot_steps', None)
#     cot_steps = "<think>" + cot_steps + "</think>\n" if cot_steps else ""

#     # Build the user prompt
#     prompt_parts = []

#     if instructions:
#         prompt_parts.append(f"### Instructions:\n{general_instructions}\n")
#     prompt_parts.append(f"**Description:**\n{description}\n")
#     prompt_parts.append(f"**Context:**\n{context}\n")
#     prompt_parts.append(f"**Function Name:** `{function_name}`\n")
#     prompt_parts.append(f"**Arguments:**\n{arguments}\n")
#     prompt_parts.append(f"**Returns:**\n{return_info}\n")
#     prompt_parts.append(f"**Raises:**\n{raise_info}\n")
#     if security_reminder:
#         prompt_parts.append(f"**Security Policy Reminder:**\n{security_policy}\n") # Default is False
#     prompt_parts.append(final_instructions + "\n")

#     # Combine all parts
#     X = "\n".join(prompt_parts)

#     # add code before and after for both positive and negative examples
#     y_positive = "<code>" + "\n" + unittest_setup + "\n" + code_before + "\n" + ground_truth.get('patched_code', '') + "\n" + code_after + "</code>"
#     y_negative = "<code>" + "\n" + unittest_setup + "\n" + code_before + "\n" + ground_truth.get('vulnerable_code', '') + "\n" + code_after + "</code>"

#     return X, y_positive, y_negative

In [6]:
# X, y_positive, y_negative = generate_security_prompt_hf(df_seccodeplt_sft.iloc[0])
# print(X)

In [7]:
def generate_cot_prompt(row: dict, 
                       format_type: Optional[str] = None,
                       instruction_variation: Optional[int] = None) -> str:
    """
    Build the prompt that asks the larger model for chain-of-thought reasoning.

    The prompt is the user-facing task prompt, then the secure reference
    implementation, then guidance asking for concise step-by-step reasoning
    that contains no implementation details.

    Args:
        row: Data row
        format_type: Prompt format forwarded to generate_security_prompt_hf
        instruction_variation: Instruction variation forwarded likewise

    Returns:
        CoT generation prompt
    """
    # Only the task prompt and the secure example are used; the vulnerable one is discarded.
    task_prompt, secure_code, _vulnerable = generate_security_prompt_hf(
        row,
        instructions=False,
        format_type=format_type,
        instruction_variation=instruction_variation,
    )

    sections = [
        f"{task_prompt}",
        "",
        "Here is the safe code implementation:",
        f"{secure_code}",
        "",
        "Let's reason through this security problem step by step. Explain your thought process to solve the above problem securely.",
        "Do NOT provide any details of the actual code implementation in your reasoning.",
        "Only include the reasoning, no other text.",
        "Important: Be concise and to the point in your reasoning. Think step by step.",
        "",  # the prompt ends with a trailing newline
    ]
    return "\n".join(sections)

# Log of the format / instruction variation used for every generate_cot call
# (read by display_cot_stats).
_cot_stats = {'format': [], 'instruction': []}

def generate_cot(row: dict, 
                format_type: Optional[str] = None,
                instruction_variation: Optional[int] = None) -> str:
    """
    Generate CoT response from larger model.

    Unspecified options are resolved to random values here, so the exact
    choice can be recorded in `_cot_stats` before the prompt is built.

    Args:
        row: Data row
        format_type: Prompt format; None draws one of the three formats at random
        instruction_variation: Instruction index; None draws one from 0-9 at random

    Returns:
        The text part of the LLM client's response
    """
    # Resolve the format first, then the variation (order of the random draws matters).
    if format_type is None:
        chosen_format = random.choice(['structured', 'paragraph', 'conversational'])
    else:
        chosen_format = format_type

    if instruction_variation is None:
        chosen_variation = random.randint(0, 9)
    else:
        chosen_variation = instruction_variation

    # Track usage
    for stat_key, stat_value in (('format', chosen_format), ('instruction', chosen_variation)):
        _cot_stats[stat_key].append(stat_value)

    prompt_text = generate_cot_prompt(row, chosen_format, chosen_variation)
    # send_message returns a (response, text) pair; only the text is needed.
    _, reply_text = llm_client.send_message(prompt_text)

    return reply_text

# def generate_cot(row: dict, 
#                 format_type: Optional[str] = None,
#                 instruction_variation: Optional[int] = None) -> str:
#     """
#     Generate CoT response from larger model.
    
#     Args:
#         row: Data row
#         format_type: Format type for prompt
#         instruction_variation: Instruction variation to use
    
#     Returns:
#         CoT reasoning text
#     """
    
#     cot_prompt = generate_cot_prompt(row, format_type, instruction_variation)
#     llm_response, llm_response_text = llm_client.send_message(cot_prompt)
    
#     return llm_response_text


In [8]:
# def generate_cot_prompt(row: dict) -> str:
#     """Generate reasoning through guided questions."""
    
#     X, y_positive, _ = generate_security_prompt_hf(row, instructions=False)
    
#     cot_prompt = f"""{X}

# Here is the safe code implementation:
# {y_positive}

# Let's reason through this security problem step by step. Explain your thought process to solve the above problem securely.
# Do NOT provide any details of the actual code implementation in your reasoning.
# Only include the reasoning, no other text.
# Important: Be concise and to the point in your reasoning. Think step by step.
# """
    
#     return cot_prompt

# def generate_cot(row: dict) -> str:
#     """Generate CoT response from Gemini-2.5-Pro."""
    
#     cot_prompt = generate_cot_prompt(row)
#     llm_response, llm_response_text = llm_client.send_message(cot_prompt)
    
#     return llm_response_text

In [9]:
# for i in range(5):
#     llm_response_text = generate_cot(df_seccodeplt_sft.iloc[i])
#     print(llm_response_text)

In [10]:
# Call generate_cot once per row (one LLM request each; format and instruction variation are drawn at random).
# NOTE(review): despite its name, `cot_prompts` holds the model's reasoning text returned by generate_cot, not prompts.
cot_prompts = df_seccodeplt_sft.progress_apply(generate_cot, axis=1)

100%|██████████| 625/625 [30:52<00:00,  2.96s/it] 


In [11]:
# Preview the generated reasoning text.
cot_prompts

0      \n1. **Directory Check**: Ensure the directory...
1      \n1. **Input Validation**: The first step is t...
2      \n1. **Define the Download Directory**: Establ...
3      \n1. Identify the need for dynamic HTML genera...
4      \n1. Identify the data classes: We have `Emplo...
                             ...                        
620    1. Identify the purpose: The function `create_...
621    Step 1: Identify the directories where configu...
622    1. **Directory Creation:** Ensure the director...
623    Step 1: Identify the user's directory based on...
624    To solve the problem of generating a secure pa...
Length: 625, dtype: object

In [12]:
# Store the generated reasoning as a new column.
# NOTE(review): the format / instruction variation used per row is only appended to `_cot_stats`; it is not saved on the frame.
df_seccodeplt_sft["cot_steps"] = cot_prompts

In [13]:
def display_cot_stats():
    """Print how often each prompt format and instruction variation was recorded in `_cot_stats`."""
    from collections import Counter

    # Tally everything up front, before any output is produced.
    per_format = Counter(_cot_stats['format'])
    per_variation = Counter(_cot_stats['instruction'])
    n_calls = len(_cot_stats['format'])
    rule = "=" * 60

    def _print_section(heading, tally, render_key):
        # One line per distinct key, in sorted key order, with count and share of all calls.
        print(heading)
        for key, hits in sorted(tally.items()):
            print(f"  {render_key(key)}: {hits:5d} ({hits/n_calls*100:5.1f}%)")

    print(rule)
    _print_section("📊 FORMAT TYPE DISTRIBUTION:", per_format, lambda name: f"{name:15s}")
    _print_section("\n📝 INSTRUCTION VARIATION DISTRIBUTION:", per_variation, lambda idx: f"Variation {idx:2d}")
    print(rule)

# Summarise which formats / instruction variations the generation run above actually used.
display_cot_stats()

📊 FORMAT TYPE DISTRIBUTION:
  conversational :   219 ( 35.0%)
  paragraph      :   209 ( 33.4%)
  structured     :   197 ( 31.5%)

📝 INSTRUCTION VARIATION DISTRIBUTION:
  Variation  0:    52 (  8.3%)
  Variation  1:    53 (  8.5%)
  Variation  2:    75 ( 12.0%)
  Variation  3:    59 (  9.4%)
  Variation  4:    67 ( 10.7%)
  Variation  5:    59 (  9.4%)
  Variation  6:    51 (  8.2%)
  Variation  7:    65 ( 10.4%)
  Variation  8:    65 ( 10.4%)
  Variation  9:    79 ( 12.6%)


In [14]:
# Persist the dataset, now including the cot_steps column, without writing the index.
df_seccodeplt_sft.to_csv("seccodeplt_sft_data_with_cot_permutations.csv", index=False)