# Prompt generation using ChatGPT

In [1]:
# Generate a base prompt from the description
import pandas as pd
import sys
import os
import openai
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from utils.prompts import prompt_template_description_dict, META_PROMPT, prompt_template_COT_dict

# Show every section of the description-based prompt template, one key/value pair at a time.
for section_name, section_text in prompt_template_description_dict.items():
    print(f"Key: {section_name}\nValue: {section_text}")

Key: instructions
Value: Your task is to update a well-structured provided prompt for a text classification task based on category descriptions. Ensure clarity and precision in the prompt so the language model can accurately classify the input. Incorporate category descriptions to guide the classification process effectively. Maintain a structured format to enhance understanding and consistency. Keep original '{input}' string at the end in the generated prompt.
Key: additional_details
Value: 
Category Descriptions:
 {labels_description}
Key: output_format
Value: The output should be a JSON format with the updated prompt and the refinement reason.
Key: examples
Value: {"updated_prompt": "content of updated prompt", "reason": "reason for the refinement"}



In [4]:
import json

# Read the OpenAI API key from the local token file; surrounding whitespace
# and trailing newlines are stripped before it is handed to the client.
token_path = ".vscode/.openai_api_token"
with open(token_path, "r") as token_file:
    api_key = token_file.read().strip()

client = openai.OpenAI(api_key=api_key)

# Call the ChatGPT API to generate a classification prompt from the label descriptions
def generate_prompt_from_description(data_path, samples=2):
    """Ask the chat model to rewrite a base classification prompt using label descriptions.

    Rows are sampled per ``cls_label`` from the JSON-lines file, their label
    descriptions are inserted into ``META_PROMPT`` together with the initial
    prompt, and the model's JSON reply is parsed and returned.

    Args:
        data_path: Path to a JSON-lines file with ``cls_label`` and
            ``description`` columns.
        samples: Number of rows sampled from each ``cls_label`` group.

    Returns:
        dict: The parsed JSON reply. The calling code reads its
        ``"updated_prompt"`` key.

    Raises:
        ValueError: If ``cls_label`` or ``description`` is missing from the data.
        json.JSONDecodeError: If the reply does not contain valid JSON.
    """
    # Load the data from the JSON-lines file
    data = pd.read_json(data_path, lines=True)

    # Validate before grouping: groupby("cls_label") would otherwise fail with
    # a KeyError and this check could never report a missing label column.
    if "cls_label" not in data.columns or "description" not in data.columns:
        raise ValueError("The input data must contain 'cls_label' and 'description' columns.")

    example_data = data.groupby("cls_label").sample(samples)

    # Sampling several rows per label can yield the same label/description pair
    # more than once; dict.fromkeys drops repeats while keeping first-seen order.
    unique_entries = dict.fromkeys(
        f"Label: {label}\nDescription: {description}\n"
        for label, description in zip(example_data["cls_label"], example_data["description"])
    )
    labels_description = "".join(unique_entries)

    previous_prompt = "Please classify the following input text into the appropriate category:\n\n {input}"
    gpt_prompt = META_PROMPT.format(
        instructions=prompt_template_description_dict["instructions"],
        additional_details=prompt_template_description_dict["additional_details"].format(
            labels_description=labels_description
        ),
        previous_prompt=previous_prompt,
        output_format=prompt_template_description_dict["output_format"],
        examples=prompt_template_description_dict["examples"],
    )
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": gpt_prompt},
        ],
        temperature=0.7,
    )

    content = response.choices[0].message.content or ""

    # The model may wrap the JSON object in a Markdown code fence or add prose
    # around it; parse only the span from the first "{" to the last "}".
    start, end = content.find("{"), content.rfind("}")
    json_text = content[start:end + 1] if 0 <= start < end else content
    content_dict = json.loads(json_text)

    return content_dict


def save_to_jsonl(content_dict, filename="output.jsonl"):
    """Append ``content_dict`` to ``filename`` as a single JSON line.

    Args:
        content_dict: JSON-serialisable object to store.
        filename: Target JSON-lines file. It is opened in append mode, and its
            parent directory is created when it does not exist yet.
    """
    # A bare file name has no directory part; os.makedirs("") would raise.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        f.write(json.dumps(content_dict, ensure_ascii=False) + "\n")


# Generate the description-based prompt for the dataset and append it to the prompt database.
input_datasets_path = "assets/LDD_split.json"

# The output file is named after the dataset file without its directory or extension.
dataset_stem = input_datasets_path.split("/")[-1].split(".")[0]
output_file_name = f"utils/prompt_database/{dataset_stem}_prompt_from_description.jsonl"

content_dict = generate_prompt_from_description(input_datasets_path)
save_to_jsonl(content_dict, output_file_name)

print(f"Prompt saved to {output_file_name}")
print(f"Prompt generated: {content_dict['updated_prompt']}")


Prompt saved to utils/prompt_database/LDD_split_prompt_from_description.jsonl
Prompt generated: To accurately classify the input text, first understand its content and context, then match it to the most appropriate category based on the descriptions provided below:

- **cs.AI**: Involves topics related to Artificial Intelligence.
- **cs.CE**: Related to Computational Engineering.
- **cs.CV**: Pertains to Computer Vision.
- **cs.DS**: Concerns Data Structures.
- **cs.IT**: Deals with Information Theory.
- **cs.NE**: Focuses on Neural and Evolutionary Computing.
- **cs.PL**: Involves Programming Languages.
- **cs.SY**: Related to Systems and Control.
- **math.AC**: Pertains to Commutative Algebra.
- **math.GR**: Involves Group Theory.
- **math.ST**: Related to Statistics Theory.

Please classify the following input text into the appropriate category based on the descriptions above:

{input}


## Generate a COT prompt from the data examples

In [10]:
import json
from utils.prompts import prompt_template_description_dict, META_PROMPT, prompt_template_COT_dict

# Load the OpenAI API key from the local token file (whitespace and newlines
# around it are stripped) and build the client used by this cell.
token_path = ".vscode/.openai_api_token"
with open(token_path, "r") as token_file:
    api_key = token_file.read().strip()

client = openai.OpenAI(api_key=api_key)

# Call the ChatGPT API repeatedly to refine a chain-of-thought (COT) prompt from the label descriptions
def generate_COT_prompt_from_description(data_path, samples=2):
    """Iteratively refine a classification prompt using the COT prompt template.

    Rows are sampled per ``cls_label`` from the JSON-lines file. For every
    sampled row its label description is appended to the accumulated
    descriptions, ``META_PROMPT`` is filled with ``prompt_template_COT_dict``
    and the current prompt, and the ``"updated_prompt"`` of the model's JSON
    reply becomes the prompt refined in the next iteration.

    Args:
        data_path: Path to a JSON-lines file with ``cls_label`` and
            ``description`` columns.
        samples: Number of rows sampled from each ``cls_label`` group; one API
            call is made per sampled row.

    Returns:
        dict: The parsed JSON reply of the last iteration.

    Raises:
        ValueError: If a required column is missing or no rows were sampled.
        json.JSONDecodeError: If a reply does not contain valid JSON.
        KeyError: If a reply has no ``"updated_prompt"`` key.
    """
    # Load the data from the JSON-lines file
    data = pd.read_json(data_path, lines=True)

    # Validate before grouping: groupby("cls_label") would otherwise fail with
    # a KeyError and this check could never report a missing label column.
    if "cls_label" not in data.columns or "description" not in data.columns:
        raise ValueError("The input data must contain 'cls_label' and 'description' columns.")

    example_data = data.groupby("cls_label").sample(samples)

    labels_description = ""
    previous_prompt = "Please classify the following input text into the appropriate category:\n\n {input}"
    content_dict = None
    for _, row in example_data.iterrows():
        label = row["cls_label"]
        description = row["description"]
        labels_description += f"Label: {label}\nDescription: {description}\n\n"

        gpt_prompt = META_PROMPT.format(
            instructions=prompt_template_COT_dict["instructions"],
            additional_details=prompt_template_COT_dict["additional_details"].format(
                labels_description=labels_description
            ),
            previous_prompt=previous_prompt,
            output_format=prompt_template_COT_dict["output_format"],
            examples=prompt_template_COT_dict["examples"],
        )
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": gpt_prompt},
            ],
            temperature=0.7,
        )

        content = response.choices[0].message.content or ""

        # Parse only the span from the first "{" to the last "}". Globally
        # deleting "```" and "json" would also strip those substrings from the
        # generated prompt text itself.
        start, end = content.find("{"), content.rfind("}")
        json_text = content[start:end + 1] if 0 <= start < end else content
        content_dict = json.loads(json_text)
        previous_prompt = content_dict["updated_prompt"]

    if content_dict is None:
        # The loop never ran, so there is no reply to return.
        raise ValueError("No examples were sampled from the input data.")

    return content_dict


def save_to_jsonl(content_dict, filename="output.jsonl"):
    """Append ``content_dict`` to ``filename`` as a single JSON line.

    Args:
        content_dict: JSON-serialisable object to store.
        filename: Target JSON-lines file. It is opened in append mode, and its
            parent directory is created when it does not exist yet.
    """
    # A bare file name has no directory part; os.makedirs("") would raise.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        f.write(json.dumps(content_dict, ensure_ascii=False) + "\n")


# Generate the COT prompt for the dataset and append it to the prompt database.
input_datasets_path = "assets/LDD_split.json"

# The output file is named after the dataset file without its directory or extension.
dataset_stem = input_datasets_path.split("/")[-1].split(".")[0]
output_file_name = f"utils/prompt_database/{dataset_stem}_COT_prompt_from_description.jsonl"

content_dict = generate_COT_prompt_from_description(input_datasets_path, 2)
save_to_jsonl(content_dict, output_file_name)

print(f"Prompt saved to {output_file_name}")
print(f"Prompt generated: {content_dict['updated_prompt']}")


Prompt saved to utils/prompt_database/LDD_split_COT_prompt_from_description.jsonl
Prompt generated: To ensure accurate classification of the input text into the appropriate category, follow these structured reasoning steps: 
1. **Analyze Input Text**: Begin by closely examining the input text. Identify key ideas and specific terminologies that relate closely to the categories listed below. Pay attention to unique concepts or phrases indicative of particular fields.
2. **Review Category Descriptions**: Read and understand the scope and focus of each category. Here are the descriptions to guide you:
   - 'cs.AI': Focuses on Artificial Intelligence, involving algorithms, machine learning techniques, and AI applications.
   - 'cs.CE': Encompasses Computational Engineering, including simulations, system design, and computational methods in engineering.
   - 'cs.CV': Pertains to Computer Vision, dealing with image recognition, processing, and computer-based visual tasks.
   - 'cs.DS': Involv

## Generate a Few Shot Prompt from the data examples

In [None]:

import json
from utils.prompts import prompt_template_few_shot_dict
# Imports for generating a few-shot prompt from the data examples
import pandas as pd
import sys
import os
import openai
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from utils.prompts import prompt_template_description_dict, META_PROMPT, prompt_template_COT_dict, META_PROMPT_JSON_FREE


# Fetch the OpenAI API key from the local token file, dropping any stray
# whitespace or newline, then create the client for the few-shot cell.
token_path = ".vscode/.openai_api_token"
with open(token_path, "r") as token_file:
    api_key = token_file.read().strip()

client = openai.OpenAI(api_key=api_key)

# Call the ChatGPT API to summarize sampled examples and generate a few-shot prompt from them
def generate_few_shot_prompt_from_description(data_path, samples=2):
    """Build a few-shot classification prompt from summarized data examples.

    Rows are sampled per ``cls_label`` from the JSON-lines file. Each sampled
    row's ``content`` is summarized by the chat model, and the numbered
    label/description/summary entries are inserted into
    ``META_PROMPT_JSON_FREE`` together with the initial prompt. The model's
    reply to that meta prompt is returned as-is.

    Args:
        data_path: Path to a JSON-lines file with ``cls_label``,
            ``description`` and ``content`` columns.
        samples: Number of rows sampled from each ``cls_label`` group; one
            summarization call is made per sampled row.

    Returns:
        The text content of the model's reply (not parsed as JSON).

    Raises:
        ValueError: If a required column is missing from the data.
    """
    # Load the data from the JSON-lines file
    data = pd.read_json(data_path, lines=True)

    # Validate before grouping: groupby("cls_label") would otherwise fail with
    # a KeyError, and "content" is read for every sampled row below.
    required_columns = ("cls_label", "description", "content")
    if any(column not in data.columns for column in required_columns):
        raise ValueError("The input data must contain 'cls_label', 'description' and 'content' columns.")

    example_data = data.groupby("cls_label").sample(samples)

    labels_description = ""
    previous_prompt = "Please classify the following input text into the appropriate category:\n\n {input}"
    for i, (_, row) in enumerate(example_data.iterrows()):
        label = row["cls_label"]
        description = row["description"]
        example_content = row["content"]
        summ_prompt = f"Please summarize the following content in 200 words:\n\n{example_content}"

        # The summary, not the raw content, is what goes into the few-shot entry.
        summarized_content = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": summ_prompt},
            ],
            temperature=0.7,
        ).choices[0].message.content

        labels_description += f"Label {i}: {label}\nDescription {i}: {description}\nContent {i}: {summarized_content}\n\n"

    gpt_prompt = META_PROMPT_JSON_FREE.format(
        instructions=prompt_template_few_shot_dict["instructions"],
        additional_details=prompt_template_few_shot_dict["additional_details"].format(
            labels_description=labels_description
        ),
        previous_prompt=previous_prompt,
        output_format=prompt_template_few_shot_dict["output_format"],
    )
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": gpt_prompt},
        ],
        temperature=0.7,
    )

    # Free-form reply text; it is returned without JSON parsing.
    content = response.choices[0].message.content
    return content


def save_to_jsonl(content_dict, filename="output.jsonl"):
    """Append ``content_dict`` to ``filename`` as a single JSON line.

    Args:
        content_dict: JSON-serialisable object to store.
        filename: Target JSON-lines file. It is opened in append mode, and its
            parent directory is created when it does not exist yet.
    """
    # A bare file name has no directory part; os.makedirs("") would raise.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        f.write(json.dumps(content_dict, ensure_ascii=False) + "\n")


# Generate the few-shot prompt for the dataset and append it to the prompt database.
input_datasets_path = "assets/LDD_split.json"

# The output file is named after the dataset file without its directory or extension.
dataset_stem = input_datasets_path.split("/")[-1].split(".")[0]
output_file_name = f"utils/prompt_database/{dataset_stem}_few_shot_prompt_from_description.jsonl"

# The few-shot generator returns plain text, so wrap it in the same record
# shape that the other prompt files use.
content = generate_few_shot_prompt_from_description(input_datasets_path)
content_dict = {"updated_prompt": content}
save_to_jsonl(content_dict, output_file_name)

print(f"Prompt saved to {output_file_name}")
print(f"Prompt generated: {content_dict['updated_prompt']}")


# Refined Few-Shot Prompt for Text Classification Task

## Task Description:
The objective of this task is to classify a given text into a specific category based on its content. The categories are predefined, each associated with a specific field of study or topic. The classifier needs to analyze the content of the text and determine which category it best fits based on the descriptions provided.

## Category Descriptions:
- **Label:** cs.AI  
  **Description:** Artificial Intelligence  
  **Content:** Articles, reports, or studies primarily focused on advancements, methodologies, applications, and theoretical aspects of artificial intelligence.

## Instructions for Classification:
1. **Read and Understand the Text:** Start by thoroughly reading the input text.
2. **Analyze the Content:** Evaluate the content against the category description provided. Focus on the main subject, methodologies, applications, and the field of study discussed.
3. **Match with Category:** Determine which c