<h1 align ="center"> Dynamic Prompting for task completion</h1>
<hr>

Recent papers such as [Do Prompt-Based Models Really Understand the Meaning of their Prompts?](https://arxiv.org/abs/2109.01247) and [What Makes Good In-Context Examples for GPT-3?](https://aclanthology.org/2022.deelio-1.10.pdf) have shown that using a dynamic set of examples instead of a fixed set of examples helps GPT-3 perform the task with higher accuracy.

In [1]:
# if needed, upgrade to the latest version of the OpenAI Python library
%pip install --upgrade openai
%pip install --upgrade torch
%pip install --upgrade sentence_transformers
%pip install --upgrade numpy
%pip install --upgrade datasets
%pip install --upgrade scikit-learn

In [2]:
# import os module & the OpenAI Python library for calling the OpenAI API
# please make sure you have installed required libraries via pip install -r requirements.txt
import os
import openai
import json
from sentence_transformers import SentenceTransformer, util
import numpy as np
from datasets import load_dataset
from sklearn.metrics import classification_report
from torch import cuda
# use the GPU when torch reports that CUDA is available, otherwise fall back to the CPU
device = 'cuda' if cuda.is_available() else 'cpu'

# Load Dataset

In [3]:
# fetch the "trec" dataset through the Hugging Face `datasets` library
dataset = load_dataset("trec")
# columns holding the label id and the question text
label_type = 'coarse_label'
text_key = "text"
# class names of the label column; a name's position in this list is its id
class_names = dataset['train'].features[label_type].names
# lookup tables: id -> class name and class name -> id
id2class = {idx: name for idx, name in enumerate(class_names)}
class2id = {name: idx for idx, name in enumerate(class_names)}
# group the training questions by class name (every class starts with an empty list)
class2TrainDataset = {name: [] for name in class_names}
for example in dataset['train']:
    class2TrainDataset[id2class[example[label_type]]].append(example[text_key])

# Task Prompt

In [4]:
# instruction part of the prompt: describes the task and lists the class names
# (the keys of class2TrainDataset, in insertion order)
task_prompt = (
    "As a Question Answering agent, your goal is to categorize questions into different semantic classes that impose constraints on potential answers, so that they can be utilized in later stages of the question answering process.\nFollowing are the semantic classes: ["
    + ", ".join(class2TrainDataset)
    + "]"
)
# text placed right before the question to classify
query_prompt = "\nClassify the following question into one of the above classes.\nquestion: "
# text placed right after the question; the model continues from here
answer_prompt = "\noutput: "

# Setup OpenAI API

In [5]:
# Read the settings stored in config.json (resolved against the current working directory)
with open('config.json', 'r') as config_handle:
    config_details = json.load(config_handle)

# Deployment name passed as `engine` when requesting completions
model_name = config_details['COMPLETIONS_MODEL']

# Use the Azure flavour of the OpenAI API
openai.api_type = "azure"

# API key of the Azure OpenAI resource, taken from the OPENAI_API_KEY environment variable
# (os.getenv yields None when the variable is not set)
openai.api_key = os.getenv("OPENAI_API_KEY")

# Base URL of the Azure OpenAI resource, e.g. "https://<your resource name>.openai.azure.com"
openai.api_base = config_details['OPENAI_API_BASE']

# API version to call, e.g. 2022-12-01
openai.api_version = config_details['OPENAI_API_VERSION']

In [6]:
# Text completion using GPT
def trim_text(text):
    """Remove surrounding whitespace and newline artifacts from ``text``.

    Strips leading/trailing whitespace (including real newlines) as well as
    any leading/trailing *literal* backslash-n sequences (the two characters
    ``\\`` and ``n``).

    Args:
        text: the string to clean.

    Returns:
        The cleaned string; inner content is left untouched.
    """
    # str.strip('\\n') would treat its argument as a SET of characters and eat
    # any leading/trailing 'n' or backslash (e.g. "nation" -> "atio"), so the
    # literal sequence is removed explicitly instead.
    literal_newline = "\\n"
    trimmed = text.strip()
    while True:
        if trimmed.startswith(literal_newline):
            trimmed = trimmed[len(literal_newline):].lstrip()
        elif trimmed.endswith(literal_newline):
            trimmed = trimmed[:-len(literal_newline)].rstrip()
        else:
            return trimmed
    
def _strip_edge_newlines(text):
    """Drop real newlines and literal backslash-n sequences from both ends of ``text``."""
    literal_newline = "\\n"
    while True:
        stripped = text.strip("\n")
        if stripped.startswith(literal_newline):
            stripped = stripped[len(literal_newline):]
        if stripped.endswith(literal_newline):
            stripped = stripped[:-len(literal_newline)]
        if stripped == text:
            return text
        text = stripped


def generate_using_gpt(prompt, max_tokens=3):
    """Request a completion for ``prompt`` and return the generated text.

    The request is sent through ``openai.Completion.create`` to the deployment
    named by the module-level ``model_name`` with temperature 0.

    Args:
        prompt: full prompt text sent to the model.
        max_tokens: upper bound on the number of generated tokens
            (defaults to 3, the value originally hard-coded here).

    Returns:
        The text of the first choice with leading/trailing newlines (real or
        literal backslash-n) removed, or an empty string when the response
        has no usable choice.
    """
    response = openai.Completion.create(
        engine=model_name,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0,
        top_p=1,
        stop=None,
        frequency_penalty=0,
        presence_penalty=0.0)
    choices = response.get("choices") or []
    # Without this early return an empty response would fall through to
    # choices[0] below and raise instead of yielding an empty string.
    if len(choices) == 0 or "text" not in choices[0]:
        return ""
    # lstrip('\\n')/rstrip('\\n') strip the characters '\' and 'n' individually,
    # which can eat real letters; only whole newline markers are removed here.
    return _strip_edge_newlines(choices[0]["text"])

# Zero-shot Prompt

In [7]:
# zero-shot run: the prompt carries no examples from the training dataset
labels = []
predictions = []
for example in dataset['test']:
    question = example[text_key]
    zeroshot_prompt = "".join([task_prompt, query_prompt, question, answer_prompt])
    pred = trim_text(generate_using_gpt(zeroshot_prompt))
    # gold label id first, then the id of the predicted class name
    labels.append(example[label_type])
    predictions.append(class2id[pred])

report = classification_report(labels, predictions)

In [8]:
# print() rather than a bare expression so the report's line breaks are rendered
print(report)

              precision    recall  f1-score   support

           0       0.69      1.00      0.82         9
           1       0.28      0.68      0.40        94
           2       0.77      0.14      0.24       138
           3       0.82      0.22      0.34        65
           4       0.66      0.90      0.76        81
           5       0.81      0.78      0.80       113

    accuracy                           0.54       500
   macro avg       0.67      0.62      0.56       500
weighted avg       0.68      0.54      0.51       500



# Few-shot Prompt

In [9]:
# function to select a few examples from each of the classes in the training dataset
def generateFewshotPrompt(class2TrainDataset, N=3):
    """Build the examples section of a few-shot prompt.

    Args:
        class2TrainDataset: dict mapping a class label to its list of
            example questions.
        N: number of examples taken from the front of each class's list.

    Returns:
        A string that starts with the examples header and contains one
        "question: ... / output: <label>" pair per selected example,
        grouped by class in the dict's iteration order.
    """
    pieces = ["\nFollowing are some examples."]
    for label, questions in class2TrainDataset.items():
        pieces.extend(
            "\nquestion: " + question + "\noutput: " + label
            for question in questions[:N]
        )
    return "".join(pieces)

In [10]:
# few-shot run: the same fixed examples (one per class) are added to every prompt
labels = []
predictions = []
fewshot_examples = generateFewshotPrompt(class2TrainDataset, N=1)
for example in dataset['test']:
    question = example[text_key]
    fewshot_prompt = "".join([task_prompt, fewshot_examples, query_prompt, question, answer_prompt])
    pred = trim_text(generate_using_gpt(fewshot_prompt))
    # gold label id first, then the id of the predicted class name
    labels.append(example[label_type])
    predictions.append(class2id[pred])

report = classification_report(labels, predictions)

In [11]:
# print() rather than a bare expression so the report's line breaks are rendered
print(report)

              precision    recall  f1-score   support

           0       0.75      1.00      0.86         9
           1       0.39      0.74      0.51        94
           2       0.87      0.42      0.57       138
           3       0.91      0.45      0.60        65
           4       0.95      0.88      0.91        81
           5       0.84      1.00      0.91       113

    accuracy                           0.70       500
   macro avg       0.78      0.75      0.73       500
weighted avg       0.79      0.70      0.70       500



# Extract Embeddings for Training dataset

In [12]:
# load the 'all-mpnet-base-v2' SentenceTransformer model on the selected device
model = SentenceTransformer('all-mpnet-base-v2', device=device)

# compute one embedding per example
def ExtractEmbeddings(examples):
    """Encode every item of ``examples`` with the module-level ``model``.

    Each example is passed to ``model.encode`` on its own.

    Returns:
        A list with one embedding per example, in input order.
    """
    return [model.encode(example) for example in examples]

# for every class keep a pair [training questions, their embeddings]
class2TrainDatasetWithEmbedding = {
    label: [questions, ExtractEmbeddings(questions)]
    for label, questions in class2TrainDataset.items()
}

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

# Dynamic Few-shot Prompt

In [13]:
# rank the examples of one class by similarity to the input text
def getSimilarExamples(input_text, dataset, dataset_embedding):
    """Return the items of ``dataset`` ordered from highest to lowest score.

    Args:
        input_text: text to compare against; encoded with the module-level ``model``.
        dataset: indexable collection of examples.
        dataset_embedding: embeddings of ``dataset``, position-aligned with it.

    Returns:
        A list with every item of ``dataset``, sorted by descending
        ``util.dot_score`` between the input embedding and the item's embedding.
    """
    query_embedding = model.encode(input_text)
    scores = util.dot_score(query_embedding, dataset_embedding)[0]
    # argsort on the negated scores gives indices from highest to lowest score
    ranked_ids = np.argsort(-scores)
    ordered_examples = []
    for idx in ranked_ids:
        ordered_examples.append(dataset[idx])
    return ordered_examples
    
def getClasswiseSimilarExamples(input_text, class2TrainDatasetWithEmbedding):
    """Rank the training examples of every class by similarity to ``input_text``.

    Args:
        input_text: text whose nearest examples are wanted.
        class2TrainDatasetWithEmbedding: dict mapping a class label to a pair
            ``[examples, embeddings]``.

    Returns:
        A dict mapping each label to that class's examples as ordered by
        ``getSimilarExamples``.
    """
    # Iterate over the argument itself rather than the global class2TrainDataset,
    # so the function works with whatever mapping the caller passes in.
    return {
        label: getSimilarExamples(input_text, examples, embeddings)
        for label, (examples, embeddings) in class2TrainDatasetWithEmbedding.items()
    }

In [14]:
# build the examples section from the examples ranked first for the input, per class
def generateDynamicPrompt(input_text, class2TrainDatasetWithEmbedding, N=3):
    """Build a few-shot examples section tailored to ``input_text``.

    Args:
        input_text: the question the prompt is being built for.
        class2TrainDatasetWithEmbedding: per-class examples and embeddings,
            forwarded to ``getClasswiseSimilarExamples``.
        N: number of top-ranked examples to include per class.

    Returns:
        A string that starts with the examples header followed by one
        "question: ... / output: <label>" pair per selected example.
    """
    ranked_by_class = getClasswiseSimilarExamples(input_text, class2TrainDatasetWithEmbedding)
    pieces = ["\nFollowing are some examples."]
    for label, ranked_examples in ranked_by_class.items():
        for question in ranked_examples[:N]:
            pieces.append("\nquestion: " + question)
            pieces.append("\noutput: " + label)
    return "".join(pieces)

In [15]:
# dynamic few-shot run: the examples are re-selected for every test question
labels = []
predictions = []
for example in dataset['test']:
    fewshot_examples = generateDynamicPrompt(example[text_key], class2TrainDatasetWithEmbedding, N=1)
    dynamic_prompt = task_prompt + fewshot_examples + query_prompt + example[text_key] + answer_prompt
    # the completion helper in this notebook is named generate_using_gpt
    # (the former call to generate_using_gpt3 raised NameError on a fresh kernel)
    pred = generate_using_gpt(dynamic_prompt)
    pred = trim_text(pred)
    labels.append(example[label_type])
    predictions.append(class2id[pred])

report = classification_report(labels, predictions)

In [16]:
# print() rather than a bare expression so the report's line breaks are rendered
print(report)

              precision    recall  f1-score   support

           0       0.69      1.00      0.82         9
           1       0.61      0.80      0.69        94
           2       0.88      0.68      0.77       138
           3       0.95      0.88      0.91        65
           4       0.93      0.86      0.90        81
           5       0.90      0.98      0.94       113

    accuracy                           0.83       500
   macro avg       0.83      0.87      0.84       500
weighted avg       0.85      0.83      0.83       500

