In [7]:
import getpass
import heapq
import os
from collections import defaultdict

import pandas as pd
import tiktoken
from dotenv import load_dotenv
from openai import OpenAI

# Take the key from the OPENAI_API_KEY environment variable when it is set; otherwise prompt for it.
# getpass is used instead of input() so the typed key is not echoed into the saved cell output.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass.getpass("Enter your OpenAI ChatGPT API Key")
)

MODEL = "gpt-3.5-turbo" # or "gpt-4"
# Directory (under the current working directory) that holds the input CSV files
CODE_PATH = os.path.join(os.getcwd(), "Activity_to_Goal")

# Sanity check: encoding then decoding must round-trip the text unchanged
enc = tiktoken.get_encoding("cl100k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model(MODEL)
def get_token_length(text):
    """Return how many tokens the module-level encoder `enc` produces for `text`."""
    token_ids = enc.encode(text)
    return len(token_ids)

# Load the data files into dataframes.
# keep_default_na=False keeps empty cells and the literal text 'NULL' as plain strings instead of
# converting them to NaN, which is why the filter below compares against the string 'NULL'.
try:
    task = pd.read_csv(f'{CODE_PATH}/Task.csv', keep_default_na=False)
    goal = pd.read_csv(f'{CODE_PATH}/Goal.csv', keep_default_na=False)
    dmn_demand = pd.read_csv(f'{CODE_PATH}/dmn_demand.csv', keep_default_na=False)
    pm_project = pd.read_csv(f'{CODE_PATH}/pm_project.csv', keep_default_na=False)
except Exception as exc:
    # Catch Exception (not a bare except) so Ctrl-C / kernel interrupts are not relabelled as a data
    # problem, and chain the original error so the traceback shows which file actually failed.
    raise Warning(f">>> Your dataset is NOT ready for the next step. Fix this first.") from exc

# One text blob per task: project name, activity name and description joined with a '%%' delimiter
task["TaskDescript"] = task['ProjectName'] + "%%" + task['ActivityName'] + "%%" + task['description']

# Keep only goals that have a parent objective and whose Level starts with a digit greater than 1
goal = goal[(goal['ParentObjective'] != 'NULL')]
goal = goal[goal['Level'].str[0].astype(int) > 1]

# Plain arrays used by the later cells: unique goal names and one description per task row
goals = goal['DisplayName'].drop_duplicates().values
descriptions = task['TaskDescript'].values

KeyboardInterrupt: Interrupted by user

In [2]:
# Instruction prompt sent as the "system" message of every goal-matching request; its token count is
# measured as `prompt_length` in a later cell.
# The string is not a raw string, so every literal \n below is an escape sequence that adds one more
# newline on top of the physical line break.
# NOTE(review): the wording contains typos (e.g. "alligns", "a experienced"); it is left untouched here
# because any change to the text also changes the prompt's token count and may change model output.
PROMPT = '''I want you to act like a experienced data scientist. I will provide to a description of a cybersecurity project task and a list of goals of a cybersecurity team and it will be your task \n
to estimate the probabilities that the description of the project activity in Set 1 alligns with the project goal descriptions in Set 2. 
Let's do it step by step.\n
1. First, represent the state of the goal list explicitly. Focus on extracting relevant information that describes the tasks and processes involved in each project. If necessary, pay attention to \n
specific keywords or phrases that indicate the nature of the project and mention them. If too long, focus on the lines closest to the start and end of the list. \n
2. Next, for the project task, recover the semantic meaning of each token in the task based on the descriptions of the functionality or purpose of the project. Include keywords  \n
or phrases that indicate the nature of the project or any other relevant information. \n
3. Given your understanding of the explicit goal list (step 1) and the task's semantic meanings (step 2),provide a concise description of the functionality or purpose of the project. 
4.  Finally, based on previous steps, estimate the probabilities that task described in the project description align with the goals mentioned. \n
Consider the extracted information and semantic meanings to make informed matches. For each project, determine the goal it aims to achieve and assign a probability to the match. \n
Please provide the model's predictions in the format Task:Goal (e.g., 1:B, 2:A). Enclose probability matchings with "[" and "]" \n
'''

In [3]:
#### !!![Important] Re-run this cell whenever the task descriptions are reset: the matching loop pops
#### entries off `heap`, so a stale heap would carry on from wherever the previous run stopped.

# Token cost of the fixed parts of every request: the instruction prompt and the goal list
goals_str = ",".join(goals)
prompt_length = get_token_length(PROMPT)
goals_length = get_token_length(goals_str)
print(f"Prompt Length: {prompt_length}")
print(f"Goals Length: {goals_length}")

# Token count of every task description, keyed by its position in `descriptions`
token_length_map = {
    position: get_token_length(task_text)
    for position, task_text in enumerate(descriptions)
}

# heapq is a min-heap, so the lengths are negated to make the longest description pop first
heap = [(-n_tokens, position) for position, n_tokens in token_length_map.items()]
heapq.heapify(heap)

Prompt Length: 329
Goals Length: 927


In [4]:
def summarize (text):
    """Summarize `text` with the ChatGPT API and return the summary.

    Sends `text` to the module-level `client` using `MODEL`, prints both the raw text and the
    resulting summary, and returns the summary string taken from the first completion choice.

    Args:
        text: The raw description to be summarized.

    Returns:
        The content of the first choice of the chat completion.
    """
    message = [
        {"role": "system", "content": "Summarize the text. Include keywords or phrases that indicate the nature of the project or any other relevant information. If there is a numbered list in the description, make to include the structure in the summary."},
        # Use the function's own argument; the previous code read an undefined name `raw` here
        {"role": "user", "content": f"Below is the given list of project tasks and descriptions: {text}"}
    ]
    completion = client.chat.completions.create(
        model = MODEL,
        messages = message
    )
    print(f"Raw Description:\n {text}")
    summary = completion.choices[0].message.content
    print(f"Summary:\n {summary}")
    return summary

In [5]:
def match_goals_and_descriptions(description, id):
    """Match project goals and task descriptions using ChatGPT API.

    Builds a chat request from `PROMPT`, the task described by `description` and the module-level
    `goals`, then writes the model's reply to `{MODEL}/v1/{id}-task-goal-match.txt`.

    Args:
        description: Task text in the form "project%%activity%%description". Only the first two
            '%%' delimiters are treated as separators, so the description part may itself contain
            '%%'. Text with fewer than two delimiters is used as the description with an empty
            project and activity name.
        id: Identifier used as the prefix of the output file name.

    Returns:
        The reply text that was written to the output file.
    """
    # maxsplit=2 keeps any '%%' inside the free-text description from breaking the 3-way unpack
    parts = description.split('%%', 2)
    if len(parts) == 3:
        project, activity, summary = parts
    else:
        project, activity, summary = "", "", description

    # Comma-join the goals (the same form used when counting goal tokens) rather than interpolating
    # the array repr, which drops separators and can abbreviate long arrays with "..."
    goals_text = ",".join(str(g) for g in goals)

    message = [
        {"role": "system", "content": PROMPT},
        {"role": "user", "content": f"Below I have a project with the project name: \"{project}\", activity name: \"{activity}\" and a long description like: \"{summary}\""},
        {"role": "user", "content": f"Below is the given list of project goals: {goals_text}"}
    ]
    completion = client.chat.completions.create(
        model = MODEL,
        messages = message
    )
    content = completion.choices[0].message.content

    # Create the full output directory, including the "v1" level the file is written into
    output_dir = f"{MODEL}/v1"
    os.makedirs(output_dir, exist_ok=True)
    # Explicit UTF-8 so non-ASCII characters in the reply do not depend on the platform default
    with open(f"{output_dir}/{id}-task-goal-match.txt", "w", encoding="utf-8") as f:
        f.write(content)
    return content

In [6]:
# Tokens consumed by the fixed parts of every request (instruction prompt + goal list)
description_length = prompt_length + goals_length
# Descriptions longer than this many tokens are summarized before being matched
SUMMARY_TOKEN_THRESHOLD = 2000
i = 0

# Process every task description, longest first, until the heap is empty
while heap:
    # Heap entries are (-token_length, task index); undo the negation to recover the real length.
    # `task_id` is used instead of `id` so the builtin is not shadowed for the rest of the notebook.
    neg_token_length, task_id = heapq.heappop(heap)
    raw_token_length = -neg_token_length
    description = descriptions[task_id]

    # Summarize the text with ChatGPT if the task description is longer than the threshold
    # If not, take the raw description as it is
    if raw_token_length > SUMMARY_TOKEN_THRESHOLD:
        # Summarize only the free-text part and put the '%%' delimiters back, because the matching
        # step splits the description into project / activity / description on them
        project, activity, raw = description.split('%%', 2)
        description = "%%".join([project, activity, summarize(raw)])
        summary_token_length = get_token_length(description)

    # NOTE(review): output files are numbered by processing order `i`, not by task index — confirm
    # this is the intended file-to-task mapping.
    match_goals_and_descriptions(description, i)
    i += 1


KeyboardInterrupt: 

In [None]:
# NOTE(review): `reference_message_apis` is not defined in any cell visible in this notebook, so on a
# fresh kernel this line would raise NameError — confirm where it should come from, or delete the cell.
print(reference_message_apis.keys())