### Method
1. Load the libraries
2. Define the list of all competencies (must be in a free text format following the provided structure)
3. Define the dictionary of competencies that you want generated.
4. Generate the data

In [None]:
#Import libraries 
import regex as re
from tqdm import tqdm
import json
import random
import numpy as np

#For OpenAI
import openai
from openai import OpenAI
import os

In [None]:
# Example competence list.
# Each entry is a free-text rule description made of:
#   * one or more `match_N = o(r'<regex>', text)` lines,
#   * `if` / `elif` lines naming which matches must hold (the conditions have no
#     bodies; in this notebook the text is only parsed as a string and embedded
#     in the generation prompt, never executed),
#   * a final `label = '<competence name>'` line.
# NOTE(review): `o` is not defined in this notebook - presumably a regex-search
# helper from the pipeline these rules were taken from; confirm.
# The entries are regular (non-raw) string literals, so `\\b` is stored as `\b`.
competencies = [
    """
        match_1 = o(r'\\butility\s+meter\w*', text)
        match_2 = o(r'\\bmetering\w*|\\bmeter\s+read\w*', text)
        match_3 = o(r'\\butility\w*', text)
        if match_1:
        elif match_2 and match_3:
        label = 'report utility meter readings'
    """,
    """
        match_1 = o(r'\\butility\s+meter\w*|\\bgas\s+meter\w*|\\belectricity\s+meter\w*|\\bheat[i]?[n]?[g]?\s+meter\w*|\\belectric\s+meter\w*|\\belectrical\s+meter\w*|\\bwater\s+meter\w*', text)
        match_2 = o(r'\\bfault\w*|\\bissue\w*|\\bproblem\w*', text)
        if match_1 and match_2:
        label = 'identify faults in utility meter meters'
    """,
    """
        match_1 = o(r'\\butility\s+payment\w*|\\bgas\s+payment\w*|\\bheat\s+payment\w*|\\bheating\s+payment\w*|\\belectric\s+payment\w*|\\belectricity\s+payment\w*|\\butility\s+bill\w*|\\bgas\s+bill\w*|\\bheat\s+bill\w*|\\bheating\s+bill\w*|\\belectric\s+bill\w*|\\belectricity\s+bill\w*|\\butility\s+invoice\w*|\\bgas\s+invoice\w*|\\bheat\s+invoice\w*|\\bheating\s+invoice\w*|\\belectric\s+invoice\w*|\\belectricity\s+invoice\w*', text)
        if match_1:
        label = 'calculate utility payments'
    """,
    """
        match_1 = o(r'\\bwater\s+meter\w*', text)
        match_2 = o(r'\\bmetering\w*|\\bmeter\s+read\w*|\\bread[i]?[n]?[g]?\s+meter\w*', text)
        match_3 = o(r'\\bwater\w*', text)
        if match_1:
        elif match_2 and match_3:
        label = 'read water meter'
    """
]

In [None]:
def parse_skill_list(skill_list):
    """Map each competence label to the rule text that precedes it.

    Args:
        skill_list: Iterable of free-text rule strings. Each string is
            expected to contain a ``label = '<name>'`` assignment.

    Returns:
        dict: ``{label: rules}`` where ``rules`` is everything in the entry
        before the first label assignment, with surrounding whitespace
        stripped. Entries without a label assignment are ignored; if a label
        occurs more than once, the last entry wins.
    """
    label_regex = r"label\s*=\s*'(.*?)'"
    parsed = {}

    for entry in skill_list:
        found = re.search(label_regex, entry)
        if found is None:
            # No label assignment in this entry: nothing to register.
            continue

        # Everything up to the first label assignment holds the match
        # definitions and the conditions that combine them.
        parsed[found.group(1)] = entry[:found.start()].strip()

    return parsed

In [None]:
#Get the list of competencies to generate
#This can differ from the original competencies list in terms of the requested competencies (you might not want all)
label_pattern = r"label\s*=\s*'([^']*)'"

# Number of example sentences requested for every competence
to_generate = 10

# Map every label found in `competencies` to the number of sentences to generate
competencies_to_gen = {}
for item in competencies:
    match = re.search(label_pattern, item)
    if not match:
        # Entry without a `label = '...'` line: there is nothing to request for it.
        # (Registering outside this check would raise NameError for a leading
        # unlabeled entry or silently re-apply the previous label.)
        continue
    label = match.group(1)
    competencies_to_gen[label] = to_generate

In [None]:
def combine_generations(raw_rules, missing_entries):
    """Build the list of generation requests consumed by the generation pipeline.

    Args:
        raw_rules: dict mapping a competence label to its rules text.
        missing_entries: dict mapping a competence label to the number of
            example sentences to generate for it.

    Returns:
        list[dict]: one ``{'label', 'rules', 'to_generate'}`` dictionary per
        label of ``missing_entries`` that also has rules in ``raw_rules``,
        in the iteration order of ``missing_entries``.

    Labels requested in ``missing_entries`` that have no rules are skipped
    with a warning, because nothing can be generated for them. (Previously such
    a label raised ``UnboundLocalError`` or duplicated the preceding entry.)
    """
    import warnings

    list_of_dict = []
    for label, no_examples in missing_entries.items():
        if label not in raw_rules:
            warnings.warn(
                f"No rules found for competence {label!r}; skipping it.",
                stacklevel=2,
            )
            continue
        list_of_dict.append(
            {
                'label': label,
                'rules': raw_rules[label],
                'to_generate': no_examples,
            }
        )
    return list_of_dict

In [None]:
#Process the competencies: parse the raw rule strings into a {label: rules text} dictionary
competence_rules = parse_skill_list(competencies)

In [None]:
#Define a list of dictionaries pairing each requested competence with its rules and the number of sentences to generate
combined_list_all = combine_generations(competence_rules, competencies_to_gen)

In [None]:
#!!!YOU NEED AN OPENAI ACCOUNT TO USE THIS NOTEBOOK!!!
#Make sure to get your API key and set it in the environment as OPENAI_API_KEY (refer to: https://platform.openai.com/docs/api-reference/introduction)
#os.environ.get returns None when the variable is not set
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [None]:
#System prompt sent with every generation request: sets the persona, asks for JSON output under the key 'sentences' and limits each generated sequence to 25 tokens
starter = "You are an Human Resources worker and you were tasked with compiling the fragments of job postings based on the specified competencies. The generated sentences must resemble examples seen in actual job postings and can also request other competencies in addition to the one specified. You always return the generated sentences in a JSON format using a key 'sentences'. Generated sequences must not exceed 25 tokens and must always correspond to the provided rules variations."

In [None]:
def generate_sentences_with_word(dictioanry, *, model="gpt-3.5-turbo", max_tokens=250):
    """Generate example sentences for every competence request.

    Args:
        dictioanry: Iterable of request dictionaries with the keys
            ``'label'`` (competence name), ``'rules'`` (rules text) and
            ``'to_generate'`` (number of sentences to ask for).
        model: Name of the chat model to query.
        max_tokens: Completion token limit passed to the API for each request.

    Returns:
        list[dict]: one ``{"competence": label, "examples": sentences}``
        dictionary per request. When the model output cannot be parsed or does
        not contain the ``'sentences'`` key, ``examples`` is the placeholder
        ``['The function failed for this word']``.

    Errors raised by the API call itself are not caught and propagate to the
    caller.
    """
    output_list = []

    for item in tqdm(dictioanry):
        label = item['label']
        rules = item['rules']
        sentences_to_gen = item['to_generate']

        # NOTE(review): with the default limit of 250 tokens, a request for many
        # sentences may be cut off mid-JSON and end up as the failure
        # placeholder - consider raising max_tokens for larger requests.
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": starter},
                {"role": "user", "content": f"Generate a set of {sentences_to_gen} sentences describing the {label} competence. Base your generation on one of the below rules variations: {rules}. The output must be in the Python list format and use key 'sentences'."}
            ],
            max_tokens=max_tokens,
            response_format={"type": "json_object"}
        )

        input_data = response.choices[0].message.content
        try:
            sentences = json.loads(input_data)['sentences']
        except (ValueError, KeyError, TypeError):
            # ValueError: invalid or truncated JSON (json.JSONDecodeError is a subclass).
            # KeyError: valid JSON object without the 'sentences' key.
            # TypeError: content is None or the JSON root is not an object.
            # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
            sentences = ['The function failed for this word']

        output_list.append(
            {
                "competence": label,
                "examples": sentences,
            }
        )

    return output_list

In [None]:
#The output of the function above should be a list of dictionaries. Generation errors can occur, so inspect your data for "The function failed for this word" examples.
#For a good balance between efficiency and generation control, we recommend generating for up to 100 competencies at a time.