# Loading

In [1]:
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
import openai
import string
import json
import copy

import os
from dotenv import load_dotenv


In [None]:
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY_1")
model="gpt-3.5-turbo"

client = OpenAI(
    api_key=openai_api_key
)

try:
    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Ping"}]
    )
    print("Success:", resp.choices[0].message.content)
except Exception as e:
    # <-- note the closing ')' here
    print("Request failed:", type(e).__name__, e)

# Seniority

In [3]:
train_df = pd.read_csv('../data/seniority_labelled_development_set_cleaned.csv')
test_df = pd.read_csv('../data/seniority_labelled_test_set_cleaned.csv')

In [4]:
train_df.iloc[1131]

job_id                                                             33027740
job_title                                                   Fitter & Welder
job_summary               A food manufacturing company has an opportunit...
job_ad_details            A well known food manufacturer operating since...
classification_name                    Manufacturing, Transport & Logistics
subclassification_name                                                Other
y_true                                                          experienced
Name: 1131, dtype: object

# RBIC Functions

### RB


In [5]:
#== step 1: role explanation
def step_1():
    messages = [
        {"role": "system", "content": "You are an expert job ad annotator. Your task is to infer the seniority information from job descriptions. The seniority label may be present in the set: [intermediate, senior, lead, head, experienced, entry-level, executive, assistant, senior/lead, deputy, director, trainee, associate, graduate, junior, general-manager, coordinator, student, chief, principal, apprentice, qualified, entry-level to intermediate, senior associate, standard, senior assistant, specialist, mid-level, entry level assistant, experienced assistant, manager, graduate/junior, independent, 1st year apprentice, senior-executive, junior assistant, assistant manager, supervisor, second-in-command, associate director, board, 4th year apprentice, mid-senior, regional head, middle-management, advanced, 2nd year apprentice, intermediate apprentice, level 2, assistant head, owner, post-doctoral, owner-operator, middle management, senior head, assistant director, junior-intermediate, sous, intermediate to senior, senior executive] . If not present in the set, then create a label."},
        {"role": "user", "content": "Based on your role, can you briefly explain what seniority means, and what seniority labels look like?"}
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    messages.append({"role": "assistant", "content": response.choices[0].message.content})

    return response.choices[0].message.content, messages

In [6]:
#== few-shot examples
def fewshot(messages):
    # messages.append({
    #     "role": "user",
    #     "content": "I will provide you with some examples on how to accomplish your task"
    # })

    # response = client.chat.completions.create(
    #     model=model,
    #     messages=messages
    # )
    # messages.append({"role": "assistant", "content": response.choices[0].message.content})

    few_shot_indices = [2081,1,5,12,3]

    for i in few_shot_indices:
        desc = {
          "job_title": train_df.iloc[i].job_title,
          "job_summary": train_df.iloc[i].job_summary,
          "job_ad_details": train_df.iloc[i].job_ad_details,
          "classification_name": train_df.iloc[i].classification_name,
          "subclassification_name": train_df.iloc[i].subclassification_name
        }
        desc_str = str(desc)

        # add the description
        messages.append({
          "role": "user",
          "content": desc_str
        })

        label = train_df.iloc[i].y_true
        label_str = str(label)

        # add the output
        messages.append({
            "role": "assistant",
            "content": label_str
        })

    return messages

In [7]:
#== step 2: setting sub-task --> ask for seniority patterns
def step_2(messages):
    messages.append({
        "role": "user",
        "content": "As a seniority label predictor, what are some common phrases or patterns that indicate a seniority label in a job description?"
    })

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    messages.append({"role": "assistant", "content": response.choices[0].message.content})

    return response.choices[0].message.content, messages

### IC

In [None]:
#== step 3: presence of seniority
def step_3(messages_static, desc_str):
    messages = copy.deepcopy(messages_static)

    messages.append({
        "role": "user",
        "content": f"{desc_str} does this job description include any seniority-related information? Just respond with 'Yes' or 'No'."
    })

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    messages.append({"role": "assistant", "content": response.choices[0].message.content})

    p3_content = response.choices[0].message.content

    # clean and check the response
    response_p3 = p3_content.translate(str.maketrans('', '', string.punctuation))
    response_p3 = response_p3.strip().lower()[:3]
    #print(f"step3 messages: {len(messages)}")
    return response_p3, messages

In [9]:
#== step 4: iterative coaching/finding clues to prevent hallucination
def step_4(response_p3, messages):
    messages.append({
            "role": "user",
            "content": "Extract the seniority-related phrases from the text verbatim. Respond in JSON: {\"Clue\": \"\"}."
        })

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    messages.append({"role": "assistant", "content": response.choices[0].message.content})
    #print(f"step4 messages: {len(messages)}")

    return response.choices[0].message.content, messages

In [None]:
#== step 5: use the clue to generate the final output
def step_5(messages):
    messages.append({
        "role": "user",
        "content": (
            "Based on the extracted seniority clue, return a structured seniority label in the format {\"seniority_label\": \"seniority label\"}. "
                        
        )
    })

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    answer_str = response.choices[0].message.content

    print(messages)

    return answer_str

In [11]:
def RBIC_static_messages(verbose=False, add_fewshot=True):
    response, messages = step_1()
    if verbose: print(f"RB step 1: {response}\n")

    response, messages = step_2(messages)
    if verbose: print(f"RB step 2: {response}\n")

    if add_fewshot:
        messages = fewshot(messages)
        if verbose: print(f"Fewshot examples added\n")
    return messages

In [32]:
def RBIC(messages, desc_str, verbose=False):
    response_p3, messages_local = step_3(messages, desc_str)
    if verbose: print(f"IC step 1: {response_p3}\n")

    response, messages_local = step_4(response_p3, messages_local)
    if verbose: print(f"IC step 2: {response}\n")

    answer_str = step_5(messages_local)

    #print("data:",answer_str,"&&&")

    data = json.loads(answer_str)
    

    # extract the value
    label = data["seniority_label"]
    
    if verbose: print(f"IC step 3 (Final): {label}\n")
    if verbose: print(f"IC step 3 (Final Raw): {answer_str}\n")

    return str(label)

# Testing

### Qualitative Tests

In [13]:
messages_static = RBIC_static_messages(verbose=True, add_fewshot=True)

RB step 1: Seniority in the context of job roles generally refers to the level of experience, responsibility, and authority an individual has within an organization. Seniority labels in job descriptions typically indicate the level of experience and expertise required for a particular position. These labels can range from entry-level for beginners to executive for top-level management positions. Other common seniority labels include junior, intermediate, senior, lead, director, manager, specialist, and coordinator, among others. The specific seniority label used in a job description can give candidates a clear indication of the expectations and requirements for the role.

RB step 2: 1. "Senior" or "senior-level": Indicates a higher level of experience and responsibility.
2. "Lead" or "lead role": Implies leading a team or project.
3. "Manager" or "management": Suggests a supervisory or leadership role.
4. "Director" or "director-level": Indicates a high-ranking position with strategic 

In [None]:
ind = 395

#[2081,1,5,12,3]

desc = {
    "job_title": test_df.iloc[ind].job_title,
    "job_summary": test_df.iloc[ind].job_summary,
    "job_ad_details": test_df.iloc[ind].job_ad_details,
    "classification_name": test_df.iloc[ind].classification_name,
    "subclassification_name": test_df.iloc[ind].subclassification_name,
  }
desc_str = str(desc)

label = test_df.iloc[ind].y_true
label_str = str(label)

print(f"len of messages_static {len(messages_static)}")
label_pred = RBIC(messages_static, desc_str, verbose=True)


print(f"pred = {label_pred}")
print(f"truth = {label_str}")

In [61]:
test_df.iloc[ind].job_ad_details

'Company description:We’re an energy delivery service that owns and operates Victoria’s largest network of electricity and gas infrastructure. We move energy through these networks from where it’s made to where it’s used in more than one million homes and businesses. What we do matters to our nation. At AusNet Services, you’re part of a diverse, inclusive and collaborative team that’s dedicated to bringing safe, efficient and reliable energy to millions of Australians, and you can be certain that safety comes first in everything we do.Job description:What you will be delivering: This great opportunity is solution leadership role within the Enterprise Information Management (IM) CoE and has responsibility for ensuring a robust and fit for purpose IM solution architecture that meets current and future needs of our business.You will work closely with the business and technology stakeholders, ensure best practice and robust solution design and governance for the IM platform. Your hands-on 

### Quantitative Tests

In [None]:
# df to store model predictions
test_pred_df = pd.DataFrame(columns=["y_pred"])
test_pred_df



Unnamed: 0,y_pred


In [55]:
test_pred_df2 = pd.DataFrame(columns=["y_pred"])
test_pred_df2


Unnamed: 0,y_pred


In [56]:
for i in tqdm(range(548,len(test_df))):
    if (i==395):
        continue
    desc = {
        "job_title": test_df.iloc[i].job_title,
        "job_summary": test_df.iloc[i].job_summary,
        "job_ad_details": test_df.iloc[i].job_ad_details,
        "classification_name": test_df.iloc[i].classification_name,
        "subclassification_name": test_df.iloc[i].subclassification_name
    }
    desc_str = str(desc)

    label = test_df.iloc[i].y_true
    label_str = str(label)

    label_pred = RBIC(messages_static, desc_str)

    test_pred_df2.loc[len(test_pred_df2)] = label_pred



100%|██████████| 141/141 [05:08<00:00,  2.19s/it]


In [57]:
test_pred_df2.to_csv('seniority_labelled_test_set_gpt_3_5_preds_few_shot4.csv', index=False)

Testing 

In [5]:
import pandas as pd


df_true = pd.read_csv("seniority_labelled_test_set_ans.csv")     
df_pred = pd.read_csv("seniority_labelled_test_set_gpt_3_5_preds_few_shot.csv")    



df = pd.concat([df_true.reset_index(drop=True), 
                df_pred.reset_index(drop=True)], 
               axis=1)


matches   = (df["y_true"] == df["y_pred"]).sum()
total     = len(df)
mismatches = total - matches

print(f"Exact matches:   {matches} / {total}")
print(f"Non-matches:     {mismatches} / {total}")


mismatch_df = df[df["y_true"] != df["y_pred"]]


for idx, row in mismatch_df.iterrows():
    print(f"Row {idx}: true={row['y_true']}  pred={row['y_pred']}")

Exact matches:   329 / 689
Non-matches:     360 / 689
Row 0: true=senior  pred=experienced
Row 1: true=experienced  pred=senior
Row 2: true=entry-level  pred=assistant
Row 4: true=intermediate  pred=entry-level
Row 8: true=entry-level  pred=entry-level to intermediate
Row 11: true=experienced  pred=mid-level
Row 12: true=experienced  pred=standard
Row 13: true=senior  pred=entry-level
Row 15: true=experienced  pred=manager
Row 16: true=experienced  pred=intermediate
Row 17: true=intermediate  pred=entry-level
Row 22: true=experienced  pred=senior
Row 26: true=graduate  pred=graduate/junior
Row 30: true=head  pred=executive
Row 33: true=lead  pred=head
Row 34: true=experienced  pred=qualified
Row 35: true=entry-level  pred=independent
Row 39: true=intermediate  pred=head
Row 40: true=experienced  pred=intermediate
Row 41: true=entry-level  pred=entry-level to intermediate
Row 43: true=lead  pred=junior
Row 47: true=assistant  pred=junior
Row 49: true=intermediate  pred=student
Row 50: t

In [3]:
# Save only the rows where y_true != y_pred
mismatch_df.to_csv("seniority_mismatches_gpt.csv", columns=["y_true", "y_pred"], index=False)

In [8]:
df = pd.read_csv("seniority_mismatches_gpt.csv", header=None, names=['y_true','y_pred'])


def map_custom(label):
    lab = str(label).lower()
    if any(x in lab for x in ['entry', 'graduate', 'trainee', 'cadet', 'apprentice']):
        return 'Entry'
    if any(x in lab for x in ['assistant', 'junior', 'student']):
        return 'Early Career'
    if any(x in lab for x in ['intermediate', 'experienced', 'mid']):
        return 'Mid Career'
    if any(x in lab for x in ['senior', 'lead', 'principal', 'specialist']):
        return 'Senior'
    if any(x in lab for x in ['manager', 'director', 'executive', 'chief', 'head', 'owner-operator', 'contract', 'contractor', 'coordinator', 'business leader', 'team leader', 'board']):
        return 'Management / Exec'
    return 'Other'

# Apply mapping and compute matches
df['true_cat'] = df['y_true'].apply(map_custom)
df['pred_cat'] = df['y_pred'].apply(map_custom)
df['match'] = df['true_cat'] == df['pred_cat']

matches = df['match'].sum()
total = len(df)
non_matches = total - matches
overall_pct = matches / total * 100

print("Matches based on these new categories:",overall_pct)

Matches based on these new categories: 29.362880886426595
