# Loading

In [1]:
import copy
import json
import os
import string

import openai
import pandas as pd
from openai import OpenAI
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm

In [2]:
# The API key is read from the DEEPSEEK_API_KEY environment variable so the
# secret never has to be typed into (or committed with) the notebook. When the
# variable is unset the key falls back to the empty string used previously.
client = OpenAI(api_key=os.environ.get("DEEPSEEK_API_KEY", ""), base_url="https://api.deepseek.com")
model="deepseek-chat" # DeepSeek-V3

# Work Arrangement

In [3]:
# train_df (development split) feeds the few-shot examples; test_df is the split
# every evaluation loop below iterates over.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/work_arrangements_development_set.csv',
        'data/work_arrangements_test_set.csv',
    )
)

In [4]:
# Peek at development-set row 48, one of the rows fewshot() uses as a worked example.
train_df.iloc[48]

id                                                 80058389
job_ad    Job title: Tax Agent Training and Mentor Manag...
y_true                                               Hybrid
Name: 48, dtype: object

# RBIC Functions

### RB


In [5]:
#== step 1: role explanation
def step_1():
    """Start the conversation: assign the annotator role and ask the model to explain the task.

    Uses the module-level ``client`` and ``model``.

    Returns:
        tuple: ``(explanation, messages)`` where ``explanation`` is the text of
        the assistant's reply and ``messages`` is the system/user/assistant
        history built here.
    """
    system_turn = {"role": "system", "content": "You are an expert job-ad annotator. Your job is to read a job description and classify its work arrangement into exactly one of: Remote, Hybrid, or OnSite."}
    question_turn = {"role": "user", "content": "Based on your role, can you briefly explain what work arrangement means, and what work arrangement labels look like?"}
    messages = [system_turn, question_turn]

    completion = client.chat.completions.create(model=model, messages=messages)
    explanation = completion.choices[0].message.content

    # Keep the model's own explanation in the history so later turns build on it.
    messages.append({"role": "assistant", "content": explanation})
    return explanation, messages

In [6]:
#== few-shot examples
def fewshot(messages, few_shot_indices=(20, 23, 36, 48)):
    """Append labelled development-set ads to the conversation as example turns.

    For every index, a user turn holding ``str({"job_ad": <ad text>})`` and an
    assistant turn holding the row's ``y_true`` label are appended, in that order.

    Args:
        messages: Conversation history (list of role/content dicts). It is
            extended in place.
        few_shot_indices: Positional row indices into the module-level
            ``train_df`` to use as examples. Defaults to the four rows that
            were previously hard-coded.

    Returns:
        The same ``messages`` list, now including the example turns.
    """
    for i in few_shot_indices:
        example = train_df.iloc[i]

        # the job ad, wrapped in a dict and stringified
        messages.append({
            "role": "user",
            "content": str({"job_ad": example.job_ad})
        })

        # the expected answer for that ad
        messages.append({
            "role": "assistant",
            "content": str(example.y_true)
        })

    return messages

In [7]:
#== step 2: setting sub-task --> ask for work-arrangement patterns
def step_2(messages):
    """Ask the model which phrases or patterns signal a work arrangement.

    Args:
        messages: Conversation history so far. The question and the model's
            answer are appended to it in place.

    Returns:
        tuple: ``(answer, messages)`` where ``answer`` is the assistant's reply
        text and ``messages`` is the same, extended history.
    """
    question = {
        "role": "user",
        "content": "As a work arrangements classifier, what are some common phrases or patterns that indicate a work arrangements in a job description?"
    }
    messages.append(question)

    completion = client.chat.completions.create(model=model, messages=messages)
    answer = completion.choices[0].message.content

    messages.append({"role": "assistant", "content": answer})
    return answer, messages

### IC

In [8]:
#== step 3: presence of work-arrangement information (the answer is reported but step 4 does not act on it)
def step_3(messages_static, desc_str):
    """Ask whether the job ad contains any work-arrangement information.

    The shared conversation is deep-copied first, so ``messages_static`` is
    never modified and can be reused for every ad.

    Args:
        messages_static: Shared conversation history to start from.
        desc_str: The job ad, already rendered as a string.

    Returns:
        tuple: ``(verdict, messages)`` where ``verdict`` is the model's reply
        with punctuation removed, lower-cased and cut to at most three
        characters with surrounding whitespace stripped (so ``"Yes."`` gives
        ``"yes"`` and ``"No, ..."`` gives ``"no"``), and ``messages`` is the
        per-ad copy of the history including this question and the raw reply.
    """
    messages = copy.deepcopy(messages_static)

    messages.append({
        "role": "user",
        "content": f"{desc_str} does this job description include any work-arrangements related information? Just respond with 'Yes' or 'No'."
    })

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    p3_content = response.choices[0].message.content
    messages.append({"role": "assistant", "content": p3_content})

    # clean and check the response
    response_p3 = p3_content.translate(str.maketrans('', '', string.punctuation))
    # Strip again after slicing: "no it ..."[:3] is "no " and would otherwise
    # keep a trailing space.
    response_p3 = response_p3.strip().lower()[:3].strip()
    return response_p3, messages

In [9]:
#== step 4: iterative coaching/finding clues to prevent hallucination
def step_4(response_p3, messages):
    """Ask the model to quote the work-arrangement phrases from the ad verbatim.

    Args:
        response_p3: The step 3 verdict. It is accepted but not used here.
        messages: Per-ad conversation history. The request and the model's
            reply are appended to it in place.

    Returns:
        tuple: ``(clue_reply, messages)`` where ``clue_reply`` is the raw text
        of the assistant's reply and ``messages`` is the same, extended history.
    """
    request = {
        "role": "user",
        "content": "Extract the work arrangements related phrases from the text verbatim. Respond in JSON: {\"Clue\": \"\"}."
    }
    messages.append(request)

    completion = client.chat.completions.create(model=model, messages=messages)
    clue_reply = completion.choices[0].message.content

    messages.append({"role": "assistant", "content": clue_reply})
    return clue_reply, messages

In [10]:
#== step 5: use the clue to generate the final output
def step_5(messages):
    """Request the final label as JSON and parse it out of the model's reply.

    The reply is cut down to the text between the first ``{`` and the first
    ``}``, curly quotes are replaced by straight ones, and the result is parsed
    as JSON.

    Args:
        messages: Per-ad conversation history. The final request is appended
            to it in place (the reply is not).

    Returns:
        tuple: ``(label, answer_str)`` where ``label`` is the value stored under
        ``"work_arrangements_label"``, or ``"ERROR "`` followed by the raw reply
        when the reply cannot be parsed or lacks that key, and ``answer_str`` is
        the raw reply text.
    """
    messages.append({
        "role": "user",
        "content": (
            "Based on the extracted work-arrangement clue and the text verbatim, return a structured work arrangements classification in the format {\"work_arrangements_label\": \"work arrangements classification\"}."
            " Return only the json object with no extra details."
        )
    })

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    answer_str = response.choices[0].message.content

    # format and print the output
    try:
        answer_str_ = answer_str[answer_str.find('{'):answer_str.find('}') + 1]
        answer_str_ = answer_str_.replace('“', '"')
        answer_str_ = answer_str_.replace('”', '"')
        answer = json.loads(answer_str_)

        label = answer['work_arrangements_label']
    except (json.JSONDecodeError, KeyError):
        # `except ...:` named Ellipsis, which is not an exception class, so a
        # parse failure raised TypeError instead of reaching this fallback.
        print(f"Failed to parse model output: {answer_str}")
        label = "ERROR " + answer_str

    return label, answer_str

In [11]:
def RBIC_static_messages(verbose=False, add_fewshot=True):
    """Build the conversation prefix that is shared by every job ad.

    Runs step_1 (role explanation) and step_2 (pattern elicitation), then
    optionally appends the few-shot examples.

    Args:
        verbose: When true, print each step's reply as it arrives.
        add_fewshot: When true, extend the history with ``fewshot``.

    Returns:
        The resulting message history.
    """
    role_reply, history = step_1()
    if verbose:
        print(f"RB step 1: {role_reply}\n")

    pattern_reply, history = step_2(history)
    if verbose:
        print(f"RB step 2: {pattern_reply}\n")

    if not add_fewshot:
        return history

    history = fewshot(history)
    if verbose:
        print("Fewshot examples added\n")
    return history

In [12]:
def RBIC(messages, desc_str, verbose=False):
    """Classify one job ad by running steps 3, 4 and 5 in sequence.

    Args:
        messages: Shared conversation prefix; it is handed to step_3.
        desc_str: The job ad, already rendered as a string.
        verbose: When true, print each step's result.

    Returns:
        str: The label returned by step_5, converted with ``str``.
    """
    presence, conversation = step_3(messages, desc_str)
    if verbose:
        print(f"IC step 1: {presence}\n")

    clue_reply, conversation = step_4(presence, conversation)
    if verbose:
        print(f"IC step 2: {clue_reply}\n")

    label, answer_str = step_5(conversation)
    if verbose:
        print(f"IC step 3 (Final): {label}\n")
        print(f"IC step 3 (Final Raw): {answer_str}\n")

    return str(label)

# Testing

### Qualitative Tests

In [13]:
# Build the shared conversation prefix once (role explanation, pattern question,
# few-shot examples); step_3 deep-copies it for each ad, so it is reused unchanged.
messages_static = RBIC_static_messages(verbose=True, add_fewshot=True)

RB step 1: Certainly!  

### **Work Arrangement Meaning**  
A **work arrangement** describes where an employee is expected to perform their job duties. It falls into one of three main categories:  
1. **Remote** – The job can be done entirely from anywhere, with no requirement to work from an office.  
2. **Hybrid** – The job requires a mix of remote and in-office work (e.g., 2 days in-office, 3 days remote).  
3. **OnSite** – The job requires working full-time at a physical location (e.g., office, factory, store).  

### **What Work Arrangement Labels Look Like**  
Job postings may explicitly state the arrangement (e.g., "fully remote," "hybrid role," "on-site position"). Sometimes, it's implied through phrases like:  
- **Remote**: "Work from anywhere," "100% remote," "no location restrictions."  
- **Hybrid**: "Flexible work model," "partially remote," "must be in office X days a week."  
- **OnSite**: "Must work at [location]," "not a remote role," "in-office presence required."  


In [14]:
# Spot-check a single test ad end to end, printing every intermediate step.
ind = 2

desc = {"job_ad": test_df.iloc[ind].job_ad}
desc_str = str(desc)

label = test_df.iloc[ind].y_true
label_str = str(label)

print(f"len of messages_static {len(messages_static)}")
label_pred = RBIC(messages_static, desc_str, verbose=True)

# Prediction and ground truth side by side.
for line in (f"pred = {label_pred}", f"truth = {label_str}"):
    print(line)

len of messages_static 13
IC step 1: yes

IC step 2: ```json
{
  "Clue": "Permanent work from home, Complete WFH set-up, Flexible working environment"
}
```

IC step 3 (Final): Remote

IC step 3 (Final Raw): ```json
{
  "work_arrangements_label": "Remote"
}
```

pred = Remote
truth = Remote


### Quantitative Tests

In [15]:
# Fresh one-column frame ("y_pred") that will hold the predicted label for each test ad
# in the RBIC + few-shot run.
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [16]:
# Classify every test ad with the RBIC pipeline (few-shot prefix in messages_static).
# Predictions are collected in a fresh list and the frame is built once at the end,
# so re-running this cell on its own cannot append a second batch of rows to an
# already-filled test_pred_df.
predictions = []
for i in tqdm(range(len(test_df))):
    desc = {
        "job_ad": test_df.iloc[i].job_ad
    }
    desc_str = str(desc)

    predictions.append(RBIC(messages_static, desc_str))

test_pred_df = pd.DataFrame({"y_pred": predictions})

# export the dataframe to a new csv file
test_pred_df.to_csv('work_arrangements_test_set_deepseek_rbic_fewshot_preds.csv', index=False)

100%|███████████████████████████████████████████| 99/99 [36:27<00:00, 22.10s/it]


# No few-shot

In [17]:
# Rebuild the shared conversation prefix without the few-shot examples
# (role explanation and pattern question only). This replaces the earlier messages_static.
messages_static = RBIC_static_messages(verbose=True, add_fewshot=False)

RB step 1: Certainly!  

**Work Arrangement** refers to how and where an employee is expected to perform their job duties. The three main classifications are:  

1. **Remote** – The job can be performed entirely from a location outside a traditional office (e.g., home or another remote workspace). No on-site presence is required.  
2. **Hybrid** – The job requires a mix of remote and on-site work (e.g., a few days in the office per week/month).  
3. **OnSite** – The job requires the employee to work primarily or exclusively at a physical workplace (e.g., office, factory, or client location).  

### **What Work Arrangement Labels Look Like**  
Job descriptions may explicitly state the arrangement (e.g., "This is a fully remote position") or imply it through phrases like:  
- **Remote**: "Work from anywhere," "100% remote," "No office requirement."  
- **Hybrid**: "Flexible work model," "Partially remote," "3 days in-office, 2 remote."  
- **OnSite**: "Must work in [location]," "Office-b

In [18]:
# Fresh one-column frame ("y_pred") that will hold the predicted label for each test ad
# in the RBIC run without few-shot examples.
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [19]:
# Classify every test ad with the RBIC pipeline (messages_static without few-shot turns).
# Predictions are collected in a fresh list and the frame is built once at the end,
# so re-running this cell on its own cannot append a second batch of rows to an
# already-filled test_pred_df.
predictions = []
for i in tqdm(range(len(test_df))):
    desc = {
        "job_ad": test_df.iloc[i].job_ad
    }
    desc_str = str(desc)

    predictions.append(RBIC(messages_static, desc_str))

test_pred_df = pd.DataFrame({"y_pred": predictions})

# export the dataframe to a new csv file
test_pred_df.to_csv('work_arrangements_test_set_deepseek_rbic_preds.csv', index=False)

100%|███████████████████████████████████████████| 99/99 [23:42<00:00, 14.37s/it]


# Regular prompting

In [20]:
# Fresh one-column frame ("y_pred") that will hold the predicted label for each test ad
# in the regular-prompting baseline.
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [21]:
# Baseline conversation prefix: only the system role prompt (the same text step_1 uses),
# with no role explanation, pattern question or few-shot turns.
messages_static = [
    {"role": "system", "content": "You are an expert job-ad annotator. Your job is to read a job description and classify its work arrangement into exactly one of: Remote, Hybrid, or OnSite."},
]

In [22]:
# Baseline: one single-turn request per test ad on top of the system prompt.
# Labels are collected in a fresh list and the frame is built once at the end,
# so re-running this cell on its own cannot append a second batch of rows to an
# already-filled test_pred_df.
predictions = []
for i in tqdm(range(len(test_df))):
    desc = {
        "job_ad": test_df.iloc[i].job_ad
    }
    desc_str = str(desc)

    # per-ad copy so the shared prefix is never modified
    messages = copy.deepcopy(messages_static)
    messages.append({
        "role": "user",
        "content": (
            f"{desc_str} Extract work-arrangements information from the job ad. "
            "Return a structured work-arrangements label in the format {\"work_arrangements_label\": \"\"}."
            " Return only the JSON object with no extra details."
        )
    })

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    answer_str = response.choices[0].message.content

    # format the output: keep the text from the first '{' to the first '}',
    # replace curly quotes with straight ones, then parse as JSON
    try:
        answer_str_ = answer_str[answer_str.find('{'):answer_str.find('}') + 1]
        answer_str_ = answer_str_.replace('“', '"')
        answer_str_ = answer_str_.replace('”', '"')
        answer = json.loads(answer_str_)

        label = answer['work_arrangements_label']
    except (json.JSONDecodeError, KeyError):
        # KeyError is caught too: valid JSON without the expected key should be
        # recorded as an error row rather than abort the whole run.
        print(f"Failed to parse model output as JSON: {answer_str}")
        label = "ERROR " + answer_str

    predictions.append(label)

test_pred_df = pd.DataFrame({"y_pred": predictions})

# export the dataframe to a new csv file
test_pred_df.to_csv('work_arrangements_test_set_deepseek_preds.csv', index=False)

100%|███████████████████████████████████████████| 99/99 [07:25<00:00,  4.50s/it]


# Metrics

In [1]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
def get_metrics(path_to_preds, path_to_truth='data/work_arrangements_test_set.csv',
                labels=('Remote', 'Hybrid', 'OnSite')):
    """Score a predictions CSV against the ground-truth CSV and tabulate the results.

    Prints the accuracy (as a percentage rounded to one decimal) and returns a
    table of precision, recall and F1, also as percentages rounded to one decimal.

    Args:
        path_to_preds: CSV file with a ``y_pred`` column, one row per test ad,
            in the same row order as the ground-truth file.
        path_to_truth: CSV file with a ``y_true`` column. Defaults to the test
            set path that was previously hard-coded.
        labels: Class labels that get their own row, in display order.
            Defaults to the three labels that were previously hard-coded.

    Returns:
        pandas.DataFrame: columns ``Precision``, ``Recall``, ``F1-score``; one
        row per entry of ``labels``, a blank separator row, then
        ``macro average``, ``weighted average`` and ``micro average``. The
        averaged rows call the scorer without ``labels``, so they are not
        restricted to the per-class list.

    Raises:
        ValueError: If the two files do not have the same number of rows.
    """
    preds = pd.read_csv(path_to_preds)
    test_df = pd.read_csv(path_to_truth)

    if len(preds) != len(test_df):
        raise ValueError(
            f"{path_to_preds} has {len(preds)} rows but {path_to_truth} has {len(test_df)}"
        )

    y_true = test_df.y_true.values
    y_pred = preds.y_pred.values
    labels = list(labels)
    averages = ('macro', 'weighted', 'micro')  # display order of the summary rows

    def _as_pct(value):
        # fraction -> percentage with one decimal
        return round(value * 100, 1)

    # (precision, recall, f1) arrays with one entry per label; support is dropped
    per_class = precision_recall_fscore_support(y_true, y_pred, labels=labels)[:3]
    # (precision, recall, f1) scalars for each averaging mode
    averaged = {}
    for avg in averages:
        averaged[avg] = precision_recall_fscore_support(y_true, y_pred, average=avg)[:3]

    acc = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {round(acc * 100, 1)}')

    table = {}
    for pos, column in enumerate(('Precision', 'Recall', 'F1-score')):
        class_part = [_as_pct(score) for score in per_class[pos]]
        summary_part = [_as_pct(averaged[avg][pos]) for avg in averages]
        table[column] = class_part + [""] + summary_part

    return pd.DataFrame(
        table,
        index=labels + ["", 'macro average', 'weighted average', 'micro average'],
    )

In [3]:
# Scores for the RBIC run with few-shot examples (file written by the first evaluation loop).
get_metrics('work_arrangements_test_set_deepseek_rbic_fewshot_preds.csv')

Accuracy: 91.9


Unnamed: 0,Precision,Recall,F1-score
Remote,92.9,100.0,96.3
Hybrid,82.8,88.9,85.7
OnSite,97.6,89.1,93.2
,,,
macro average,91.1,92.7,91.7
weighted average,92.3,91.9,92.0
micro average,91.9,91.9,91.9


In [4]:
# Scores for the RBIC run without few-shot examples.
get_metrics('work_arrangements_test_set_deepseek_rbic_preds.csv')

Accuracy: 90.9


Unnamed: 0,Precision,Recall,F1-score
Remote,100.0,96.2,98.0
Hybrid,76.5,96.3,85.2
OnSite,97.5,84.8,90.7
,,,
macro average,91.3,92.4,91.3
weighted average,92.4,90.9,91.1
micro average,90.9,90.9,90.9


In [5]:
# Scores for the regular single-prompt baseline.
get_metrics('work_arrangements_test_set_deepseek_preds.csv')

Accuracy: 96.0


Unnamed: 0,Precision,Recall,F1-score
Remote,92.9,100.0,96.3
Hybrid,96.0,88.9,92.3
OnSite,97.8,97.8,97.8
,,,
macro average,95.6,95.6,95.5
weighted average,96.0,96.0,95.9
micro average,96.0,96.0,96.0
