# Loading

In [None]:
! pip install --upgrade transformers==4.50.1

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import pipeline
import torch
import string
import re
from bs4 import BeautifulSoup
import json
import copy

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [3]:
pipe = pipeline(
    "text-generation", model="google/gemma-3-1b-it", device="cuda", torch_dtype=torch.bfloat16,
    token='' # put your HF access token here
)

Device set to use cuda


# Work Arrangement

In [4]:
train_df = pd.read_csv('data/work_arrangements_development_set.csv')
test_df = pd.read_csv('data/work_arrangements_test_set.csv')

In [5]:
train_df.iloc[48]

id                                                 80058389
job_ad    Job title: Tax Agent Training and Mentor Manag...
y_true                                               Hybrid
Name: 48, dtype: object

# RBIC Functions

### RB


In [6]:
#== step 1: role explanation
def step_1():
    messages = [
        {"role": "system", "content": "You are an expert job-ad annotator. Your job is to read a job description and classify its work arrangement into exactly one of: Remote, Hybrid, or OnSite."},
        {"role": "user", "content": "Based on your role, can you briefly explain what work arrangement means, and what work arrangement labels look like?"}
    ]

    response = pipe(messages, max_new_tokens=1000)
    messages.append(response[0]["generated_text"][-1])

    return response[0]["generated_text"][-1]["content"], messages

In [7]:
#== few-shot examples
def fewshot(messages):
    # messages.append({
    #     "role": "user",
    #     "content": "I will provide you with some examples on how to accomplish your task"
    # })

    # response = client.chat.completions.create(
    #     model=model,
    #     messages=messages
    # )
    # messages.append({"role": "assistant", "content": response.choices[0].message.content})

    few_shot_indices = [20,23,36,48]

    for i in few_shot_indices:
        desc = {
          "job_ad": train_df.iloc[i].job_ad
        }
        desc_str = str(desc)

        # add the description
        messages.append({
          "role": "user",
          "content": desc_str
        })

        label = train_df.iloc[i].y_true
        label_str = str(label)

        # add the output
        messages.append({
            "role": "assistant",
            "content": label_str
        })

    return messages

In [8]:
#== step 2: setting sub-task --> ask for seniority patterns
def step_2(messages):
    messages.append({
        "role": "user",
        "content": "As a work arrangements classifier, what are some common phrases or patterns that indicate a work arrangements in a job description?"
    })

    response = pipe(messages, max_new_tokens=1000)
    messages.append(response[0]["generated_text"][-1])

    return response[0]["generated_text"][-1]["content"], messages

### IC

In [9]:
#== step 3: presence of seniority (skipped)
def step_3(messages_static, desc_str):
    messages = copy.deepcopy(messages_static)

    messages.append({
        "role": "user",
        "content": f"{desc_str} does this job description include any work-arrangements related information? Just respond with 'Yes' or 'No'."
    })

    response = pipe(messages, max_new_tokens=1000)
    messages.append(response[0]["generated_text"][-1])

    p3_content = response[0]["generated_text"][-1]["content"]

    # clean and check the response
    response_p3 = p3_content.translate(str.maketrans('', '', string.punctuation))
    response_p3 = response_p3.strip().lower()[:3]
    #print(f"step3 messages: {len(messages)}")
    return response_p3, messages

In [10]:
#== step 4: iterative coaching/finding clues to prevent hallucination
def step_4(response_p3, messages):
    messages.append({
        "role": "user",
        "content": "Extract the work arrangements related phrases from the text verbatim. Respond in JSON: {\"Clue\": \"\"}."
    })

    response = pipe(messages, max_new_tokens=1000)
    messages.append(response[0]["generated_text"][-1])
    #print(f"step4 messages: {len(messages)}")

    return response[0]["generated_text"][-1]["content"], messages

In [11]:
#== step 5: use the clue to generate the final output
def step_5(messages):
    messages.append({
        "role": "user",
        "content": (
            "Based on the extracted work-arrangement clue and the text verbatim, return a structured work arrangements classification in the format {\"work_arrangements_label\": \"work arrangements classification\"}."
            #" Return only the json object with no extra details."          
        )
    })

    response = pipe(messages, max_new_tokens=1000)
    answer_str = response[0]["generated_text"][-1]["content"]
    
    # format and print the output
    try:
        answer_str_ = answer_str[answer_str.find('{'):answer_str.find('}') + 1]
        answer_str_ = answer_str_.replace('“', '"')
        answer_str_ = answer_str_.replace('”', '"')
        answer = json.loads(answer_str_)
        
        if 'work_arrangements_label' in answer:
            label = answer['work_arrangements_label']
        else:
            print(f"Failed to parse model output: {answer_str}")
            label = "ERROR " + answer_str
    except ...:
        print(f"Failed to parse model output: {answer_str}")
        label = "ERROR " + answer_str

    return label, answer_str

In [12]:
def RBIC_static_messages(verbose=False, add_fewshot=True):
    response, messages = step_1()
    if verbose: print(f"RB step 1: {response}\n")

    response, messages = step_2(messages)
    if verbose: print(f"RB step 2: {response}\n")

    if add_fewshot:
        messages = fewshot(messages)
        if verbose: print(f"Fewshot examples added\n")
    return messages

In [13]:
def RBIC(messages, desc_str, verbose=False):
    response_p3, messages_local = step_3(messages, desc_str)
    if verbose: print(f"IC step 1: {response_p3}\n")

    response, messages_local = step_4(response_p3, messages_local)
    if verbose: print(f"IC step 2: {response}\n")

    label, answer_str = step_5(messages_local)
    if verbose: print(f"IC step 3 (Final): {label}\n")
    if verbose: print(f"IC step 3 (Final Raw): {answer_str}\n")

    return str(label)

# Testing

### Qualitative Tests

In [14]:
messages_static = RBIC_static_messages(verbose=True, add_fewshot=True)

RB step 1: Okay, I’m ready! Please provide the job description. I'll do my best to accurately classify the work arrangement based on my understanding and will then briefly explain what “work arrangement” means and the labels I’ll be using. 

Let’s get started! 😊


RB step 2: Okay, excellent question! As a work arrangements classifier, I’ve observed a lot of patterns in job descriptions to identify the typical arrangements. Here’s a breakdown of common phrases and patterns, categorized for clarity:

**1. Remote-Focused Keywords & Phrases:**

* **“Fully Remote”:** This is a very strong indicator.
* **“Remote-First”:** Suggests the company prioritizes remote work.
* **“Work from Home” (WFH):** Explicitly states the work location is remote.
* **“Remote-Supported”:** Indicates the role can be performed remotely with occasional in-office support.
* **“Location: [State/Country]”:**  Often included to specify where the work is conducted.
* **“Flexible Schedule”:** Frequently paired with remote

In [15]:
ind = 2

desc = {
    "job_ad": test_df.iloc[ind].job_ad
  }
desc_str = str(desc)

label = test_df.iloc[ind].y_true
label_str = str(label)

print(f"len of messages_static {len(messages_static)}")
label_pred = RBIC(messages_static, desc_str, verbose=True)


print(f"pred = {label_pred}")
print(f"truth = {label_str}")

len of messages_static 13
IC step 1: yes

IC step 2: ```json
{"Clue": "Remote"}
```


IC step 3 (Final): Remote

IC step 3 (Final Raw): ```json
{"work_arrangements_label": "Remote"}
```



pred = Remote
truth = Remote


### Quantitative Tests

In [16]:
# df to store model predictions
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [17]:
for i in tqdm(range(len(test_df))):
    desc = {
        "job_ad": test_df.iloc[i].job_ad
    }
    desc_str = str(desc)

    label = test_df.iloc[i].y_true
    label_str = str(label)

    label_pred = RBIC(messages_static, desc_str)

    test_pred_df.loc[len(test_pred_df)] = label_pred

# export the dataframe to a new csv file
test_pred_df.to_csv('work_arrangements_test_set_gemma3_rbic_fewshot_preds.csv', index=False)

  1%|          | 1/99 [00:01<03:15,  1.99s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 99/99 [04:11<00:00,  2.54s/it]


# No few-shot

In [18]:
messages_static = RBIC_static_messages(verbose=True, add_fewshot=False)

RB step 1: Okay, I understand. I’m ready to be your expert job-ad annotator! 

**Here’s a breakdown of what work arrangement means and the labels we use:**

**What is Work Arrangement?**

Work arrangement refers to *how* the work is performed. It’s about the balance between in-office presence, remote work, and potentially a combination of both. It’s a critical factor in determining the employee experience, team dynamics, and overall operational strategy.

**Labels We Use:**

*   **Remote:**  This designates work performed primarily outside of a traditional office setting. It typically involves working from home, co-working spaces, or other locations with minimal or no in-person presence.

*   **Hybrid:** This indicates a mix of in-office and remote work. Employees typically split their time between working from home and spending time in the office, often with specific schedules or requirements.

*   **OnSite:** This signifies that the work is primarily conducted within a physical offic

In [19]:
# df to store model predictions
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [20]:
for i in tqdm(range(len(test_df))):
    desc = {
        "job_ad": test_df.iloc[i].job_ad
    }
    desc_str = str(desc)

    label = test_df.iloc[i].y_true
    label_str = str(label)

    label_pred = RBIC(messages_static, desc_str)

    test_pred_df.loc[len(test_pred_df)] = label_pred

# export the dataframe to a new csv file
test_pred_df.to_csv('work_arrangements_test_set_gemma3_rbic_preds.csv', index=False)

100%|██████████| 99/99 [11:09<00:00,  6.76s/it]


# Regular prompting

In [21]:
# df to store model predictions
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [22]:
messages_static = [
    {"role": "system", "content": "You are an expert job-ad annotator. Your job is to read a job description and classify its work arrangement into exactly one of: Remote, Hybrid, or OnSite."},
]

In [23]:
for i in tqdm(range(len(test_df))):
    desc = {
        "job_ad": test_df.iloc[i].job_ad
    }
    desc_str = str(desc)

    messages = copy.deepcopy(messages_static)
    messages.append({
        "role": "user",
        "content": (
            f"{desc_str} Extract work-arrangements information from the job ad. "
            "Return a structured work-arrangements label in the format {\"work_arrangements_label\": \"work arrangements classification\"}."
            #" Return only the JSON object with no extra details."
        )
    })
    
    response = pipe(messages, max_new_tokens=1000)
    answer_str = response[0]["generated_text"][-1]["content"]

    # format the output
    try:
        answer_str_ = answer_str[answer_str.find('{'):answer_str.find('}') + 1]
        answer_str_ = answer_str_.replace('“', '"')
        answer_str_ = answer_str_.replace('”', '"')
        answer = json.loads(answer_str_)
        
        if 'work_arrangements_label' in answer:
            label = answer['work_arrangements_label']
        else:
            print(f"Failed to parse model output: {answer_str}")
            label = "ERROR " + answer_str
    except json.JSONDecodeError:
        print(f"Failed to parse model output as JSON: {answer_str}")
        label = "ERROR " + answer_str

    test_pred_df.loc[len(test_pred_df)] = label

# export the dataframe to a new csv file
test_pred_df.to_csv('work_arrangements_test_set_gemma3_preds.csv', index=False)

100%|██████████| 99/99 [01:10<00:00,  1.40it/s]


# Metrics

In [1]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
def get_metrics(path_to_preds):
    preds = pd.read_csv(path_to_preds)
    test_df = pd.read_csv('data/work_arrangements_test_set.csv')

    test_df['y_pred'] = preds.y_pred.values

    precision, recall, f1_score, _ = precision_recall_fscore_support(
        test_df.y_true.values, test_df.y_pred.values, labels=['Remote', 'Hybrid', 'OnSite']
    )
    precision_macro, recall_macro, f1_score_macro, _ = precision_recall_fscore_support(
        test_df.y_true.values, test_df.y_pred.values, average='macro'
    )
    precision_micro, recall_micro, f1_score_micro, _ = precision_recall_fscore_support(
        test_df.y_true.values, test_df.y_pred.values, average='micro'
    )
    precision_weighted, recall_weighted, f1_score_weighted, _ = precision_recall_fscore_support(
        test_df.y_true.values, test_df.y_pred.values, average='weighted'
    )
    acc = accuracy_score(test_df.y_true.values, test_df.y_pred.values)
    
    print(f'Accuracy: {round(acc * 100, 1)}')
    res = pd.DataFrame(
        {
                'Precision': [*[round(p * 100, 1) for p in precision], "", round(precision_macro * 100, 1), round(precision_weighted * 100, 1), round(precision_micro * 100, 1)],
                'Recall': [*[round(r * 100, 1) for r in recall], "", round(recall_macro * 100, 1), round(recall_weighted * 100, 1), round(recall_micro * 100, 1)],
                'F1-score': [*[round(f1 * 100, 1) for f1 in f1_score], "", round(f1_score_macro * 100, 1), round(f1_score_weighted * 100, 1), round(f1_score_micro * 100, 1)],
        },
        index=['Remote', 'Hybrid', 'OnSite', "", 'macro average', 'weighted average', 'micro average']
    )
    return res

In [3]:
get_metrics('work_arrangements_test_set_gemma3_rbic_fewshot_preds.csv')

Accuracy: 27.3


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Precision,Recall,F1-score
Remote,28.0,100.0,43.7
Hybrid,20.0,3.7,6.2
OnSite,0.0,0.0,0.0
,,,
macro average,12.0,25.9,12.5
weighted average,12.8,27.3,13.2
micro average,27.3,27.3,27.3


In [4]:
get_metrics('work_arrangements_test_set_gemma3_rbic_preds.csv')

Accuracy: 30.3


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Precision,Recall,F1-score
Remote,30.4,92.3,45.7
Hybrid,35.7,18.5,24.4
OnSite,100.0,2.2,4.3
,,,
macro average,20.8,14.1,9.3
weighted average,64.2,30.3,20.6
micro average,30.3,30.3,30.3


In [5]:
get_metrics('work_arrangements_test_set_gemma3_preds.csv')

Accuracy: 49.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Precision,Recall,F1-score
Remote,65.7,88.5,75.4
Hybrid,39.3,81.5,53.0
OnSite,80.0,8.7,15.7
,,,
macro average,37.0,35.7,28.8
weighted average,65.1,49.5,41.6
micro average,49.5,49.5,49.5
