Function to extract work arrangement from text

In [None]:
import re
import random

def preprocess(text):
    """Normalise raw text for keyword matching.

    The text is lowercased, then every character that is not a word
    character, whitespace, or a hyphen is dropped. Hyphens survive so that
    hyphenated keywords such as "on-site" can still be matched afterwards.
    """
    lowered = text.lower()
    stripped = re.sub(r'[^\w\s-]', '', lowered)
    return stripped

def count_keywords(text, keywords):
    """Return the total number of whole-word keyword occurrences in ``text``.

    Each keyword is escaped and wrapped in word boundaries, so "remote" is
    counted in "remote role" but not inside "remotely". Hits for all keywords
    are added together.
    """
    total = 0
    for k in keywords:
        pattern = rf'\b{re.escape(k)}\b'
        total += len(re.findall(pattern, text))
    return total

def classify_work_arrangement(job_ad_text, rng=None):
    """Label a job ad as 'remote', 'hybrid' or 'OnSite' by keyword frequency.

    The ad is normalised with ``preprocess``, whole-word occurrences of each
    label's keywords are tallied with ``count_keywords``, and the label with
    the highest tally wins.

    Args:
        job_ad_text: Raw job-ad text (a ``str``).
        rng: Optional tie-breaker exposing ``choice(seq)``, for example
            ``random.Random(0)`` or the ``random`` module itself. When omitted,
            a generator seeded with the preprocessed text is used, so a given
            ad always receives the same label from one call (or run) to the
            next instead of depending on the global random state.

    Returns:
        One of ``'remote'``, ``'hybrid'`` or ``'OnSite'``. When several labels
        share the highest count (this includes ads where no keyword matches at
        all, i.e. every count is 0) one of the tied labels is picked
        pseudo-randomly via ``rng``.
    """
    # Keywords per label; dict order also fixes the order of tied candidates.
    keywords_by_label = {
        'remote': ['remote', 'work from home', 'telecommute', 'telework'],
        'hybrid': ['hybrid', 'partially remote', 'mix of remote and on-site'],
        'OnSite': ['on-site', 'on site', 'in person', 'in-person', 'office-based'],
    }

    text = preprocess(job_ad_text)

    label_counts = {
        label: count_keywords(text, keywords)
        for label, keywords in keywords_by_label.items()
    }

    max_count = max(label_counts.values())
    top_labels = [label for label, count in label_counts.items() if count == max_count]

    # A clear winner needs no randomness at all.
    if len(top_labels) == 1:
        return top_labels[0]

    # Tie: still chosen at random among the tied labels, but reproducibly.
    # Seeding from the text keeps evaluation metrics stable across re-runs.
    if rng is None:
        rng = random.Random(text)
    return rng.choice(top_labels)

Test example

In [None]:
# Sample ad: it matches "hybrid" once and "in-person" once, while "remotely"
# is not a whole-word match for the keyword "remote" -- so two labels tie.
text = (
    "We offer a hybrid work environment. "
    "You may work remotely up to 3 days per week "
    "and are expected to be in-person for meetings."
)

print(classify_work_arrangement(text))


OnSite


Import data from Google Drive

In [None]:
import os
from google.colab import drive

# Mount Google Drive and point at the folder holding the job-ad data files.
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/AB_job_data_files'

# Map every entry name found in the data folder to its full path.
file_location = {}
if not os.path.exists(folder_path):
  print(f"Folder not found: {folder_path}")
else:
  # Switch the working directory to the data folder; listdir() with no
  # argument then lists that folder's files and sub-directories.
  os.chdir(folder_path)
  for entry in os.listdir():
    file_location[entry] = os.path.join(folder_path, entry)

print(file_location)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
{'seniority_labelled_test_set.csv': '/content/drive/MyDrive/AB_job_data_files/seniority_labelled_test_set.csv', 'unlabelled_development_set.csv': '/content/drive/MyDrive/AB_job_data_files/unlabelled_development_set.csv', 'salary_labelled_development_set.csv': '/content/drive/MyDrive/AB_job_data_files/salary_labelled_development_set.csv', 'seniority_labelled_development_set.csv': '/content/drive/MyDrive/AB_job_data_files/seniority_labelled_development_set.csv', 'work_arrangements_test_set.csv': '/content/drive/MyDrive/AB_job_data_files/work_arrangements_test_set.csv', 'salary_labelled_test_set.csv': '/content/drive/MyDrive/AB_job_data_files/salary_labelled_test_set.csv', 'work_arrangements_development_set.csv': '/content/drive/MyDrive/AB_job_data_files/work_arrangements_development_set.csv', 'results': '/content/drive/MyDrive/AB_job_data_files/results', 'wandb

Testing on Development Set for work arrangements

In [None]:
# DEV set: score the keyword classifier against the labelled development ads.
import pandas as pd
work_arrangement_dev = pd.read_csv(file_location['work_arrangements_development_set.csv'])

# Count ads whose predicted label equals the true label after lowercasing and
# trimming both sides. Zipping the two columns avoids building a Series per
# row, which is what iterrows() does.
results = sum(
    classify_work_arrangement(job_ad).lower().strip() == y_true.lower().strip()
    for job_ad, y_true in zip(work_arrangement_dev['job_ad'], work_arrangement_dev['y_true'])
)
count = len(work_arrangement_dev)

print("count:", count)
print("result:", results)
# Report nan rather than raising ZeroDivisionError when the CSV has no rows.
print("correctly indentified:", results / count if count else float('nan'))


count: 99
result: 54
correctly indentified: 0.5454545454545454


Testing on Test set for work arrangements

In [None]:
# TEST set: score the keyword classifier against the labelled held-out ads.
import pandas as pd
work_arrangement_test = pd.read_csv(file_location['work_arrangements_test_set.csv'])

# Count ads whose predicted label equals the true label after lowercasing and
# trimming both sides. Zipping the two columns avoids building a Series per
# row, which is what iterrows() does.
results = sum(
    classify_work_arrangement(job_ad).lower().strip() == y_true.lower().strip()
    for job_ad, y_true in zip(work_arrangement_test['job_ad'], work_arrangement_test['y_true'])
)
count = len(work_arrangement_test)

print("count:", count)
print("result:", results)
# Report nan rather than raising ZeroDivisionError when the CSV has no rows.
print("correctly indentified:", results / count if count else float('nan'))


count: 99
result: 48
correctly indentified: 0.48484848484848486
