Import data from Google Drive

In [7]:
import os
from google.colab import drive

drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/AB_job_data_files'

import os
from google.colab import drive

# find location of different files:
file_location = {}
# Check if the folder exists
if os.path.exists(folder_path):
  os.chdir(folder_path)
  for num, f in enumerate(os.listdir()):  # List files and directories in the current folder
    file_location[f] = os.path.join(folder_path, f)

else:
  print(f"Folder not found: {folder_path}")

print(file_location)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
{'seniority_labelled_test_set.csv': '/content/drive/MyDrive/AB_job_data_files/seniority_labelled_test_set.csv', 'unlabelled_development_set.csv': '/content/drive/MyDrive/AB_job_data_files/unlabelled_development_set.csv', 'salary_labelled_development_set.csv': '/content/drive/MyDrive/AB_job_data_files/salary_labelled_development_set.csv', 'seniority_labelled_development_set.csv': '/content/drive/MyDrive/AB_job_data_files/seniority_labelled_development_set.csv', 'work_arrangements_test_set.csv': '/content/drive/MyDrive/AB_job_data_files/work_arrangements_test_set.csv', 'salary_labelled_test_set.csv': '/content/drive/MyDrive/AB_job_data_files/salary_labelled_test_set.csv', 'work_arrangements_development_set.csv': '/content/drive/MyDrive/AB_job_data_files/work_arrangements_development_set.csv', 'results': '/content/drive/MyDrive/AB_job_data_files/results', 'wandb

Create a function to extract salary

In [8]:
import spacy
import re

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_salary(text):
    """
    Extracts salary information from text using spaCy and regular expressions.

    Args:
        text: The input text string.

    Returns:
        A list of strings, each representing a detected salary in the format:
        minsalary-maxsalary-currency-frequency
    """
    doc = nlp(text)
    salaries_higher_confidence = []
    salaries_lower_confidence = []

    currency_pattern = r"\$|£|€|USD|GBP|EUR|AUD|MYR|SGD|PHP|NZD|k|K"
    range_pattern = r"(?:between|from)\s*[\d,.]+[kK]?\s*(?:(?:-|to)\s*[\d,.]+[kK]?)?|\d{1,3}(?:,\d{3})*(?:\.\d+)?[kK]?"

    # Two methods will be used to look for salary:

    # Method 1: Use inbuilt functions to look for MONEY and CARDINAL labels.
    for ent in doc.ents:

        # If any money is detected then add it to salary
        if ent.label_ in ["MONEY"]:
            salaries_higher_confidence.append(ent.text)

        # If any number is detected then to improve the accuracy, a second pass is added.
        elif ent.label_ in ["CARDINAL"]:
            if re.search(currency_pattern, ent.text) or re.search(range_pattern, ent.text):
                salaries_higher_confidence.append(ent.text)

    # Method 2: Check for potential salaries outside entities
    for token in doc:
        if token.pos_ in ["NUM", "SYM"] and re.search(currency_pattern, token.text):  # Check for currency symbols
            salaries_lower_confidence.append(token.text)
        elif token.pos_ == "NUM" and re.search(range_pattern, token.text):  # Check for numerical ranges
            salaries_lower_confidence.append(token.text)
        elif token.dep_ in ["nummod", "nmod"] and token.head.text.lower() in ["salary", "wage", "pay"]:
            # Check for numerical modifiers of salary-related words
            potential_salary = token.text
            if token.head.text.lower() in ["salary", "wage", "pay"]:
                salaries_lower_confidence.append(potential_salary)
        # Check for numbers ending with "K"
        elif token.pos_ == "NUM" and (token.text.endswith("k") or token.text.endswith("K")):
            salaries_lower_confidence.append(token.text)

    # 1: change k|K to 000
    salaries_higher_confidence = [salary.replace('k', '000').replace('K', '000') for salary in salaries_higher_confidence if salary]
    salaries_lower_confidence = [salary.replace('k', '000').replace('K', '000') for salary in salaries_lower_confidence if salary]

    # 2: Remove non-numeric characters (except comma, hyphen, "to", and "between")
    updated_salaries = []
    for salary in salaries_lower_confidence:
        modified_salary = re.sub(r"[^0-9,\-tobetween]", "", salary)  # Keep commas, hyphens, "to", and "between"
        updated_salaries.append(modified_salary)
    salaries_lower_confidence = updated_salaries

    updated_salaries = []
    for salary in salaries_higher_confidence:
        modified_salary = re.sub(r"[^0-9\-tobetween]", "", salary)  # Keep commas, hyphens, "to", and "between"
        updated_salaries.append(modified_salary)
    salaries_higher_confidence = updated_salaries

    # select appropriate salary
    selected_salary = None

    # Step 1: Check for range formats in higher confidence salaries
    if any(any(keyword in salary for keyword in ['-', 'to', 'between']) for salary in salaries_higher_confidence):
        selected_salary = next((salary for salary in salaries_higher_confidence if any(keyword in salary for keyword in ['-', 'to', 'between'])), None)
        selected_salary = f"{selected_salary}"

    # Step 2: If there are any numeric salaries in higher confidence
    elif salaries_higher_confidence:
        try:
            numeric_salaries = [int(s.replace(',', '')) for s in salaries_higher_confidence if s.replace(',', '').replace('.', '').isdigit()]
            if numeric_salaries:
                selected_salary = f"{min(numeric_salaries)}-{max(numeric_salaries)}"
        except ValueError:
            pass

    # Step 3: If there are any numeric salaries in lower confidence
    elif salaries_lower_confidence:
        try:
            numeric_salaries = [int(s.replace(',', '')) for s in salaries_lower_confidence if s.replace(',', '').replace('.', '').isdigit()]
            if numeric_salaries:
                selected_salary = f"{min(numeric_salaries)}-{max(numeric_salaries)}"
        except ValueError:
            pass

    # Step 4: Default fallback
    if not selected_salary:
        return "0-0-None-None"


    # Extract min and max salary (if range is present)
    parts = re.split(r"-|to_between", selected_salary)  # Split by "-" or "to"
    min_salary = parts[0].strip()
    max_salary = parts[1].strip() if len(parts) > 1 else parts[0].strip()  # If no max, use min

    # Extract currency (using regex) - assumption order: $ £ €
    currency_match = re.search(r"\$|£|€|USD|GBP|EUR|AUD|MYR|SGD|PHP|NZD", salary)
    currency = currency_match.group(0) if currency_match else "USD"  # Default to USD

    # Extract frequency (looking for keywords)
    frequency = "YEARLY"  # Default to yearly
    if "month" in salary.lower() or "monthly" in salary.lower():
        frequency = "MONTHLY"
    elif "hour" in salary.lower() or "hourly" in salary.lower():
        frequency = "HOURLY"

    # Format the output string
    formatted_salary = f"{min_salary}-{max_salary}-{currency}-{frequency}"

    # print('formatted salaries', formatted_salary)
    return formatted_salary



Test example

In [9]:
# Example usage

text = """<div><div><div>

 Job Opening

 <p>
 Financial Account - Call Center Agent - Up to $34k
 </p>
 </div></div></div>
<div><div></div></div>
<div><div><div>

 Job Industry


 <p>
 Telecommunications </p>
 </div></div></div>
<div><div></div></div>
<div><div><div>

 Job Type

 <p>
 Full-Time </p>
 </div></div></div>
<div><div></div></div>
<div><div><div>

 Experience Level


 <p>
 Entry Level </p>
 </div></div></div>
<div><div></div></div>
<div><div><div>

 Date Posted

 <p>
 2022-10-27 </p>
 </div></div></div>
<div><div></div></div>
<div><div><div>

 Job Location

 <p>
 Pasig BlvdPasig1000NCRPhilippines </p>
 </div></div></div>
<div><div></div></div>
<div><div><div>

 Company Information

 <p>
 Sapient

 Pasig Blvd
 Cebu, Central Visayas
 6019
 Sapient is Philippine-based BPO that provides a range of outsourcing services from consulting services, IT-enabled services, and call center services primarily catering small and medium based enterprises.
 </p>
 </div></div></div>
<div><div></div></div>
<div><div><div>

 Job Description

 <div>
<p>Job Responsibilities:</p>
<ul>
<li>Answers phone calls and provides important information/ assistance to clients</li>
<li>Checks mail, fax and internet mail to provide customer assistance</li>
<li>Communicates with customer on the phone or using written correspondence to take care of concerns</li>
<li>Answer participant questions, , as well as talk to participants to achieve full understanding of what critical information are being asked.</li>
</ul>
</div>
 </div></div></div>
<div><div></div></div>
<div><div><div>

 Job Qualifications

 <div>
<p>What are we looking for? Â  Â  Â  Â </p>
<ul>
<li>Open to candidates who completed college</li>
<li>Open to</li>
</ul>
<p>High School and Senior High School Graduates with BPO experience</p>
<ul>
<li>Excellent to above average English communication skills</li>
<li>BPO experience of at least 6 months or have work experience</li>
<li>Can do onsite work</li>
<li>With in 25km to 35 km</li>
</ul>
</div>
 </div></div></div>
<div><div></div></div>
<div><div><div>

 Compensation

 <p>
 17500 </p>
 </div></div></div>
<div><div></div></div>
<div><div><div>

 Compensation Range

 <p>
 â‚±15,000 - â‚±20,000 </p>
 </div></div></div>
<div><div></div></div>
<div><div></div></div>
<div><div></div></div>"""

salary_info = extract_salary(text)

print(salary_info)

15000-20000-USD-YEARLY


Testing on Development Set for salary extraction

In [10]:
# DEV set
import pandas as pd
salary_dev = pd.read_csv(file_location['salary_labelled_development_set.csv'])

results = 0
count = 0

for index, row in salary_dev.iterrows(): # Iterate through the rows of the dataframe
    predicted_label = extract_salary(row['job_ad_details'])
    true_label =  row['y_true']

    print(predicted_label, '----', true_label)
    print(predicted_label == true_label)
    if predicted_label == true_label:
        results += 1
    count += 1

print("count:", count)
print("result:", results)
print("correctly indentified:", results/count)


17500-34000-USD-YEARLY ---- 17500-17500-PHP-MONTHLY
False
5-10-USD-YEARLY ---- 16000-16000-PHP-MONTHLY
False
200-200-USD-YEARLY ---- 0-0-None-None
False
1-5-USD-YEARLY ---- 0-0-None-None
False
0-0-None-None ---- 0-0-None-None
True
1-8621-USD-YEARLY ---- 50-60-HKD-HOURLY
False
3-3-USD-YEARLY ---- 0-0-None-None
False
20000-20000-USD-YEARLY ---- 16000-16000-PHP-MONTHLY
False
2-20000-USD-YEARLY ---- 17500-17500-PHP-MONTHLY
False
6-3200-USD-YEARLY ---- 32-32-NZD-HOURLY
False
3200tonton-3200tonton-USD-YEARLY ---- 2800-3200-MYR-MONTHLY
False
13-23-USD-YEARLY ---- 65-65-HKD-HOURLY
False
28-30-$-YEARLY ---- 28-30-NZD-HOURLY
False
700-700-USD-YEARLY ---- 0-0-None-None
False
8-8-USD-YEARLY ---- 0-0-None-None
False
3-3-USD-YEARLY ---- 0-0-None-None
False
2023-2023-USD-YEARLY ---- 35-35-AUD-HOURLY
False
6000-6000-USD-YEARLY ---- 6000-6000-MYR-MONTHLY
False
35-24-USD-YEARLY ---- 65-75-HKD-HOURLY
False
8426-8426-USD-YEARLY ---- 0-0-None-None
False
30-2022-USD-YEARLY ---- 0-0-None-None
False
2-100-USD

Testing on Test set for salary extraction

In [11]:
# TEST set
import pandas as pd
salary_test = pd.read_csv(file_location['salary_labelled_test_set.csv'])

results = 0
count = 0


for index, row in salary_test.iterrows(): # Iterate through the rows of the dataframe
    predicted_label = extract_salary(row['job_ad_details'])
    true_label =  row['y_true']

    #print(predicted_label, '----', true_label)
    #print(predicted_label == true_label)
    if predicted_label == true_label:
        results += 1
    count += 1

print("count:", count)
print("result:", results)
print("correctly indentified:", results/count)


count: 567
result: 25
correctly indentified: 0.04409171075837742
