In [21]:
import pandas as pd
from bs4 import BeautifulSoup
import re


In [22]:
# Read data
data = pd.read_csv('../DATASETS/salary_labelled_development_set.csv')

# Mapping from nation codes to currency codes
nation_currency = {
    "PH": "PHP",   # Philippines -> Philippine Peso
    "NZ": "NZD",   # New Zealand -> New Zealand Dollar
    "AUS": "AUD",  # Australia -> Australian Dollar
    "HK": "HKD",   # Hong Kong -> Hong Kong Dollar
    "ID": "IDR",   # Indonesia -> Indonesian Rupiah
    "MY": "MYR",   # Malaysia -> Malaysian Ringgit
    "SG": "SGD",   # Singapore -> Singapore Dollar
    "TH": "THB"    # Thailand -> Thai Baht
}

In [23]:
salary_info = data.iloc[:, 2]
data['currency'] = data.iloc[:, 3].map(nation_currency)
actual_info_series = data.iloc[:, 5]

In [24]:
def clean_html_tags(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    return soup.get_text()


In [25]:
def detect_unit_extended(window_text):
    """
    Detect the unit information within the given text window.
    Supports five cases: hourly, daily, weekly, monthly, and annual.
    The function searches for all possible matches using regular expressions
    and returns the unit corresponding to the earliest match.
    If no match is found, it defaults to "monthly".
    """
    unit_patterns = {
        "hourly": r'(per\s+hour|hourly|時薪|每小時|每小時薪資)',
        "daily":  r'(per\s+day|daily|日薪|每天|每日薪資)',
        "weekly": r'(per\s+week|weekly|週薪|每週|周薪|每周薪資)',
        "monthly": r'(per\s+month|monthly|月薪|每月|每月薪資)',
        "annual": r'(per\s+year|yearly|annually|remuneration|super|年薪|每年|每年薪資|年度薪資)'
    }
    matches = []
    for unit, pattern in unit_patterns.items():
        m = re.search(pattern, window_text, re.IGNORECASE)
        if m:
            matches.append((m.start(), unit))
    if matches:
        matches.sort(key=lambda x: x[0])
        return matches[0][1]
    else:
        return "monthly"  # Default unit

In [26]:
def extract_salary_with_unit(text):
    """
    Extract salary values and the unit from the given text.
    - If a salary range is found (e.g., "$50 - $60" or "50 to 60"), return (min, max, unit);
    - If only one value is found, return (value, value, unit);
    - If extraction fails, return None.
    """
    clean_text = clean_html_tags(text)
    tokens = clean_text.split()

    # Iterate over tokens to find salary-related keywords
    for i, token in enumerate(tokens):
        token_lower = token.lower().strip(":")
        if (token_lower in ['待遇', 'salary', 'wage', 'compensation', 'remuneration']
                or "薪" in token or "$" in token or "¥" in token or "₱" in token):
            # Define a window (using the next 6 tokens as context)
            end = min(i + 6, len(tokens))
            window = tokens[i:end]
            window_text = " ".join(window)
            # Check for a salary range pattern (e.g., "$50 - $60" or "50 to 60")
            range_match = re.search(r'[¥$₱]?\s*(\d+(?:,\d+)*(?:\.\d+)?)\s*(?:to|-)\s*[¥$₱]?\s*(\d+(?:,\d+)*(?:\.\d+)?)', window_text)
            if range_match:
                value1 = float(range_match.group(1).replace(',', ''))
                value2 = float(range_match.group(2).replace(',', ''))
                unit = detect_unit_extended(window_text)
                return (value1, value2, unit)
            # Otherwise, check for a single value pattern
            for word in window:
                word_clean = word.strip(",.:;!()")
                word_clean = re.sub(r'^[¥$₱]', '', word_clean)
                match = re.search(r'\d+(?:,\d+)*(?:\.\d+)?', word_clean)
                if match:
                    value = float(match.group(0).replace(',', ''))
                    unit = detect_unit_extended(window_text)
                    return (value, value, unit)
    return None

In [27]:
# Function to parse the actual information string
def parse_actual_info(info_str):
    """
    Parse the actual information string (format: "min-max-currency-unit")
    and return a tuple (min, max, currency, unit).
    """
    try:
        parts = info_str.split('-')
        if len(parts) != 4 or parts == ['0', '0', 'None', 'None']:
            return None
        actual_min = float(parts[0])
        actual_max = float(parts[1])
        actual_currency = parts[2]
        actual_unit = parts[3].lower()  # Convert to lowercase for comparison
        return (actual_min, actual_max, actual_currency, actual_unit)
    except Exception as e:
        return None


In [28]:
total_count = 0        # Total number of samples with actual labels
correct_count = 0      # Count of exact matches

# Process each row
for idx, row in data.iterrows():
    raw_text = row.iloc[2]  # Column containing the raw salary information
    predicted = extract_salary_with_unit(raw_text)  # Returns (min, max, unit) or None

    # If a value is successfully extracted, combine it with the mapped currency to form the prediction result
    if predicted is not None:
        pred_min, pred_max, pred_unit = predicted
        # Get the predicted currency from the existing 'currency' column
        pred_currency = row['currency']
        predicted = (pred_min, pred_max, pred_currency, pred_unit.lower())

    # Parse the actual information
    actual_info = row.iloc[5]  # Assume the sixth column contains the actual information
    actual = parse_actual_info(actual_info)

    # Increase total count for every sample with an actual label
    total_count += 1
    if actual is not None:
        # If a prediction exists and all four elements match exactly, count it as correct
        if predicted is not None:
            if predicted == actual:
                correct_count += 1
                print(f"[{idx}] ❌ Predicted: {predicted} | Expected: {actual}")
            else:
                print(f"[{idx}] ✅ Matched:   {predicted}")
    else:
        if predicted is None:
            correct_count += 1
            print(f"[{idx}] ✅ Matched:   {predicted}")
        else:
            print(f"[{idx}] ❌ Predicted: {predicted} | Expected: {actual}")

accuracy = correct_count / total_count if total_count > 0 else None

print("Total samples with actual info:", total_count)
print("Number of correct predictions:", correct_count)
print("Accuracy:", accuracy)


[0] ❌ Predicted: (17500.0, 17500.0, 'PHP', 'monthly') | Expected: (17500.0, 17500.0, 'PHP', 'monthly')
[1] ❌ Predicted: (16000.0, 16000.0, 'PHP', 'monthly') | Expected: (16000.0, 16000.0, 'PHP', 'monthly')
[2] ✅ Matched:   None
[3] ✅ Matched:   None
[4] ✅ Matched:   None
[5] ❌ Predicted: (50.0, 60.0, 'HKD', 'hourly') | Expected: (50.0, 60.0, 'HKD', 'hourly')
[6] ✅ Matched:   None
[7] ❌ Predicted: (16000.0, 16000.0, 'PHP', 'monthly') | Expected: (16000.0, 16000.0, 'PHP', 'monthly')
[8] ❌ Predicted: (17500.0, 17500.0, 'PHP', 'monthly') | Expected: (17500.0, 17500.0, 'PHP', 'monthly')
[9] ❌ Predicted: (32.0, 32.0, 'NZD', 'hourly') | Expected: (32.0, 32.0, 'NZD', 'hourly')
[11] ❌ Predicted: (65.0, 65.0, 'HKD', 'hourly') | Expected: (65.0, 65.0, 'HKD', 'hourly')
[12] ✅ Matched:   (28.0, 30.0, 'NZD', 'monthly')
[13] ✅ Matched:   None
[14] ✅ Matched:   None
[15] ✅ Matched:   None
[16] ✅ Matched:   (35.0, 35.0, 'AUD', 'annual')
[17] ❌ Predicted: (6000.0, 6000.0, 'MYR', 'monthly') | Expected: (