In [32]:
import pandas as pd
import re
import numpy as np

# Load CSV
file_path = "/Users/eddiezhang/Downloads/job_data_files/salary_labelled_development_set.csv"
df = pd.read_csv(file_path)

# Country to currency mapping
country_currency_map = {
    "PH": "PHP", "AUS": "AUD", "NZ": "NZD", "SG": "SGD",
    "MY": "MYR", "TH": "THB", "ID": "IDR", "HK": "HKD"
}

# Normalize period labels
def normalize_period(period):
    replacements = {
        "HOUR": "HOURLY", "DAY": "DAILY", "WEEK": "WEEKLY",
        "MONTH": "MONTHLY", "YEAR": "YEARLY", "ANNUAL": "YEARLY"
    }
    return replacements.get(period.upper(), period)

# Infer period based on amount
def infer_period_by_amount(amount):
    amount = int(amount)
    if amount < 500:
        return "HOURLY"
    elif amount < 20000:
        return "MONTHLY"
    else:
        return "YEARLY"

# Expanded anchor keyword list
anchor_keywords = [
    "COMPENSATION", "SALARY", "REMUNERATION", "PACKAGE", "PAY", "WAGE",
    "EARNINGS", "BONUS", "INCOME", "HOURLY RATE", "MONTHLY RATE",
    "ÊôÇËñ™", "Ëñ™Èáë", "Ëñ™ÈÖ¨", "ÊúàËñ™", "ÂæÖÈÅá", "ÂÖºËÅ∑ÊôÇËñ™", "ÂÖ®ËÅ∑ÊúàËñ™"
]

def extract_salary_with_inference(text, nation_code):
    text = text.replace(",", "")
    currency = country_currency_map.get(nation_code, "None")

    anchor_positions = []
    for keyword in anchor_keywords:
        for match in re.finditer(re.escape(keyword), text, flags=re.IGNORECASE):
            anchor_positions.append(match.start())

    # Look near each anchor
    for pos in anchor_positions:
        window = text[pos:pos + 60]

        # Match salary range (includes -, ‚Äì, ‚Äî, TO)
        range_match = re.search(r'\$?\b(\d{2,6})\b\s*(?:[-‚Äì‚Äî]|TO)+\s*\$?\b(\d{2,6})\b', window)
        if range_match:
            min_salary, max_salary = range_match.group(1), range_match.group(2)
            if int(min_salary) > 200000 or int(max_salary) > 200000:
                continue
            period_match = re.search(r'(MONTH|YEAR|WEEK|DAY|HOUR|ANNUAL|MONTHLY|WEEKLY)', window, flags=re.IGNORECASE)
            raw_period = period_match.group(1) if period_match else infer_period_by_amount(min_salary)
            period = normalize_period(raw_period)
            return f"{min_salary}-{max_salary}-{currency}-{period}"

        # Match single salary
        single_match = re.search(r'\$?\b(\d{2,6})\b', window)
        if single_match:
            min_salary = max_salary = single_match.group(1)
            if int(min_salary) > 200000:
                continue
            period_match = re.search(r'(MONTH|YEAR|WEEK|DAY|HOUR|ANNUAL|MONTHLY|WEEKLY)', window, flags=re.IGNORECASE)
            raw_period = period_match.group(1) if period_match else infer_period_by_amount(min_salary)
            period = normalize_period(raw_period)
            return f"{min_salary}-{max_salary}-{currency}-{period}"

    # Fallback ONLY if no anchors
    if not anchor_positions:
        generic_patterns = [
            r'\s*(?P<min>\d{4,6})\s*(?:[-‚Äì‚Äî]|TO)+\s*(?P<max>\d{4,6})[^A-Z]*(?P<period>MONTH|YEAR|WEEK|DAY|HOUR|ANNUAL|MONTHLY|WEEKLY)?',
            r'\s*(?P<min>\d{4,6})[^A-Z0-9]*(?P<period>MONTH|YEAR|WEEK|DAY|HOUR|ANNUAL|MONTHLY|WEEKLY)?'
        ]

        for pattern in generic_patterns:
            match = re.search(pattern, text, flags=re.IGNORECASE)
            if match:
                min_salary = match.group("min")
                max_salary = match.groupdict().get("max") or min_salary
                if int(min_salary) > 200000 or int(max_salary) > 200000:
                    continue

                # Avoid phone numbers
                start = text.find(min_salary)
                end = text.find(max_salary) + len(max_salary) if max_salary else start + 5
                surrounding = text[max(0, start-20):min(len(text), end+20)]

                if re.search(r'(TEL|HOTLINE|ÈõªË©±|CONTACT|WHATSAPP|RECRUIT)', surrounding, flags=re.IGNORECASE):
                    continue
                if re.search(r'\b\d{4}\s+\d{4}\b', surrounding):
                    continue

                raw_period = match.groupdict().get("period") or infer_period_by_amount(min_salary)
                period = normalize_period(raw_period)
                return f"{min_salary}-{max_salary}-{currency}-{period}"

    return "0-0-None-None"

# Apply extractor
df['predicted_salary'] = df.apply(
    lambda row: extract_salary_with_inference(
        f"{row['job_title']} {row['job_ad_details']}",
        row['nation_short_desc']
    ),
    axis=1
)

# Accuracy
accuracy = np.mean(df['predicted_salary'] == df['y_true'])
print(f"\nüåç Currency-Informed Baseline Accuracy: {accuracy:.2%}")

# Print prediction vs ground truth
print("\nüîç Prediction vs Ground Truth:\n")
for i, row in df.iterrows():
    predicted = row['predicted_salary']
    expected = row['y_true']
    if predicted != expected:
        print(f"[{i}] ‚ùå Predicted: {predicted} | Expected: {expected}")
    else:
        print(f"[{i}] ‚úÖ Matched:   {predicted}")

# Save predictions
df.to_csv("improved_salary_predictions.csv", index=False)



üåç Currency-Informed Baseline Accuracy: 49.05%

üîç Prediction vs Ground Truth:

[0] ‚úÖ Matched:   17500-17500-PHP-MONTHLY
[1] ‚úÖ Matched:   16000-16000-PHP-MONTHLY
[2] ‚úÖ Matched:   0-0-None-None
[3] ‚úÖ Matched:   0-0-None-None
[4] ‚úÖ Matched:   0-0-None-None
[5] ‚úÖ Matched:   50-60-HKD-HOURLY
[6] ‚úÖ Matched:   0-0-None-None
[7] ‚úÖ Matched:   16000-16000-PHP-MONTHLY
[8] ‚úÖ Matched:   17500-17500-PHP-MONTHLY
[9] ‚úÖ Matched:   32-32-NZD-HOURLY
[10] ‚ùå Predicted: 2800-2800-MYR-MONTHLY | Expected: 2800-3200-MYR-MONTHLY
[11] ‚ùå Predicted: 002526-002526-HKD-MONTHLY | Expected: 65-65-HKD-HOURLY
[12] ‚ùå Predicted: 28-28-NZD-HOURLY | Expected: 28-30-NZD-HOURLY
[13] ‚úÖ Matched:   0-0-None-None
[14] ‚úÖ Matched:   0-0-None-None
[15] ‚úÖ Matched:   0-0-None-None
[16] ‚úÖ Matched:   35-35-AUD-HOURLY
[17] ‚úÖ Matched:   6000-6000-MYR-MONTHLY
[18] ‚ùå Predicted: 00-75-HKD-HOURLY | Expected: 65-75-HKD-HOURLY
[19] ‚ùå Predicted: 8426-8426-SGD-MONTHLY | Expected: 0-0-None-None
[20] ‚ù