In [None]:
import pandas as pd
import numpy as np
import torch
import re
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# === Device setup ===
# Everything (model and input tensors) is placed on the CPU.
device = torch.device("cpu")

# === Load datasets ===
# Development set is read first, test set second.
dev_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/eddiezhang/Downloads/job_data_files/salary_labelled_development_set.csv',
        '/Users/eddiezhang/Downloads/job_data_files/salary_labelled_test_set.csv',
    )
)

# === Map country to currency ===
# Lookup from the code stored in the 4th column to a currency code.
nation_currency = {
    "PH": "PHP", "NZ": "NZD", "AUS": "AUD", "HK": "HKD",
    "ID": "IDR", "MY": "MYR", "SG": "SGD", "TH": "THB",
}
# Codes missing from the lookup become NaN in the new 'currency' column.
for frame in (dev_data, test_data):
    frame['currency'] = frame.iloc[:, 3].map(nation_currency)

# === Parse salary info ===
def parse_actual_info(info_str):
    """Parse a ``"min-max-currency-unit"`` label string.

    Args:
        info_str: Label text made of four ``-``-separated fields, e.g.
            ``"3000-4000-AUD-MONTHLY"``.

    Returns:
        ``(min_salary, max_salary, currency, unit)`` with the two salaries as
        floats and the unit lower-cased, or ``None`` when the value is not a
        string, does not have exactly four fields, has non-numeric salary
        fields, or is the ``"0-0-None-None"`` placeholder.
    """
    try:
        parts = info_str.split('-')
        if len(parts) != 4 or parts == ['0', '0', 'None', 'None']:
            return None
        return (float(parts[0]), float(parts[1]), parts[2], parts[3].lower())
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input (e.g. NaN from pandas);
        # ValueError: salary fields that are not numbers.  Anything else
        # (KeyboardInterrupt, MemoryError, ...) is deliberately not swallowed.
        return None

# Parse the label string held in the 6th column of each dataset.
dev_data['parsed'] = dev_data.iloc[:, 5].apply(parse_actual_info)
test_data['parsed'] = test_data.iloc[:, 5].apply(parse_actual_info)

# Keep only rows with a usable label.  `.copy()` detaches the filtered frame
# from the original so the column assignments below do not write to a slice
# (which triggers pandas' SettingWithCopyWarning).
dev_data = dev_data[dev_data['parsed'].notnull()].copy()
test_data = test_data[test_data['parsed'].notnull()].copy()

# Unpack the parsed tuples into separate columns.  Naming the columns
# explicitly keeps the assignment valid even when no row survived the filter.
# Note: this overwrites any existing 'currency' column with the parsed value.
parsed_columns = ['min_salary', 'max_salary', 'currency', 'unit']
dev_data[parsed_columns] = pd.DataFrame(
    dev_data['parsed'].tolist(), index=dev_data.index, columns=parsed_columns
)
test_data[parsed_columns] = pd.DataFrame(
    test_data['parsed'].tolist(), index=test_data.index, columns=parsed_columns
)

# === Load BERT ===
# Tokenizer and encoder come from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Move the encoder to the target device and switch it to inference mode;
# both calls return the module itself, so they can be chained.
bert = BertModel.from_pretrained('bert-base-uncased').to(device).eval()

# === Helpers ===
def label_tokens(text, salary_span):
    """Tokenise ``text`` and tag the first occurrence of ``salary_span``.

    Both strings are tokenised with the module-level ``tokenizer``; the first
    position where the span's tokens appear contiguously in the text's tokens
    is tagged ``B-SALARY`` followed by ``I-SALARY``; every other token is
    tagged ``O``.

    Args:
        text: Full text to tokenise.
        salary_span: Substring whose tokens should be located in ``text``.

    Returns:
        ``(tokens, labels)`` — two lists of equal length.  All labels are
        ``'O'`` when the span is empty or does not occur.
    """
    tokens = tokenizer.tokenize(text)
    labels = ['O'] * len(tokens)
    salary_tokens = tokenizer.tokenize(salary_span)
    span_len = len(salary_tokens)

    # An empty span would otherwise "match" at position 0 and wrongly tag the
    # first token as the beginning of a salary.
    if span_len == 0:
        return tokens, labels

    # Only start positions that leave room for the whole span can match.
    for start in range(len(tokens) - span_len + 1):
        if tokens[start:start + span_len] == salary_tokens:
            labels[start] = 'B-SALARY'
            labels[start + 1:start + span_len] = ['I-SALARY'] * (span_len - 1)
            break
    return tokens, labels

def get_token_embeddings(text):
    """Return the final-layer BERT hidden states for ``text``.

    The text is encoded with the module-level ``tokenizer`` (truncated to at
    most 512 positions), run through ``bert`` without gradient tracking, and
    the batch dimension is squeezed away.

    NOTE(review): the tokenizer is called with its default settings, which
    presumably wrap the sequence in special tokens — callers comparing the
    result against ``tokenizer.tokenize`` output should confirm the lengths.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(device)
    with torch.no_grad():
        hidden_states = bert(**encoded).last_hidden_state
    return hidden_states.squeeze(0)

def extract_span(tokens, labels):
    """Rebuild the text of the tagged salary span.

    Walks the token/label pairs in order.  A ``B-SALARY`` tag (re)starts the
    span, following ``I-SALARY`` tags extend it, and the first other tag seen
    after the span has started ends the scan.  ``I-SALARY`` tags seen before
    any ``B-SALARY`` are ignored.

    Returns:
        The collected tokens joined by ``tokenizer.convert_tokens_to_string``
        (the result for an empty token list when no span was found).
    """
    collected = []
    started = False
    for tok, tag in zip(tokens, labels):
        if tag == "B-SALARY":
            # A new beginning tag always replaces whatever was gathered.
            collected = [tok]
            started = True
            continue
        if not started:
            continue
        if tag != "I-SALARY":
            break
        collected.append(tok)
    return tokenizer.convert_tokens_to_string(collected)

def extract_numeric_range(span_str):
    """Pull a ``(low, high)`` pair of floats out of ``span_str``.

    The first two numbers found (integers or decimals) are used.  A single
    number is returned as both bounds; no number at all yields
    ``(None, None)``.
    """
    matches = re.findall(r'\d+(?:\.\d+)?', span_str)
    if not matches:
        return None, None
    low = float(matches[0])
    # With only one number present, the range collapses to that value.
    high = float(matches[1]) if len(matches) > 1 else low
    return low, high

def predict_currency(text):
    """Return the first known currency code mentioned in ``text``.

    Codes are tried in a fixed priority order and matched case-insensitively.
    A code only counts when it is not embedded in a longer run of letters, so
    ordinary words such as "audit" or "myriad" no longer trigger ``AUD`` /
    ``MYR``; codes glued to digits ("AUD5000", "5000aud") still match.

    Args:
        text: Free text to search.

    Returns:
        The matching upper-case code, or ``"UNKNOWN"`` when none is present.
    """
    lowered = text.lower()  # lower-case once instead of once per code
    for cur in ["PHP", "NZD", "AUD", "HKD", "IDR", "MYR", "SGD", "THB"]:
        # Letter-only lookarounds: reject neighbours a-z, allow digits,
        # punctuation, whitespace and non-Latin characters.
        if re.search(rf"(?<![a-z]){cur.lower()}(?![a-z])", lowered):
            return cur
    return "UNKNOWN"

def predict_timeframe(text):
    """Guess the pay period mentioned in ``text``.

    Periods are tried in the order hourly, daily, weekly, monthly, annual and
    the first one with a matching keyword wins.  Latin keywords must stand
    alone as a word (an optional plural "s" is accepted), so "hr" no longer
    fires inside "through" nor "day" inside "today".  Chinese keywords are
    matched as plain substrings because they are not space-delimited.

    Args:
        text: Free text to search.

    Returns:
        One of ``"hourly"``, ``"daily"``, ``"weekly"``, ``"monthly"``,
        ``"annual"``, or ``"unknown"`` when no keyword is found.
    """
    keywords = {
        "hourly": ["hour", "hr", "hourly", "每小時", "時薪"],
        "daily": ["day", "daily", "日薪", "每日"],
        "weekly": ["week", "weekly", "周薪", "每週"],
        "monthly": ["month", "monthly", "月薪", "每月"],
        "annual": ["year", "annual", "yearly", "annually", "年薪", "每年"]
    }
    lowered = text.lower()  # lower-case once instead of once per keyword
    for unit, terms in keywords.items():
        for term in terms:
            if term.isascii():
                # Reject matches that are part of a longer run of letters.
                pattern = rf"(?<![a-z]){re.escape(term)}s?(?![a-z])"
                if re.search(pattern, lowered):
                    return unit
            elif term in lowered:
                return unit
    return "unknown"

# === Training ===
# Build a token-level training set: one BERT embedding and one tag
# (O / B-SALARY / I-SALARY) per word-piece of every usable development row,
# then fit a logistic-regression tagger on it.
train_embeddings, train_labels = [], []

for _, row in dev_data.iterrows():
    job_text = row.iloc[2]
    salary_span = f"{int(row['min_salary'])} to {int(row['max_salary'])}"
    tokens, labels = label_tokens(job_text, salary_span)

    # Rows where the span was not found provide no positive example; check
    # this first so BERT is not run on rows that will be discarded anyway.
    if "B-SALARY" not in labels:
        continue

    # tokenizer(text) wraps the sequence in [CLS] ... [SEP], whereas
    # tokenizer.tokenize(text) does not; drop those two positions so the
    # embeddings line up one-to-one with `tokens`.
    embeddings = get_token_embeddings(job_text)[1:-1]

    # Still unequal means the text was truncated to the model's max length;
    # such rows cannot be aligned and are skipped.
    if embeddings.shape[0] != len(tokens):
        continue

    train_embeddings.extend(embeddings.numpy())
    train_labels.extend(labels)

if not train_labels:
    raise ValueError("No development rows produced aligned salary labels; cannot train.")

train_embeddings = np.array(train_embeddings)
label_encoder = LabelEncoder()
encoded_train_labels = label_encoder.fit_transform(train_labels)
clf = LogisticRegression(max_iter=1000)
clf.fit(train_embeddings, encoded_train_labels)


# === Testing ===
# Embed every usable test row, remember its tokens and gold tags, and run the
# trained tagger over all test tokens in one call.
test_embeddings, test_labels = [], []
test_tokens_list, test_true_labels_list = [], []
skipped_test = 0
kept_positions = []  # positional indices of the test rows actually embedded

for position, (_, row) in enumerate(test_data.iterrows()):
    job_text = row.iloc[2]
    salary_span = f"{int(row['min_salary'])} to {int(row['max_salary'])}"

    if salary_span not in job_text:
        skipped_test += 1
        continue

    tokens, labels = label_tokens(job_text, salary_span)
    # Drop the [CLS]/[SEP] positions added by tokenizer(text) so embeddings
    # align one-to-one with the tokens from tokenizer.tokenize(text).
    embeddings = get_token_embeddings(job_text)[1:-1]

    # A remaining mismatch means the text was truncated; skip the row.
    if embeddings.shape[0] != len(tokens):
        skipped_test += 1
        continue

    test_embeddings.extend(embeddings.numpy())
    test_labels.extend(labels)
    test_tokens_list.append(tokens)
    test_true_labels_list.append(labels)
    kept_positions.append(position)

# Later code pairs test_data rows with test_tokens_list by position, so keep
# only the rows that were embedded, in the same order.
test_data = test_data.iloc[kept_positions]

test_embeddings = np.array(test_embeddings)
# scikit-learn refuses to predict on zero samples; fall back to an empty result.
if len(test_embeddings):
    test_preds = clf.predict(test_embeddings)
else:
    test_preds = np.array([], dtype=int)
id2label = {i: label for i, label in enumerate(label_encoder.classes_)}


In [None]:
# === Evaluation: Full prediction vs actual ===
# For each test row, rebuild a "min-max-currency-unit" string from the model's
# predictions and compare it (case-insensitively) with the gold string.
print("\n🧾 Final Structured Predictions by Test Row")
print("-" * 70)

correct_full_match = 0
total_rows = 0
start_idx = 0  # offset of the current row's first token inside test_preds

for i, row in enumerate(test_data.itertuples()):
    # Rows are paired with test_tokens_list by position; stop when the
    # token lists run out.
    if i >= len(test_tokens_list):
        break

    # NOTE(review): assumes the text column is named `job_ad_details` — confirm
    # it is the same column the earlier cells read via `row.iloc[2]`.
    job_text = row.job_ad_details
    true_currency = row.currency
    true_unit = row.unit.lower()
    true_min = int(row.min_salary)
    true_max = int(row.max_salary)

    tokens = test_tokens_list[i]
    true_labels = test_true_labels_list[i]

    # test_preds is one flat array over all test tokens; take this row's slice.
    end_idx = start_idx + len(tokens)
    pred_labels = [id2label[idx] for idx in test_preds[start_idx:end_idx]]
    start_idx = end_idx

    pred_span = extract_span(tokens, pred_labels)
    pred_min, pred_max = extract_numeric_range(pred_span)
    pred_currency = predict_currency(job_text)
    pred_unit = predict_timeframe(job_text)

    if pred_min is not None and pred_max is not None:
        pred_string = f"{int(pred_min)}-{int(pred_max)}-{pred_currency}-{pred_unit}"
    else:
        pred_string = "0-0-NONE-NONE"

    true_string = f"{true_min}-{true_max}-{true_currency}-{true_unit}"
    is_correct = pred_string.lower() == true_string.lower()

    correct_full_match += int(is_correct)
    total_rows += 1

    print(f"[Row {i}]")
    print(f"Prediction: {pred_string}")
    print(f"Actual:     {true_string}")
    print("✅ CORRECT" if is_correct else "❌ INCORRECT")
    print()

# Avoid ZeroDivisionError when no test row could be evaluated.
final_accuracy = correct_full_match / total_rows * 100 if total_rows else 0.0

print("🔚 Structured Prediction Summary")
print("-" * 70)
print(f"✔️ Total Test Rows Evaluated:   {total_rows}")
print(f"🎯 Fully Correct Predictions:   {correct_full_match}")
print(f"🎯 Final Match Accuracy:        {final_accuracy:.2f}%")
