In [None]:
import pandas as pd
import numpy as np
import torch
import re
from transformers import LongformerTokenizerFast, LongformerModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# === Helper Functions ===

def parse_actual_info(info_str):
    """Parse a ``min-max-currency-unit`` string into its four components.

    Args:
        info_str: Hyphen-separated salary annotation, e.g.
            ``"1000-2000-AUD-MONTHLY"``.

    Returns:
        A tuple ``(min_salary, max_salary, currency, unit)`` where the two
        bounds are floats and ``unit`` is lower-cased, or ``None`` when the
        value cannot be used: it is not a string (e.g. a NaN cell), it does
        not split into exactly four fields, it is the ``0-0-None-None``
        placeholder, or either bound is not numeric.
    """
    # Missing cells reach this function as non-string values; treat them
    # like any other unusable annotation instead of failing on .split().
    if not isinstance(info_str, str):
        return None

    parts = info_str.split('-')
    if len(parts) != 4 or parts == ['0', '0', 'None', 'None']:
        return None

    low, high, currency, unit = parts
    try:
        return (float(low), float(high), currency, unit.lower())
    except ValueError:
        # Non-numeric bounds: report "no usable salary" rather than crash.
        return None

def get_aligned_tokens_and_labels(text, min_salary, max_salary):
    """Tokenize ``text`` and tag each token with a BIO salary label.

    Each token's own slice of ``text`` is stripped of every character that is
    not a digit or a dot and parsed as a float. A token whose value lies in
    ``[min_salary, max_salary]`` is tagged ``"B-SALARY"``, or ``"I-SALARY"``
    when the previous token was already tagged as salary. Every other token,
    including special tokens, is tagged ``"O"``.

    Args:
        text: Job description to tokenize (truncated to 4096 tokens).
        min_salary: Lower bound of the annotated salary range.
        max_salary: Upper bound of the annotated salary range.

    Returns:
        ``(tokens, labels, inputs)``: the token strings, one label per token,
        and the tokenizer output (including ``offset_mapping``).
    """
    inputs = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=4096, return_tensors="pt")
    offsets = inputs['offset_mapping'][0].tolist()
    input_ids = inputs['input_ids'][0]
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    word_ids = inputs.word_ids()

    non_numeric = re.compile(r'[^\d.]')
    salary_tags = ("B-SALARY", "I-SALARY")

    labels = []
    for (start, end), word_id in zip(offsets, word_ids):
        # Special tokens have no word id and/or an empty (0, 0) span.
        if word_id is None or (start == 0 and end == 0):
            labels.append("O")
            continue

        digits = non_numeric.sub('', text[start:end])
        try:
            in_range = min_salary <= float(digits) <= max_salary
        except (TypeError, ValueError):
            # ValueError: the token holds no parseable number ("" or "1.2.3").
            # TypeError: the bounds are not comparable with a float (e.g. None).
            # Either way the token is simply not a salary token.
            in_range = False

        if not in_range:
            labels.append("O")
        elif labels and labels[-1] in salary_tags:
            labels.append("I-SALARY")
        else:
            labels.append("B-SALARY")
    return tokens, labels, inputs

def get_token_embeddings_from_inputs(inputs, model, device):
    """
    Run ``model`` on tokenizer outputs and return its last hidden state.

    Every entry of ``inputs`` except ``'offset_mapping'`` is moved to
    ``device`` and forwarded to the model as a keyword argument. The forward
    pass runs without gradient tracking, and a leading dimension of size one
    is squeezed away from the result.
    """
    model_kwargs = {}
    for name, tensor in inputs.items():
        if name == 'offset_mapping':
            # Tokenizer-only field; the model does not accept it.
            continue
        model_kwargs[name] = tensor.to(device)

    with torch.no_grad():
        hidden_states = model(**model_kwargs).last_hidden_state
        return hidden_states.squeeze(0)

def extract_span(tokens, labels):
    """Return the text of a salary span found in a labelled token sequence.

    Collection starts at a ``"B-SALARY"`` token (a later ``"B-SALARY"`` seen
    while still collecting restarts the span), continues over ``"I-SALARY"``
    tokens, and stops at the first other label after a span has begun. The
    collected tokens are joined with the tokenizer's
    ``convert_tokens_to_string``.
    """
    collected = []
    started = False
    for tok, tag in zip(tokens, labels):
        if tag == "B-SALARY":
            collected = [tok]
            started = True
            continue
        if not started:
            # Nothing to extend or terminate before the first B-SALARY.
            continue
        if tag != "I-SALARY":
            break
        collected.append(tok)
    return tokenizer.convert_tokens_to_string(collected)

# === Device setup ===
# The model and all of its inputs are placed on the CPU.
device = torch.device("cpu")

# === Load datasets ===
dev_data = pd.read_csv('/Users/eddiezhang/Downloads/job_data_files/salary_labelled_development_set.csv')
test_data = pd.read_csv('/Users/eddiezhang/Downloads/job_data_files/salary_labelled_test_set.csv')

# === Currency map ===
# Maps the code found in the 4th CSV column to a currency code.
nation_currency = {
    "PH": "PHP", "NZ": "NZD", "AUS": "AUD", "HK": "HKD",
    "ID": "IDR", "MY": "MYR", "SG": "SGD", "TH": "THB"
}
# NOTE(review): this nation-derived 'currency' column is overwritten by the
# parsed currency in the "Parse & clean" step below — confirm which of the
# two sources is meant to win.
dev_data['currency'] = dev_data.iloc[:, 3].map(nation_currency)
test_data['currency'] = test_data.iloc[:, 3].map(nation_currency)

# === Parse & clean ===
_PARSED_COLUMNS = ['min_salary', 'max_salary', 'currency', 'unit']


def _expand_parsed(parsed):
    """Turn a Series of parsed tuples (or None) into a 4-column DataFrame.

    Rows that could not be parsed become all-NaN rows, so the result always
    has exactly the four expected columns regardless of where the unparseable
    rows sit. This avoids relying on how the DataFrame constructor treats
    None entries inside a list of tuples.
    """
    placeholder = (np.nan,) * len(_PARSED_COLUMNS)
    rows = [placeholder if item is None else item for item in parsed]
    return pd.DataFrame(rows, index=parsed.index, columns=_PARSED_COLUMNS)


for _frame in (dev_data, test_data):
    # The 6th CSV column holds the "min-max-currency-unit" annotation.
    _frame['parsed'] = _frame.iloc[:, 5].apply(parse_actual_info)
    _frame[_PARSED_COLUMNS] = _expand_parsed(_frame['parsed'])

# === Load Longformer ===
# Tokenizer and encoder are loaded from the same pretrained checkpoint; the
# encoder is moved to `device` and switched to evaluation mode.
_checkpoint_name = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizerFast.from_pretrained(_checkpoint_name)
model = LongformerModel.from_pretrained(_checkpoint_name)
model = model.to(device)
model = model.eval()

# === Training ===
def _collect_token_features(frame, split_name):
    """Embed the usable rows of ``frame`` and gather per-token features.

    A row is used when its job text (3rd column) is a string, the labelling
    step produces at least one ``"B-SALARY"`` token, and the number of labels
    matches the number of embedding rows. Skipped rows other than "no salary
    token found" are reported on stdout.

    Args:
        frame: DataFrame with ``min_salary`` / ``max_salary`` columns.
        split_name: Human-readable split name, used only in error messages.

    Returns:
        ``(embeddings, labels)``: a 2-D array with one row per token and the
        matching list of label strings.

    Raises:
        ValueError: if no row of ``frame`` is usable.
    """
    embedding_chunks, token_labels = [], []

    for idx, row in frame.iterrows():
        job_text = row.iloc[2]
        if not isinstance(job_text, str):
            # e.g. NaN from an empty CSV cell; there is nothing to tokenize.
            print(f"[SKIP {idx}] Job text is not a string")
            continue

        _, labels, inputs = get_aligned_tokens_and_labels(job_text, row['min_salary'], row['max_salary'])

        # Rows without any salary token are left out of the data set.
        if "B-SALARY" not in labels:
            continue

        embeddings = get_token_embeddings_from_inputs(inputs, model, device)

        if len(labels) != embeddings.shape[0]:
            print(f"[SKIP {idx}] Mismatch: {len(labels)} labels vs {embeddings.shape[0]} embeddings")
            continue

        embedding_chunks.append(embeddings.cpu().numpy())
        token_labels.extend(labels)

    if not embedding_chunks:
        raise ValueError(f"No usable rows found in the {split_name} set")

    # One concatenation instead of growing a Python list token by token.
    return np.concatenate(embedding_chunks, axis=0), token_labels


train_embeddings, train_labels = _collect_token_features(dev_data, "development")

label_encoder = LabelEncoder()
encoded_train_labels = label_encoder.fit_transform(train_labels)

clf = LogisticRegression(max_iter=1000)
clf.fit(train_embeddings, encoded_train_labels)

# === Testing ===
test_embeddings, test_labels = _collect_token_features(test_data, "test")

encoded_test_labels = label_encoder.transform(test_labels)
test_preds = clf.predict(test_embeddings)

# === Evaluation ===
# Pass the full set of encoded class ids explicitly: without `labels`, the
# report infers the classes from the test labels and predictions, and fails
# when that set is smaller than `target_names` (e.g. no I-SALARY token in the
# test rows).
_class_ids = label_encoder.transform(label_encoder.classes_)
print(classification_report(
    encoded_test_labels,
    test_preds,
    labels=_class_ids,
    target_names=label_encoder.classes_,
))
