In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import re
from bs4 import BeautifulSoup
from transformers import BertTokenizerFast, BertModel
from sklearn.preprocessing import LabelEncoder

# === Constants ===
# Sentinel tags appended to the tag vocabulary after LABELS; the CRF scores the
# transition out of START_TAG before the first token and into STOP_TAG after
# the last one.
START_TAG = "<START>"
STOP_TAG = "<END>"
# BIO-style tag set for salary spans: begin, inside, outside.
LABELS = ["B-SALARY", "I-SALARY", "O"]

# === Helper Functions ===
def parse_actual_info(info_str):
    """Parse a ``min-max-currency-unit`` label string.

    Args:
        info_str: Label such as ``"1000-2000-AUD-MONTHLY"``.

    Returns:
        ``(min_salary, max_salary, currency, unit)`` with both amounts as
        floats and the unit lower-cased, or ``None`` when the value:

        * is not a string (e.g. a NaN coming from an empty CSV cell),
        * does not split into exactly four ``-``-separated parts,
        * is the ``0-0-None-None`` "no salary" sentinel, or
        * has amounts that cannot be converted to ``float``.
    """
    # Empty CSV cells arrive as float NaN, which has no ``split``.
    if not isinstance(info_str, str):
        return None

    parts = info_str.split('-')
    if len(parts) != 4 or parts == ['0', '0', 'None', 'None']:
        return None

    try:
        low, high = float(parts[0]), float(parts[1])
    except ValueError:
        # A malformed label is treated like a missing one instead of aborting
        # the whole DataFrame ``apply``.
        return None

    return (low, high, parts[2], parts[3].lower())

def clean_html_tags(html_text):
    """Return the text content of ``html_text`` with the markup parsed away.

    Parsing is delegated to BeautifulSoup's ``html.parser`` backend; the
    result of ``get_text()`` is returned without further processing.
    """
    return BeautifulSoup(html_text, 'html.parser').get_text()

def clean_text(text):
    """Strip tag-like fragments and normalise whitespace.

    Values that ``pd.isna`` reports as missing become ``""``. Otherwise every
    ``<...>`` fragment is deleted, each run of whitespace is collapsed to a
    single space, and leading/trailing whitespace is trimmed.
    """
    if pd.isna(text):
        return ""
    without_tags = re.sub(r'<[^>]+>', '', text)
    return re.sub(r'\s+', ' ', without_tags).strip()

def _label_salary_tokens(text, offsets, word_ids, min_salary, max_salary):
    """Assign a salary tag to every token position of one tokenised chunk.

    A position is tagged only when the slice of ``text`` it covers, reduced to
    digits and dots, is a plain number equal (within 1e-3) to ``min_salary``
    or ``max_salary``. Everything else is ``"O"``.

    NOTE(review): the comparison is made per token piece, so a number that the
    tokenizer splits into several pieces (or that contains a thousands
    separator) presumably never matches -- confirm whether word-level matching
    was intended.
    """
    labels = []
    for offset, word_id in zip(offsets, word_ids):
        label = "O"
        # Positions with no source word or an empty (0, 0) span carry no text.
        if word_id is not None and list(offset) != [0, 0]:
            digits = re.sub(r'[^\d.]', '', text[offset[0]:offset[1]])
            if re.fullmatch(r'\d+(\.\d+)?', digits):  # e.g. "123", "12.5"
                value = float(digits)
                if abs(value - min_salary) < 1e-3:
                    label = "B-SALARY"
                elif abs(value - max_salary) < 1e-3:
                    # The maximum continues a span only when it directly
                    # follows the token tagged as the minimum.
                    follows_begin = bool(labels) and labels[-1] == "B-SALARY"
                    label = "I-SALARY" if follows_begin else "B-SALARY"
        labels.append(label)
    return labels


def chunk_and_align(text, min_salary, max_salary, tokenizer, max_length=512, stride=128):
    """Tokenise ``text`` into overlapping chunks, tag salary tokens and embed them.

    Relies on the module-level ``bert_model`` and ``device``.

    Args:
        text: Cleaned job-ad text.
        min_salary: Lower salary amount to look for among the tokens.
        max_salary: Upper salary amount to look for among the tokens.
        tokenizer: Tokenizer that supports ``return_offsets_mapping``,
            ``return_overflowing_tokens`` and ``word_ids`` on its output.
        max_length: Maximum number of token positions per chunk.
        stride: ``stride`` value forwarded to the tokenizer for the
            overflowing chunks.

    Returns:
        ``(tokens_all, labels_all, embeddings_all)``: three parallel lists
        with one entry per chunk -- the token strings, their tags, and a CPU
        tensor holding one ``last_hidden_state`` row per token.

        Padding positions (attention mask 0) are removed from all three, so an
        entry has the chunk's real length rather than always ``max_length``.
        Previously the pad positions were returned as ordinary "O" tokens and
        ended up in the training sequences.
    """
    encoded = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_tensors="pt",
        padding="max_length",
    )

    # These two entries are tokenizer bookkeeping; they are removed so that
    # only model inputs remain in ``encoded``.
    encoded.pop("overflow_to_sample_mapping")
    offset_mappings = encoded.pop("offset_mapping")
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']

    tokens_all, labels_all, embeddings_all = [], [], []
    for i, (chunk_input_ids, chunk_mask) in enumerate(zip(input_ids, attention_mask)):
        chunk_tokens = tokenizer.convert_ids_to_tokens(chunk_input_ids)
        chunk_labels = _label_salary_tokens(
            text,
            offset_mappings[i].tolist(),
            encoded.word_ids(i),
            min_salary,
            max_salary,
        )

        with torch.no_grad():
            outputs = bert_model(
                input_ids=chunk_input_ids.unsqueeze(0).to(device),
                attention_mask=chunk_mask.unsqueeze(0).to(device),
            )
            embeddings = outputs.last_hidden_state.squeeze(0).cpu()

        # Keep only real tokens; selecting by mask works for either padding side.
        keep = chunk_mask.bool()
        keep_flags = keep.tolist()
        tokens_all.append([tok for tok, k in zip(chunk_tokens, keep_flags) if k])
        labels_all.append([lab for lab, k in zip(chunk_labels, keep_flags) if k])
        embeddings_all.append(embeddings[keep])

    return tokens_all, labels_all, embeddings_all

def log_sum_exp(vec):
    """Numerically stable ``log(sum(exp(vec)))`` along dimension 1.

    Args:
        vec: Tensor of shape ``(rows, n)``; the CRF code passes ``(1, n)``.

    Returns:
        Tensor of shape ``(rows,)``.

    ``torch.logsumexp`` replaces the hand-rolled max-shift version, which
    produced NaN when every entry was ``-inf`` (``-inf - -inf``).
    """
    return torch.logsumexp(vec, dim=1)

def prepare_sequence(embeds):
    """Insert a singleton second dimension: ``(steps, ...)`` -> ``(steps, 1, -1)``.

    The remaining dimensions are flattened into the last one by ``view``.
    """
    steps = len(embeds)
    return embeds.view(steps, 1, -1)

# === BiLSTM+CRF Model ===
class BiLSTM_CRF(nn.Module):
    """Bidirectional LSTM encoder with a linear-chain CRF on top.

    The model consumes pre-computed embeddings of shape
    ``(seq_len, 1, embedding_dim)`` (one sequence at a time) and scores tag
    sequences with per-token emission scores plus a learned transition matrix.

    Every helper tensor is created on the device of the incoming features, so
    the model also works after ``.to("cuda")``; previously these tensors were
    always allocated on the CPU.
    """

    def __init__(self, embedding_dim, hidden_dim, tag_to_ix):
        """
        Args:
            embedding_dim: Size of each input embedding vector.
            hidden_dim: Combined size of both LSTM directions
                (``hidden_dim // 2`` per direction).
            tag_to_ix: Mapping from tag name to index; must contain
                ``START_TAG`` and ``STOP_TAG``.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True)
        # Projects each LSTM output to one emission score per tag.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # transitions[i, j] is the score of moving *to* tag i *from* tag j.
        self.transitions = nn.Parameter(torch.randn(self.tagset_size, self.tagset_size))
        # Nothing may transition into START, and nothing may leave STOP.
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        self.hidden = self.init_hidden()

    def init_hidden(self, device=None):
        """Return a zero ``(h0, c0)`` pair for the BiLSTM.

        Zeros replace the former random initial state so that the same input
        always yields the same features and decoded path.

        Args:
            device: Device for the state tensors; defaults to the device of
                the model's parameters.
        """
        if device is None:
            device = self.transitions.device
        shape = (2, 1, self.hidden_dim // 2)
        dtype = self.transitions.dtype
        return (torch.zeros(shape, device=device, dtype=dtype),
                torch.zeros(shape, device=device, dtype=dtype))

    def _forward_alg(self, feats):
        """Log partition function over all tag paths; returns shape ``(1,)``."""
        start_ix = self.tag_to_ix[START_TAG]
        stop_ix = self.tag_to_ix[STOP_TAG]

        # All probability mass starts on START_TAG.
        forward_var = feats.new_full((1, self.tagset_size), -10000.)
        forward_var[0, start_ix] = 0.

        for feat in feats:
            # edge_scores[next, prev] = alpha[prev] + trans[next, prev] + emit[next]
            edge_scores = forward_var + self.transitions + feat.view(-1, 1)
            forward_var = torch.logsumexp(edge_scores, dim=1).view(1, -1)

        terminal_var = forward_var + self.transitions[stop_ix]
        return torch.logsumexp(terminal_var, dim=1)

    def _get_lstm_features(self, embeds):
        """Run the BiLSTM and project to emission scores ``(seq_len, n_tags)``."""
        self.hidden = self.init_hidden(embeds.device)
        lstm_out, _ = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(embeds), self.hidden_dim)
        return self.hidden2tag(lstm_out)

    def _score_sentence(self, feats, tags):
        """Score of the given tag sequence; returns shape ``(1,)``."""
        start_ix = self.tag_to_ix[START_TAG]
        stop_ix = self.tag_to_ix[STOP_TAG]

        tags = tags.to(feats.device)
        start = torch.tensor([start_ix], dtype=torch.long, device=feats.device)
        tags = torch.cat([start, tags])

        score = feats.new_zeros(1)
        for i, feat in enumerate(feats):
            # Out-of-place addition keeps the autograd graph free of in-place
            # updates on the accumulator.
            score = score + self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[stop_ix, tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Best-scoring tag path.

        Returns:
            ``(path_score, best_path)`` where ``best_path`` is a list of
            0-dim index tensors, one per row of ``feats``.
        """
        start_ix = self.tag_to_ix[START_TAG]
        stop_ix = self.tag_to_ix[STOP_TAG]

        forward_var = feats.new_full((1, self.tagset_size), -10000.)
        forward_var[0, start_ix] = 0

        backpointers = []
        for feat in feats:
            # candidate[next, prev]; emissions are added after the max because
            # they do not depend on the previous tag.
            candidate = forward_var + self.transitions
            best_scores, best_prev = candidate.max(dim=1)
            backpointers.append(best_prev)
            forward_var = (best_scores + feat).view(1, -1)

        terminal_var = forward_var + self.transitions[stop_ix]
        best_tag_id = torch.argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Walk the back-pointers from the last step to the first.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # The walk ends on the implicit START tag, which callers do not want.
        start = best_path.pop()
        assert int(start) == start_ix  # internal sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, embeds, tags):
        """CRF training loss: log-partition minus gold-path score, shape ``(1,)``."""
        feats = self._get_lstm_features(embeds)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, embeds):
        """Decode ``embeds``; returns ``(score, tag_seq)`` from Viterbi."""
        lstm_feats = self._get_lstm_features(embeds)
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

# === Setup ===
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Nation key -> currency code.
nation_currency = {
    "PH": "PHP",
    "NZ": "NZD",
    "AUS": "AUD",
    "HK": "HKD",
    "ID": "IDR",
    "MY": "MYR",
    "SG": "SGD",
    "TH": "THB",
}

# Multilingual BERT is used as a frozen feature extractor: evaluation mode,
# and no parameter receives gradients.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
bert_model = BertModel.from_pretrained("bert-base-multilingual-cased")
bert_model = bert_model.to(device).eval()
for param in bert_model.parameters():
    param.requires_grad = False

# === Data Preprocessing ===
dev_data = pd.read_csv("/Users/eddiezhang/Downloads/job_data_files/salary_labelled_development_set.csv")
# NOTE(review): this nation-derived 'currency' is overwritten a few lines below
# by the currency parsed from the label -- confirm which one is wanted.
dev_data['currency'] = dev_data.iloc[:, 3].map(nation_currency)
dev_data['parsed'] = dev_data.iloc[:, 5].apply(parse_actual_info)

# Expand the parsed tuples into four columns. Rows without a usable label are
# given an explicit placeholder so that every row has four fields; building the
# frame straight from a list that starts with None does not produce four
# columns and makes the assignment fail.
_missing_label = (np.nan, np.nan, None, None)
_label_columns = ['min_salary', 'max_salary', 'currency', 'unit']
_parsed_rows = [
    p if isinstance(p, tuple) else _missing_label
    for p in dev_data['parsed']
]
dev_data[_label_columns] = pd.DataFrame(
    _parsed_rows, columns=_label_columns, index=dev_data.index
)

# Strip HTML with BeautifulSoup first, then remove leftover tags/whitespace.
dev_data['cleaned_ad_details'] = dev_data['job_ad_details'].astype(str).apply(clean_html_tags).apply(clean_text)

# === Build Training Set ===
# Tag vocabulary: the labels in LABELS order, then the START/STOP sentinels.
tag_to_ix = {
    tag: position
    for position, tag in enumerate([*LABELS, START_TAG, STOP_TAG])
}

X_train, Y_train = [], []

for _, row in dev_data.iterrows():
    parsed = row['parsed']
    # Rows without a parsed label contribute no training data.
    if not parsed:
        continue

    _, label_chunks, embed_chunks = chunk_and_align(
        row['cleaned_ad_details'], parsed[0], parsed[1], tokenizer
    )

    for labels, embeddings in zip(label_chunks, embed_chunks):
        # Keep a chunk only when it has exactly one label per embedding row.
        if len(labels) == embeddings.shape[0]:
            X_train.append(embeddings)
            Y_train.append(
                torch.tensor([tag_to_ix[lbl] for lbl in labels], dtype=torch.long)
            )

# === Train Model ===
model = BiLSTM_CRF(embedding_dim=768, hidden_dim=128, tag_to_ix=tag_to_ix)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# One optimiser step per chunk; the summed loss is reported after each epoch.
for epoch in range(10):
    total_loss = 0.0
    for embeddings, gold_tags in zip(X_train, Y_train):
        model.zero_grad()
        loss = model.neg_log_likelihood(
            prepare_sequence(embeddings).to(device),
            gold_tags.to(device),
        )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")


In [None]:
def extract_span_from_tags(tokens, tags):
    """Join the tokens of a tagged salary span into a string.

    Scanning stops at the first tag that is neither ``B-SALARY`` nor
    ``I-SALARY`` once a span has been opened. A ``B-SALARY`` seen while a span
    is open restarts the span at that token. Returns ``""`` (after stripping)
    when no ``B-SALARY`` tag is found.

    Uses the module-level ``tokenizer`` to turn the tokens back into text.
    """
    collected = []
    for token, tag in zip(tokens, tags):
        if tag == "B-SALARY":
            # Opens a span, or restarts the one currently open.
            collected = [token]
            continue
        if not collected:
            # Nothing opened yet: ignore everything until the first B-SALARY.
            continue
        if tag != "I-SALARY":
            break
        collected.append(token)
    return tokenizer.convert_tokens_to_string(collected).strip()

def evaluate_on_test_set(test_csv_path):
    """Decode every labelled test ad and print per-row results plus accuracy.

    Rows whose label does not parse are skipped. For each remaining row the
    text is chunked and embedded, each chunk is decoded with the module-level
    ``model``, and the first chunk that yields a non-empty span supplies the
    prediction. A row counts as correct when the prediction (spaces removed,
    lower-cased) is a substring of the equally normalised ``y_true``.

    NOTE(review): the currency/unit part of the prediction is copied from
    ``y_true`` rather than predicted, and the labels computed by
    ``chunk_and_align`` are discarded here.

    Args:
        test_csv_path: CSV with ``job_id``, ``job_ad_details`` and ``y_true``
            columns; the label string is read from the sixth column.
    """
    test_data = pd.read_csv(test_csv_path)
    test_data['parsed'] = test_data.iloc[:, 5].apply(parse_actual_info)
    test_data['cleaned_ad_details'] = (
        test_data['job_ad_details'].astype(str).apply(clean_html_tags).apply(clean_text)
    )

    # Inverse tag lookup; it is the same for every row, so build it once.
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}
    correct = total = 0

    print("\n=== Test Set Evaluation ===\n")
    for _, row in test_data.iterrows():
        job_id = row['job_id']
        job_text = row['cleaned_ad_details']
        y_true = row['y_true']
        parsed = row['parsed']
        if not parsed:
            continue

        token_chunks, _, embed_chunks = chunk_and_align(
            job_text, parsed[0], parsed[1], tokenizer
        )

        best_prediction = "NONE"
        for tokens, embeddings in zip(token_chunks, embed_chunks):
            with torch.no_grad():
                _, pred_ids = model(prepare_sequence(embeddings).to(device))
            pred_tags = [ix_to_tag[i.item()] for i in pred_ids]
            pred_span = extract_span_from_tags(tokens, pred_tags)
            if not pred_span:
                continue
            # First chunk with a span wins; later chunks are not decoded.
            y_parts = y_true.split('-')
            if len(y_parts) == 4:
                currency_unit = f"{y_parts[2]}-{y_parts[3]}"
            else:
                currency_unit = "UNKNOWN-UNKNOWN"
            best_prediction = f"{pred_span}-{currency_unit}"
            break

        match = (best_prediction.replace(" ", "").lower() in y_true.replace(" ", "").lower())
        status = "✅" if match else "❌"
        print(f"[{status}] Job ID {job_id} | Predicted: '{best_prediction}' | Expected: '{y_true}'")

        total += 1
        correct += int(match)

    accuracy = correct / total if total > 0 else 0.0
    print(f"\nAccuracy: {correct}/{total} = {accuracy:.2%}\n")


# Run Evaluation
# Executes as soon as this cell runs and prints its report to stdout.
# NOTE(review): the test-set location is a machine-specific absolute path.
evaluate_on_test_set("/Users/eddiezhang/Downloads/job_data_files/salary_labelled_test_set.csv")

In [None]:
# import pandas as pd
# import numpy as np
# import torch
# import re
# from bs4 import BeautifulSoup
# from transformers import BertTokenizerFast, BertModel
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import LabelEncoder

# # === Helper Functions === 
# def parse_actual_info(info_str):
#     parts = info_str.split('-')
#     if len(parts) != 4 or parts == ['0', '0', 'None', 'None']:
#         return None
#     return (float(parts[0]), float(parts[1]), parts[2], parts[3].lower())

# # Removes html tags using BeautifulSoup
# def clean_html_tags(html_text):
#     soup = BeautifulSoup(html_text, 'html.parser')
#     return soup.get_text()

# # Cleans text (removing html tags that may have been missed, reducing whitespaces)
# def clean_text(text):
#     if pd.isna(text):
#         return ""
#     cleaned = re.sub(r'<[^>]+>', '', text)           
#     cleaned = re.sub(r'\s+', ' ', cleaned).strip()   
#     return cleaned

# def get_aligned_tokens_and_labels(text, min_salary, max_salary):
#     inputs = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=4096, return_tensors="pt")
#     offsets = inputs['offset_mapping'][0].tolist()
#     input_ids = inputs['input_ids'][0]
#     tokens = tokenizer.convert_ids_to_tokens(input_ids)
#     word_ids = inputs.word_ids()
    
#     labels = []
#     for token, offset, word_id in zip(tokens, offsets, word_ids):
#         if word_id is None or offset == [0, 0]:
#             labels.append("O")
#             continue
#         word = text[offset[0]:offset[1]]
#         try:
#             value = float(re.sub(r'[^\d.]', '', word))
#             if min_salary <= value <= max_salary:
#                 if labels and labels[-1] in ["B-SALARY", "I-SALARY"]:
#                     labels.append("I-SALARY")
#                 else:
#                     labels.append("B-SALARY")
#             else:
#                 labels.append("O")
#         except:
#             labels.append("O")
#     return tokens, labels, inputs

# def get_token_embeddings_from_inputs(inputs, model, device):
#     """
#     Returns token-level contextual embeddings from a model given tokenizer outputs.
#     Strips keys that the model does not use (e.g., offset_mapping).
#     """
#     inputs_model = {k: v.to(device) for k, v in inputs.items() if k != 'offset_mapping'}
#     with torch.no_grad():
#         outputs = model(**inputs_model)
#         embeddings = outputs.last_hidden_state.squeeze(0)
#     return embeddings

# def extract_span(tokens, labels):
#     span_tokens = []
#     inside = False
#     for token, label in zip(tokens, labels):
#         if label == "B-SALARY":
#             span_tokens = [token]
#             inside = True
#         elif label == "I-SALARY" and inside:
#             span_tokens.append(token)
#         elif inside:
#             break
#     return tokenizer.convert_tokens_to_string(span_tokens)

# # === Setup ===

# device = torch.device("cpu")

# nation_currency = {
#     "PH": "PHP", 
#     "NZ": "NZD", 
#     "AUS": "AUD", 
#     "HK": "HKD",
#     "ID": "IDR", 
#     "MY": "MYR", 
#     "SG": "SGD", 
#     "TH": "THB"
# }

# # Load multilingual BERT model and tokenizer
# model_name = "bert-base-multilingual-cased"
# tokenizer = BertTokenizerFast.from_pretrained(model_name)
# model = BertModel.from_pretrained(model_name)
# model.eval()

# class BiLSTM_CRF(nn.Module):

#     def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
#         super(BiLSTM_CRF, self).__init__()
#         self.embedding_dim = embedding_dim
#         self.hidden_dim = hidden_dim
#         self.vocab_size = vocab_size
#         self.tag_to_ix = tag_to_ix 
#         self.tagset_size = len(tag_to_ix)

#         self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
#                             num_layers=1, bidirectional=True)

#         self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size) #TODO: Challenge: Why is this layer required? (DONE)
#         # Layer is required to transform the hidden states into a space where each dimension corresponds to a particular tag's emission score

#         # Matrix of transition parameters.  Entry i,j is the score of
#         # transitioning *to* i *from* j.
#         self.transitions = nn.Parameter(
#             torch.randn(self.tagset_size, self.tagset_size))

#         self.transitions.data[tag_to_ix[START_TAG], :] = -10000
#         self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

#         self.hidden = self.init_hidden()

#     def init_hidden(self):
#         return (torch.randn(2, 1, self.hidden_dim // 2),
#                 torch.randn(2, 1, self.hidden_dim // 2))

#     def _forward_alg(self, feats):
#         # Do the forward algorithm to compute the partition function
#         init_alphas = torch.full((1, self.tagset_size), -10000.)
#         # START_TAG has all of the score.
#         init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

#         # Wrap in a variable so that we will get automatic backprop
#         forward_var = init_alphas

#         # Iterate through the sentence
#         for feat in feats:
#             alphas_t = []  # The forward tensors at this timestep
#             for next_tag in range(self.tagset_size):
#                 # broadcast the emission score: it is the same regardless of
#                 # the previous tag
#                 emit_score = feat[next_tag].view(
#                     1, -1).expand(1, self.tagset_size)
#                 # the ith entry of trans_score is the score of transitioning to
#                 # next_tag from i
#                 trans_score = self.transitions[next_tag].view(1, -1)
#                 # The ith entry of next_tag_var is the value for the
#                 # edge (i -> next_tag) before we do log-sum-exp
#                 next_tag_var = forward_var + trans_score + emit_score
#                 # The forward variable for this tag is log-sum-exp of all the
#                 # scores.
#                 alphas_t.append(log_sum_exp(next_tag_var).view(1))
#             forward_var = torch.cat(alphas_t).view(1, -1)
#         terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
#         alpha = log_sum_exp(terminal_var)
#         return alpha

#     def _get_lstm_features(self, sentence):
#         self.hidden = self.init_hidden()
#         embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
#         lstm_out, self.hidden = self.lstm(embeds, self.hidden)
#         lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
#         lstm_feats = self.hidden2tag(lstm_out) 
#         return lstm_feats

#     def _score_sentence(self, feats, tags):
#         # Gives the score of a provided tag sequence
#         score = torch.zeros(1)
#         tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
#         for i, feat in enumerate(feats):
#             score = score + \
#                 self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
#         score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
#         return score

#     def _viterbi_decode(self, feats):
#         backpointers = []

#         # Initialize the viterbi variables in log space
#         init_vvars = torch.full((1, self.tagset_size), -10000.)
#         init_vvars[0][self.tag_to_ix[START_TAG]] = 0

#         # forward_var at step i holds the viterbi variables for step i-1
#         forward_var = init_vvars
#         for feat in feats:
#             bptrs_t = []  # holds the backpointers for this step
#             viterbivars_t = []  # holds the viterbi variables for this step

#             for next_tag in range(self.tagset_size):
#                 # next_tag_var[i] holds the viterbi variable for tag i at the
#                 # previous step, plus the score of transitioning
#                 # from tag i to next_tag.
#                 # We don't include the emission scores here because the max
#                 # does not depend on them (we add them in below)
#                 next_tag_var = forward_var + self.transitions[next_tag]
#                 best_tag_id = argmax(next_tag_var)
#                 bptrs_t.append(best_tag_id)
#                 viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
#             # Now add in the emission scores, and assign forward_var to the set
#             # of viterbi variables we just computed
#             forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
#             backpointers.append(bptrs_t)

#         # Transition to STOP_TAG
#         terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
#         best_tag_id = argmax(terminal_var)
#         path_score = terminal_var[0][best_tag_id]

#         # Follow the back pointers to decode the best path.
#         best_path = [best_tag_id]
#         for bptrs_t in reversed(backpointers):
#             best_tag_id = bptrs_t[best_tag_id]
#             best_path.append(best_tag_id)
#         # Pop off the start tag (we don't want to return that to the caller)
#         start = best_path.pop()
#         assert start == self.tag_to_ix[START_TAG]  # Sanity check
#         best_path.reverse()
#         return path_score, best_path

#     def neg_log_likelihood(self, sentence, tags):
#         feats = self._get_lstm_features(sentence)
#         forward_score = self._forward_alg(feats)
#         gold_score = self._score_sentence(feats, tags)
#         return forward_score - gold_score

#     def forward(self, sentence):  # don't confuse this with _forward_alg above.
#         lstm_feats = self._get_lstm_features(sentence)
#         score, tag_seq = self._viterbi_decode(lstm_feats)
#         return score, tag_seq

# # === Preprocessing ===

# dev_data = pd.read_csv('/Users/eddiezhang/Downloads/job_data_files/salary_labelled_development_set.csv')

# # Parse expected salary value for easier comparison
# dev_data['currency'] = dev_data.iloc[:, 3].map(nation_currency)
# dev_data['parsed'] = dev_data.iloc[:, 5].apply(parse_actual_info)
# dev_data[['min_salary', 'max_salary', 'currency', 'unit']] = pd.DataFrame(dev_data['parsed'].tolist(), index=dev_data.index)

# # Cleaning job ad details to remove html tags and additional whitespaces
# dev_data['cleaned_ad_details'] = dev_data['job_ad_details'].astype(str).apply(clean_html_tags).apply(clean_text)

# # === Training ===

# train_embeddings, train_labels = [], []

# for idx, row in dev_data.iterrows():
#     job_text = row.iloc[2]
#     min_salary = row['min_salary']
#     max_salary = row['max_salary']

#     tokens, labels, inputs = get_aligned_tokens_and_labels(job_text, min_salary, max_salary)

#     if "B-SALARY" not in labels:
#         continue

#     embeddings = get_token_embeddings_from_inputs(inputs, model, device)

#     if len(labels) != embeddings.shape[0]:
#         print(f"[SKIP {idx}] Mismatch: {len(labels)} labels vs {embeddings.shape[0]} embeddings")
#         continue

#     train_embeddings.extend(embeddings.numpy())
#     train_labels.extend(labels)

# train_embeddings = np.array(train_embeddings)
# label_encoder = LabelEncoder()
# encoded_train_labels = label_encoder.fit_transform(train_labels)

# clf = LogisticRegression(max_iter=1000)
# clf.fit(train_embeddings, encoded_train_labels)



In [None]:
# import pandas as pd
# import numpy as np
# import torch
# import re
# from transformers import LongformerTokenizerFast, LongformerModel
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import LabelEncoder
# from sklearn.metrics import classification_report

# # === Helper Functions ===

# def get_aligned_tokens_and_labels(text, min_salary, max_salary):
#     inputs = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=4096, return_tensors="pt")
#     offsets = inputs['offset_mapping'][0].tolist()
#     input_ids = inputs['input_ids'][0]
#     tokens = tokenizer.convert_ids_to_tokens(input_ids)
#     word_ids = inputs.word_ids()
    
#     labels = []
#     for token, offset, word_id in zip(tokens, offsets, word_ids):
#         if word_id is None or offset == [0, 0]:
#             labels.append("O")
#             continue
#         word = text[offset[0]:offset[1]]
#         try:
#             value = float(re.sub(r'[^\d.]', '', word))
#             if min_salary <= value <= max_salary:
#                 if labels and labels[-1] in ["B-SALARY", "I-SALARY"]:
#                     labels.append("I-SALARY")
#                 else:
#                     labels.append("B-SALARY")
#             else:
#                 labels.append("O")
#         except:
#             labels.append("O")
#     return tokens, labels, inputs

# def get_token_embeddings_from_inputs(inputs, model, device):
#     """
#     Returns token-level contextual embeddings from a model given tokenizer outputs.
#     Strips keys that the model does not use (e.g., offset_mapping).
#     """
#     inputs_model = {k: v.to(device) for k, v in inputs.items() if k != 'offset_mapping'}
#     with torch.no_grad():
#         outputs = model(**inputs_model)
#         embeddings = outputs.last_hidden_state.squeeze(0)
#     return embeddings

# def extract_span(tokens, labels):
#     span_tokens = []
#     inside = False
#     for token, label in zip(tokens, labels):
#         if label == "B-SALARY":
#             span_tokens = [token]
#             inside = True
#         elif label == "I-SALARY" and inside:
#             span_tokens.append(token)
#         elif inside:
#             break
#     return tokenizer.convert_tokens_to_string(span_tokens)

# # === Device setup ===
# device = torch.device("cpu")

# # === Load datasets ===
# dev_data = pd.read_csv('/Users/eddiezhang/Downloads/job_data_files/salary_labelled_development_set.csv')
# test_data = pd.read_csv('/Users/eddiezhang/Downloads/job_data_files/salary_labelled_test_set.csv')

# # === Currency map ===
# nation_currency = {
#     "PH": "PHP", "NZ": "NZD", "AUS": "AUD", "HK": "HKD",
#     "ID": "IDR", "MY": "MYR", "SG": "SGD", "TH": "THB"
# }
# dev_data['currency'] = dev_data.iloc[:, 3].map(nation_currency)
# test_data['currency'] = test_data.iloc[:, 3].map(nation_currency)

# # === Parse & clean ===
# dev_data['parsed'] = dev_data.iloc[:, 5].apply(parse_actual_info)
# test_data['parsed'] = test_data.iloc[:, 5].apply(parse_actual_info)
# dev_data[['min_salary', 'max_salary', 'currency', 'unit']] = pd.DataFrame(dev_data['parsed'].tolist(), index=dev_data.index)
# test_data[['min_salary', 'max_salary', 'currency', 'unit']] = pd.DataFrame(test_data['parsed'].tolist(), index=test_data.index)

# # === Load Longformer ===
# tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
# model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
# model.to(device)
# model.eval()

# # === Training ===
# train_embeddings, train_labels = [], []

# for idx, row in dev_data.iterrows():
#     job_text = row.iloc[2]
#     min_salary = row['min_salary']
#     max_salary = row['max_salary']

#     tokens, labels, inputs = get_aligned_tokens_and_labels(job_text, min_salary, max_salary)

#     if "B-SALARY" not in labels:
#         continue

#     embeddings = get_token_embeddings_from_inputs(inputs, model, device)

#     if len(labels) != embeddings.shape[0]:
#         print(f"[SKIP {idx}] Mismatch: {len(labels)} labels vs {embeddings.shape[0]} embeddings")
#         continue

#     train_embeddings.extend(embeddings.numpy())
#     train_labels.extend(labels)

# train_embeddings = np.array(train_embeddings)
# label_encoder = LabelEncoder()
# encoded_train_labels = label_encoder.fit_transform(train_labels)

# clf = LogisticRegression(max_iter=1000)
# clf.fit(train_embeddings, encoded_train_labels)

# # === Testing ===
# test_embeddings, test_labels = [], []

# for idx, row in test_data.iterrows():
#     job_text = row.iloc[2]
#     min_salary = row['min_salary']
#     max_salary = row['max_salary']

#     tokens, labels, inputs = get_aligned_tokens_and_labels(job_text, min_salary, max_salary)

#     if "B-SALARY" not in labels:
#         continue

#     embeddings = get_token_embeddings_from_inputs(inputs, model, device)

#     if len(labels) != embeddings.shape[0]:
#         continue

#     test_embeddings.extend(embeddings.numpy())
#     test_labels.extend(labels)

# test_embeddings = np.array(test_embeddings)
# encoded_test_labels = label_encoder.transform(test_labels)
# test_preds = clf.predict(test_embeddings)

# # === Evaluation ===
# print(classification_report(encoded_test_labels, test_preds, target_names=label_encoder.classes_))
