In [None]:
import pandas as pd
import re
import torch
from bs4 import BeautifulSoup
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

# Load dataset
df = pd.read_csv('../DATASETS/salary_labelled_development_set.csv', encoding='utf-8')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()
print(df.head())

# Inspect y_true column
print(df['y_true'].dropna().unique()[:10])
print(df['y_true'].describe())

# Analyze salary text length
# Measure only rows that actually have a label: astype(str) turns a missing
# value into the text "nan", which would otherwise be counted as a 3-character
# salary. Rows without a label get NaN through index alignment on assignment.
df['salary_length'] = df['y_true'].dropna().astype(str).str.len()
print(df['salary_length'].describe())

# Clean HTML tags from job ad text
def clean_html_tags(html_text):
    """Return the text content of *html_text* with the markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    result of ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(html_text, 'html.parser').get_text()

# Treat a missing ad as empty text: astype(str) alone would turn NaN into the
# literal string "nan", which would then be processed as if it were ad content.
df['cleaned_ad_details'] = df['job_ad_details'].fillna('').astype(str).apply(clean_html_tags)

# Load multilingual BERT model and tokenizer
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# eval() returns the module itself, so loading the weights and switching to
# inference mode can be written as one chained expression.
model = BertModel.from_pretrained(model_name).eval()

# Clean up text for rule-based extraction
def clean_text(text):
    """Strip tag-like markup from *text* and collapse its whitespace.

    Missing values (anything ``pd.isna`` reports as NA, e.g. ``None`` or NaN)
    yield an empty string. Otherwise every ``<...>`` span is deleted, each run
    of whitespace becomes a single space, and leading/trailing whitespace is
    trimmed.
    """
    if pd.isna(text):
        return ""
    tag_pattern = re.compile(r'<[^>]+>')
    whitespace_pattern = re.compile(r'\s+')
    without_tags = tag_pattern.sub('', text)
    return whitespace_pattern.sub(' ', without_tags).strip()

# Rule-based salary extraction
# An amount is either a comma-grouped number (e.g. 20,000) or a plain run of
# 3-6 digits. The lookarounds keep a match from starting or ending inside a
# longer number, so fragments of phone numbers or of "10,000" are never
# returned as a salary.
_SALARY_AMOUNT = r'(?<!\d)(?<!\d,)(?:\d{1,3}(?:,\d{3})+|\d{3,6})(?!\d)'
_SALARY_RANGE_RE = re.compile(
    r'(' + _SALARY_AMOUNT + r'\s*[-~～–—]\s*' + _SALARY_AMOUNT + r')'
)
_SALARY_SINGLE_RE = re.compile(r'(' + _SALARY_AMOUNT + r')')


def extract_salary_rule(text):
    """Return the first salary-looking number or range found in *text*.

    A range (two amounts separated by ``-``, ``~``, ``～``, ``–`` or ``—``,
    with optional whitespace around the separator) anywhere in the text takes
    precedence over a single amount. Thousands separators are removed from
    the result, so ``"20,000 - 30,000"`` yields ``"20000 - 30000"``; whitespace
    around the range separator is kept as written. Currency units following
    the amount are not part of the returned value.

    Args:
        text: Text to search. ``None``, non-string values and the empty
            string are treated as containing no salary.

    Returns:
        The matched amount or range as a string, or ``""`` when nothing
        matches.
    """
    if not isinstance(text, str) or not text:
        return ""
    # Order matters: try the range first so "3000-5000" is not reported as
    # just "3000".
    for pattern in (_SALARY_RANGE_RE, _SALARY_SINGLE_RE):
        match = pattern.search(text)
        if match:
            return match.group(1).replace(',', '').strip()
    return ""

# Get BERT embedding of a keyword (e.g. 'contact')
def get_keyword_embedding(keyword):
    """Embed *keyword* with the module-level tokenizer and model.

    The keyword is tokenized into PyTorch tensors, run through the model with
    gradient tracking disabled, and the last hidden state is averaged over
    dimension 1.

    Returns:
        The averaged hidden state (shape ``(1, hidden_dim)`` for a single
        keyword).
    """
    encoded = tokenizer(keyword, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
    return hidden.mean(dim=1)

# Extract potential contact info based on similarity to the keyword
def extract_contact_candidate(last_hidden_states, tokens, keyword, sim_threshold=0.5):
    """Concatenate the tokens whose embedding is similar to a keyword embedding.

    Args:
        last_hidden_states: Tensor of per-token hidden states. Its leading
            dimension is squeezed away, so a single sequence is expected
            (assumed shape (1, seq_len, hidden_dim) — TODO confirm with
            other callers).
        tokens: Token strings, indexed in the same order as the rows of
            ``last_hidden_states``.
        keyword: Text whose embedding, obtained from
            ``get_keyword_embedding``, is the comparison target.
        sim_threshold: A token is kept when its cosine similarity to the
            keyword embedding is strictly greater than this value.

    Returns:
        The selected tokens joined without a separator, in their original
        order; ``""`` when no token passes the threshold.
    """
    kw_vector = get_keyword_embedding(keyword)
    token_vectors = last_hidden_states.squeeze(0)
    # Tensor.numpy() rejects tensors that track gradients or live on an
    # accelerator; detach()/cpu() are no-ops for plain CPU tensors.
    token_matrix = token_vectors.detach().cpu().numpy()
    kw_matrix = kw_vector.detach().cpu().numpy()
    # Take the single keyword column explicitly: squeeze() would collapse a
    # one-token result to a 0-d array that cannot be iterated.
    sims = cosine_similarity(token_matrix, kw_matrix)[:, 0]
    # NOTE(review): tokens are joined verbatim; if they come from a WordPiece
    # tokenizer this keeps "##" continuation markers and any special tokens
    # that pass the threshold — confirm whether that is intended.
    return "".join(tokens[i] for i, sim in enumerate(sims) if sim > sim_threshold)

# Combine salary and contact info extraction
def extract_ad_info(ad_text):
    """Extract a salary string and contact-related tokens from one job ad.

    The ad is cleaned with ``clean_text``; the salary comes from
    ``extract_salary_rule`` on the full cleaned text. The cleaned text is then
    encoded with the module-level tokenizer and model, and
    ``extract_contact_candidate`` is run once for the keyword "联系方式" and
    once for "contact".

    Args:
        ad_text: Raw job-ad text.

    Returns:
        A dict with two keys:
        ``"salary_extracted"`` — the rule-based salary string ("" if none);
        ``"contact_candidate"`` — ``"<zh> / <en>"`` when at least one of the
        two keyword searches produced tokens, otherwise ``""``.
    """
    cleaned = clean_text(ad_text)
    salary_info = extract_salary_rule(cleaned)

    # The model can only embed a limited number of positions; without
    # truncation a long ad makes the forward pass fail. Only the embedding
    # step is truncated — the salary rule above already saw the full text.
    inputs = tokenizer(
        cleaned,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    # A single text was encoded, so row 0 holds its token ids.
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

    contact_candidate_zh = extract_contact_candidate(last_hidden_states, tokens, "联系方式", sim_threshold=0.5)
    contact_candidate_en = extract_contact_candidate(last_hidden_states, tokens, "contact", sim_threshold=0.5)
    # Both slots are always emitted when either is non-empty, so the position
    # around " / " tells the Chinese result from the English one.
    if contact_candidate_zh or contact_candidate_en:
        contact_candidate = contact_candidate_zh + " / " + contact_candidate_en
    else:
        contact_candidate = ""

    return {
        "salary_extracted": salary_info,
        "contact_candidate": contact_candidate
    }

# Apply to all job ads with progress bar
tqdm.pandas()
extracted_info = df['cleaned_ad_details'].progress_apply(extract_ad_info)

# Merge extracted info into original dataframe
# Build the new columns on the index of the extracted Series (which is df's
# index) so that the label-based concat below always pairs each ad with its
# own result, even if df does not carry the default 0..n-1 index.
info_df = pd.DataFrame(extracted_info.tolist(), index=extracted_info.index)
df = pd.concat([df, info_df], axis=1)
