In [None]:
import sys
sys.path.append("../") 

from src.dataprep import transformations
import pandas as pd
from jobtools.arguments import ParamsNamespace
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

In [2]:
# Input locations, written relative to the directory this notebook runs from.
params_file = "../src/bert_classifier.params.yml"  # run parameters (YAML)
data_path = "../data/JobLevelData.xlsx"  # labelled job titles (Excel workbook)

In [3]:
# Run parameters come from the YAML file; the feature column names live under
# its `data.features` entry.
params = ParamsNamespace.load(params_file)
feature_columns = params.data.features

# Raw spreadsheet contents; the cleaning steps follow in the next cell.
data_frame = pd.read_excel(data_path)

In [None]:
# Project helpers: presumably drop rows whose "Column 1" is empty and then
# lower-case the text - confirm against src/dataprep/transformations.
data_frame = transformations.remove_empty_rows(data_frame, "Column 1")
data_frame = transformations.set_low_register(data_frame)

# Replace every remaining missing cell with an empty string so label columns
# hold plain strings only.
data_frame = data_frame.fillna("")

# Collect the distinct values that occur in the label columns.
# NOTE(review): only Columns 1-3 contribute here, while the encoding cell also
# reads "Column 4"; a value that appears only in "Column 4" would never become
# a label - confirm this is intended.
label_source_columns = ["Column 1", "Column 2", "Column 3"]
all_labels = set()
for column_name in label_source_columns:
    all_labels.update(data_frame[column_name].tolist())
# all_labels = transformations.combine_columns(data_frame, feature_columns)

In [5]:

all_labels.discard("")  # Remove empty labels

# Freeze the label order once. Iterating the set directly gives an order that
# can change between kernel restarts (string hashing is randomised per
# process), which would silently permute the target columns and invalidate any
# saved model or hand-written label vector. `key=str` keeps the sort total even
# if a non-string value slipped into the label columns.
label_order = sorted(all_labels, key=str)

# Create a multi-hot encoded label vector (a row may carry several labels)
def encode_labels(row):
    """Return a 0/1 list with one entry per label in ``label_order``.

    Args:
        row: one DataFrame row (a Series) holding the label columns.

    Returns:
        list[int]: 1 where the label occurs anywhere in ``row``, else 0,
        in the fixed order given by ``label_order``.
    """
    present = set(row.values)  # hash lookup instead of scanning the row per label
    return [1 if label in present else 0 for label in label_order]

data_frame["Features"] = data_frame[["Column 1", "Column 2", "Column 3", "Column 4"]].apply(encode_labels, axis=1)

# Keep only the model input (Title) and the encoded target (Features)
df = data_frame[["Title", "Features"]]

# Convert to list format (these are reassigned after the train/valid/test split)
train_texts = data_frame["Title"].tolist()
train_labels = data_frame["Features"].tolist()

In [6]:
# Use the MPS backend when PyTorch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [7]:
# Convert to PyTorch dataset
# Convert to PyTorch dataset
class JobDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with multi-label targets.

    Labels are kept on the CPU: the dataset no longer depends on a notebook
    level ``device`` variable, and the training loop is left to move each
    batch to the right device. Holding accelerator tensors inside a dataset
    interferes with DataLoader memory pinning and worker processes.

    Args:
        encodings: mapping from field name to a per-example sequence, indexed
            as ``encodings[key][idx]`` (e.g. the tokenizer output).
        labels: one 0/1 vector per example.

    Raises:
        ValueError: if any encoding field has a different number of examples
            than ``labels``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Multi-label targets are stored as float32.
        self.labels = torch.tensor(labels, dtype=torch.float32)

        # Fail early on misaligned inputs instead of raising an IndexError (or
        # silently dropping examples) in the middle of training.
        for key, values in self.encodings.items():
            if len(values) != len(self.labels):
                raise ValueError(
                    f"Encoding field '{key}' has {len(values)} examples "
                    f"but {len(self.labels)} labels were given"
                )

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors plus ``"labels"``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

In [8]:
# 80% of the rows go to training; the held-out 20% is then halved into
# validation and test. Both splits use the same fixed seed.
split_seed = 42
train_df, temp_df = train_test_split(df, random_state=split_seed, test_size=0.2)
valid_df, test_df = train_test_split(temp_df, random_state=split_seed, test_size=0.5)

In [9]:
# Plain Python lists per split: titles are the model input ...
train_texts, valid_texts, test_texts = (
    split["Title"].tolist() for split in (train_df, valid_df, test_df)
)

# ... and the encoded label vectors are the targets.
train_labels, valid_labels, test_labels = (
    split["Features"].tolist() for split in (train_df, valid_df, test_df)
)

In [10]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(texts):
    """Tokenize ``texts`` with the notebook tokenizer.

    Every sequence is padded to, and truncated at, 64 tokens.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 64}
    return tokenizer(texts, **tokenizer_options)

# Encode each split with identical settings.
train_encodings, valid_encodings, test_encodings = (
    tokenize_function(split_texts)
    for split_texts in (train_texts, valid_texts, test_texts)
)

In [11]:
# Wrap each split's encodings and label vectors in a JobDataset.
train_dataset, valid_dataset, test_dataset = (
    JobDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (valid_encodings, valid_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
# One output unit per distinct label collected from the data.
num_labels = len(all_labels)

# Pretrained BERT with a classification head configured for the
# multi-label problem type.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    problem_type="multi_label_classification",
)

# Hyper-parameters for the fine-tuning run.
training_options = dict(
    output_dir="./results",
    evaluation_strategy="no",  # no evaluation during training
    save_strategy="epoch",  # checkpoint once per epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-5,
    num_train_epochs=7,
    weight_decay=0.05,
    logging_steps=10,
)
training_args = TrainingArguments(**training_options)

# The validation split is attached even though in-training evaluation is
# switched off by `evaluation_strategy="no"`.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [13]:
# Target probability for both hidden-state and attention dropout.
dropout_prob = 0.3

# Keep the config in sync so the value is saved with the checkpoint.
model.config.hidden_dropout_prob = dropout_prob
model.config.attention_probs_dropout_prob = dropout_prob

# The model was already instantiated, so its dropout layers were built from
# the original config values; editing the config alone does not change them.
# Push the new probability into the existing modules so it actually applies
# during training.
for module in model.modules():
    if isinstance(module, torch.nn.Dropout):
        module.p = dropout_prob
    elif hasattr(module, "dropout_prob"):
        # NOTE(review): some attention implementations appear to keep the
        # probability as a plain attribute instead of an nn.Dropout layer -
        # confirm for the installed transformers version.
        module.dropout_prob = dropout_prob

In [None]:
# Move the model to the device selected earlier (MPS when available, else CPU).
# As the last expression of the cell, the returned model is also displayed.
model.to(device)

In [None]:
# Fine-tune on train_dataset using the TrainingArguments configured above.
trainer.train()

In [None]:
# Evaluate on the held-out test split; the returned value is shown as the
# cell output.
trainer.evaluate(test_dataset)

In [None]:
def debug_predictions(model, tokenizer, texts):
    """Run ``model`` on ``texts`` and print each stage of the prediction.

    The model is switched to eval mode and moved to MPS when available
    (CPU otherwise). Raw logits, sigmoid probabilities and the 0/1 decisions
    at a 0.5 threshold are printed.

    Args:
        model: sequence-classification model whose output exposes ``.logits``.
        tokenizer: callable returning a batch that supports ``.to(device)``.
        texts: list of input strings.

    Returns:
        Integer array of 0/1 predictions, one row per input text.
    """
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    model.eval()
    model.to(device)

    batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits.to("cpu").numpy()
    probs = torch.sigmoid(torch.from_numpy(logits))  # independent per-label probabilities
    preds = probs.gt(0.5).int().numpy()  # strictly above 0.5 counts as positive

    print("Raw Logits:", logits)
    print("Probabilities:", probs.numpy())
    print("Binary Predictions:", preds)

    return preds

# Smoke-test the model on two hand-written titles; the printed logits,
# probabilities and binary predictions can be inspected by eye.
example_texts = ["Chief Information Officer", "VP of Engineering"]
debug_predictions(model, tokenizer, example_texts)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, log_loss, precision_score, recall_score, hamming_loss, jaccard_score
import numpy as np

def compute_metrics(predictions, true_labels):
    """
    Compute evaluation metrics for multi-label classification.

    Args:
        predictions: raw logits of shape (n_samples, n_labels); array-like or
            tensor.
        true_labels: 0/1 indicator matrix with the same shape as ``predictions``.

    Returns:
        dict mapping metric name to value. "Log Loss" is the mean binary
        cross-entropy over every (sample, label) pair.

    Raises:
        ValueError: if ``predictions`` and ``true_labels`` differ in shape.
    """
    # as_tensor avoids the copy (and warning) torch.tensor gives for tensor input
    logits = torch.as_tensor(predictions, dtype=torch.float32)

    # Convert logits to independent per-label probabilities
    probs = torch.sigmoid(logits)

    # Convert probabilities to binary labels (threshold=0.5)
    preds = (probs > 0.5).int().numpy()
    true_labels = np.asarray(true_labels)  # Ensure true labels are numpy array

    if true_labels.shape != preds.shape:
        raise ValueError(
            f"Shape mismatch: predictions {preds.shape} vs true_labels {true_labels.shape}"
        )

    # sklearn's log_loss treats each row of an indicator matrix as a single
    # categorical distribution, which is the wrong loss for independent
    # sigmoid outputs. Binary cross-entropy matches the multi-label setup and
    # is computed from the logits for numerical stability.
    targets = torch.as_tensor(true_labels, dtype=torch.float32)
    bce = torch.nn.functional.binary_cross_entropy_with_logits(logits, targets).item()

    # Compute metrics
    metrics = {
        "Accuracy (Subset)": accuracy_score(true_labels, preds),  # Subset accuracy (exact match)
        "Log Loss": bce,  # Mean binary cross-entropy (lower is better)
        "F1 Score (Macro)": f1_score(true_labels, preds, average="macro"),  # F1-score across all labels
        "F1 Score (Micro)": f1_score(true_labels, preds, average="micro"),
        "Precision (Macro)": precision_score(true_labels, preds, average="macro"),
        "Recall (Macro)": recall_score(true_labels, preds, average="macro"),
        "Hamming Loss": hamming_loss(true_labels, preds),  # Fraction of wrong label decisions
        "Jaccard Score (Macro)": jaccard_score(true_labels, preds, average="macro")
    }

    return metrics

In [None]:
example_titles = ["devops team leader", "human resources director & business partner"]

# Select the rows whose title is one of the examples, keeping only the title
# and its encoded label vector.
title_mask = data_frame["Title"].isin(example_titles)
real_labels = data_frame.loc[title_mask, ["Title", "Features"]]

# Show the ground-truth vectors for comparison with the model output
print(real_labels)

In [None]:
def predict_and_evaluate(model, tokenizer, texts, true_labels):
    """
    Score ``texts`` with the model and return the multi-label metrics.

    The model is moved to MPS when available (CPU otherwise) and put in eval
    mode. The resulting logits are handed to ``compute_metrics`` together with
    ``true_labels``.

    Args:
        model: sequence-classification model whose output exposes ``.logits``.
        tokenizer: callable returning a batch that supports ``.to(device)``.
        texts: list of input strings.
        true_labels: one 0/1 vector per text.

    Returns:
        The metrics dict produced by ``compute_metrics``.
    """
    use_mps = torch.backends.mps.is_available()
    device = torch.device("mps" if use_mps else "cpu")
    model.to(device)
    model.eval()  # inference mode

    # Tokenize and move the batch next to the model
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)

    # Forward pass without gradient tracking; bring the logits back to the CPU
    with torch.no_grad():
        cpu_logits = model(**batch).logits.to("cpu")

    return compute_metrics(cpu_logits.numpy(), true_labels)

# Example Usage
example_texts = ["devops team leader", "human resources director & business partner"]

# Look the ground-truth vectors up in the encoded data instead of hard-coding
# them: a hand-written vector can have the wrong width or label order for the
# current label set. When a title occurs more than once, the first row wins.
labels_by_title = data_frame.drop_duplicates(subset="Title").set_index("Title")["Features"]
missing_titles = [title for title in example_texts if title not in labels_by_title.index]
if missing_titles:
    raise KeyError(f"Titles not found in data_frame: {missing_titles}")
true_labels = [labels_by_title.loc[title] for title in example_texts]

metrics = predict_and_evaluate(model, tokenizer, example_texts, true_labels)
print(metrics)