In [1]:
import sys
sys.path.append("../") 

from src.dataprep import transformations, preprocess
import pandas as pd
from jobtools.arguments import ParamsNamespace
import torch
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from src.data.job_dataset import JobDataset
from src.train import trainer
from src.evaluation import evaluator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Inputs for this run, relative to the notebook's directory:
# the job-level spreadsheet and the BERT classifier parameter file.
data_path, params_file = (
    "../data/JobLevelData.xlsx",
    "../src/config/bert_classifier.params.yml",
)

In [3]:
# Read the raw spreadsheet and the run configuration from the paths set above.
data_frame = pd.read_excel(data_path)
params = ParamsNamespace.load(params_file)

# Run on the "mps" device when torch reports that backend as available,
# otherwise fall back to the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Tokenizer for the pretrained checkpoint named in the config
# (params.model.baseline.model_name).
tokenizer = BertTokenizer.from_pretrained(params.model.baseline.model_name)

In [4]:
# Clean the raw frame in a single expression: pass it through
# remove_empty_rows (keyed on "Column 1"), then set_low_register, and finally
# replace every remaining missing value with an empty string.
data_frame = transformations.set_low_register(
    transformations.remove_empty_rows(data_frame, "Column 1")
).fillna("")

# Build the label vocabulary from the configured label columns. The empty
# string is only the placeholder written by fillna("") above, so drop it.
features = transformations.get_unique_labels(data_frame, params.data.features)
features.discard("")

  return data_frame.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [5]:
# Collapse each row's label columns (params.data.features) into one "Features"
# value by calling transformations.encode_data with the label vocabulary.
data_frame["Features"] = data_frame[params.data.features].apply(
    lambda row: transformations.encode_data(row, features), axis=1
)

# Keep only the model input ("Title") and the encoded target ("Features").
# The per-split text/feature lists are built from train_df / valid_df /
# test_df after the split; building "train_*" lists here from the full,
# unsplit frame was dead code that would have mixed validation and test rows
# into anything that read it.
df = data_frame[["Title", "Features"]]

In [6]:
# Two-stage split with one shared seed so both stages are reproducible:
#   1. hold out params.data.temp_size of df as a temporary pool,
#   2. divide that pool into validation and test, where params.data.test_size
#      is applied to the pool (temp_df), not to the full frame.
SPLIT_SEED = 42
train_df, temp_df = train_test_split(
    df, test_size=params.data.temp_size, random_state=SPLIT_SEED
)
valid_df, test_df = train_test_split(
    temp_df, test_size=params.data.test_size, random_state=SPLIT_SEED
)

In [7]:
def _prepare_split(split_df):
    """Build the text list, feature list, encodings and dataset for one split.

    Reads the "Title" and "Features" columns of ``split_df``, tokenizes the
    titles with the notebook-level ``tokenizer`` and ``params``, and wraps the
    encodings and features in a ``JobDataset`` bound to ``device``.

    Returns:
        A ``(texts, features, encodings, dataset)`` tuple.
    """
    texts = split_df["Title"].tolist()
    labels = split_df["Features"].tolist()
    encodings = preprocess.tokenize_function(texts, tokenizer, params)
    return texts, labels, encodings, JobDataset(encodings, labels, device)


# Same four artefacts for each split, produced in train -> valid -> test order.
train_texts, train_features, train_encodings, train_dataset = _prepare_split(train_df)
valid_texts, valid_features, valid_encodings, valid_dataset = _prepare_split(valid_df)
test_texts, test_features, test_encodings, test_dataset = _prepare_split(test_df)

In [8]:
# Size of the classifier output: one slot per distinct label value.
num_features = len(features)
print(num_features)

# Bind the returned trainer object to its own name. Assigning it to `trainer`
# would overwrite the imported `src.train.trainer` module, so a second run of
# this cell (or any later `trainer.<function>` call) would fail with an
# AttributeError until the kernel was restarted.
# NOTE(review): the datasets are passed in the order (train, test, valid) —
# confirm this matches the parameter order of train_bert_classifier.
(model, bert_trainer) = trainer.train_bert_classifier(
    tokenizer, train_dataset, test_dataset, valid_dataset, num_features, device, params
)

6


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
10,0.668
20,0.5626
30,0.4998
40,0.4495
50,0.4255
60,0.3783
70,0.3649
80,0.3504
90,0.3329
100,0.3052


 Accuracy (Subset)  Log Loss  F1 Score (Macro)  F1 Score (Micro)  Precision (Macro)  Recall (Macro)  Hamming Loss  Jaccard Score (Macro)
          0.928144  0.495995          0.728287          0.932945           0.768346        0.696892      0.022954               0.655962


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Spot-check the trained model on two hand-written job titles.
# NOTE(review): the output saved with this cell is
# "TypeError: debug_predict_and_evaluate() takes 5 positional arguments but 6
# were given", yet only five arguments are passed here. The cell or the
# function signature was presumably edited after the last run — re-run on a
# fresh kernel and reconcile this call with the current signature.
example_texts = ["devops team leader", "human resources director & business partner"]
evaluator.debug_predict_and_evaluate(model, device, tokenizer, data_frame, example_texts)

TypeError: debug_predict_and_evaluate() takes 5 positional arguments but 6 were given