<a href="https://colab.research.google.com/github/Datasci266-Final-Project/Job-Listings/blob/main/Analysis%20Notebooks/Combined_Model_Outputs_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Installs

!pip install pydot --quiet
!pip install transformers==4.37.2 --quiet
!pip install -U imbalanced-learn --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.0/258.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#@title Imports
import pandas as pd

import numpy as np
import random
import torch

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda, Dropout
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt
import seaborn as sns

import pickle

import re
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, TFBertModel, AutoTokenizer, TFXLMRobertaModel #"FacebookAI/xlm-roberta-base"

from transformers import logging
logging.set_verbosity_error()

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Seed Python's `random`, NumPy, PyTorch (CPU and CUDA) and TensorFlow with the same value (10)

seed_value = 10

for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
    tf.random.set_seed,
):
    seed_fn(seed_value)

In [None]:
# Mount Google Drive at /content/drive so the project folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/My Drive/DataSci 266 Project

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1V3QooLePiHR_DaZhbXQhsjmP1Ez5fv5F/DataSci 266 Project


In [None]:
# Label encodings: category name -> integer class id; a missing value (NaN) maps to -1.
level_key = {
    level_name: class_id
    for class_id, level_name in enumerate(
        ["student_intern", "entry", "junior", "mid", "senior", "executive"]
    )
}
level_key[np.nan] = -1

# Earlier, finer-grained salary buckets (no longer used), kept for reference:
# salary_key = {
#     "<45k": 0,
#     "45-65k": 1,
#     "65-85k": 2,
#     "85-110k": 3,
#     "110-150k": 4,
#     "150-200k": 5,
#     ">200k": 6,
#     np.nan: -1
# }

# Salary buckets currently in use.
salary_key = {
    bucket_name: class_id
    for class_id, bucket_name in enumerate(
        ["<50k", "50-100k", "100-150k", "150-200k", ">200k"]
    )
}
salary_key[np.nan] = -1

In [None]:
#@title Import pickle files


def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserializing, so this should only be pointed at files the project
    itself produced.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


#import tokenized inputs for each model
bert_base_test_inputs = _load_pickle(r'Model Objects/Tokenized Inputs: Bert Base/test_inputs_bert.pickle')
xlm_roberta_test_inputs = _load_pickle(r'Model Objects/Tokenized Inputs: xlm-RoBERTa/test_inputs_xlm_roberta.pickle')

# import data
test_data = _load_pickle(r'Model Objects/model_2_test_data.pickle')

# import labels
test_level_labels = _load_pickle(r'Model Objects/Labels/model_2_labels_level_test.pickle')
test_salary_labels = _load_pickle(r'Model Objects/Labels/model_2_labels_salary_test.pickle')






In [None]:
# Default length of every token-level Input layer created by the model builders below.
MAX_SEQUENCE_LENGTH = 512

In [None]:
#@title Define Bert Model

# Masked Loss Function
def masked_loss_function(y_true, y_pred):
    """Sparse categorical cross-entropy that blanks out entries labelled -1.

    A 0/1 float mask is built from ``y_true`` (0 where the label equals -1,
    1 elsewhere). Labels and predictions are both cast to float32 and
    multiplied by that mask before being handed to
    ``tf.keras.losses.sparse_categorical_crossentropy``.

    NOTE(review): masked entries are not dropped; they presumably reach the
    loss as label 0 paired with a zeroed prediction row. Whether that adds a
    constant to the reported loss value should be confirmed.
    """
    present = tf.cast(tf.math.not_equal(y_true, -1), tf.float32)  # -1 marks a missing label

    labels = tf.cast(y_true, tf.float32) * present
    probabilities = tf.cast(y_pred, tf.float32) * present

    return tf.keras.losses.sparse_categorical_crossentropy(labels, probabilities)




# BERT Model
def create_bert_multi_output_model(bert_base_model,
                                   trainable=True,
                                   max_sequence_length=MAX_SEQUENCE_LENGTH,
                                   num_level_classes=7,
                                   num_salary_classes=6,
                                   base_hidden_size=200,
                                   level_hidden_size=200,
                                   salary_hidden_size=200,
                                   dropout=0.3,
                                   learning_rate=0.00005,
                                   LEVEL_WEIGHT=0.5,
                                   SALARY_WEIGHT=0.5):
    """Build and compile a two-headed classifier on top of a BERT encoder.

    Position 0 of the encoder's first output (the CLS token) is passed through
    two shared Dense layers and a Dropout, then fanned out into an
    experience-level head and a salary-bucket head. Each head is two Dense
    layers, a Dropout and a softmax Dense layer.

    Args:
        bert_base_model: Encoder that is called with a dict holding
            ``input_ids``, ``token_type_ids`` and ``attention_mask``. Its
            ``trainable`` attribute is overwritten on the caller's object.
        trainable: Value assigned to ``bert_base_model.trainable``.
        max_sequence_length: Length of each of the three int64 input layers.
        num_level_classes: Width of the level softmax.
        num_salary_classes: Width of the salary softmax.
            NOTE(review): both defaults are one larger than the number of
            class names listed elsewhere in this notebook; presumably they
            match the saved checkpoint. Confirm before changing.
        base_hidden_size: Units in each of the two shared Dense layers.
        level_hidden_size: Units in each Dense layer of the level head.
        salary_hidden_size: Units in each Dense layer of the salary head.
        dropout: Rate used by all three Dropout layers.
        learning_rate: Learning rate of the Adam optimizer.
        LEVEL_WEIGHT: Loss weight of the ``level_classification`` output.
        SALARY_WEIGHT: Loss weight of the ``salary_classification`` output.

    Returns:
        A compiled ``tf.keras.Model`` with inputs
        ``[input_ids, token_type_ids, attention_mask]`` and outputs
        ``[level_classification, salary_classification]``.
    """
    bert_base_model.trainable = trainable

    # Input layers
    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask
    }

    bert_out = bert_base_model(bert_inputs)

    cls_token = bert_out[0][:, 0, :]  # Extract CLS token

    # Shared trunk feeding both heads
    base_hidden = tf.keras.layers.Dense(base_hidden_size, activation='relu', name='base_hidden_layer_1')(cls_token)
    base_hidden = tf.keras.layers.Dense(base_hidden_size, activation='relu', name='base_hidden_layer_2')(base_hidden)
    base_out = tf.keras.layers.Dropout(dropout, name='base_dropout_1')(base_hidden)

    # job experience level
    level_hidden = tf.keras.layers.Dense(level_hidden_size, activation='relu', name='level_hidden_layer_1')(base_out)
    level_hidden = tf.keras.layers.Dense(level_hidden_size, activation='relu', name='level_hidden_layer_2')(level_hidden)
    level_hidden = tf.keras.layers.Dropout(dropout, name='level_dropout_1')(level_hidden)
    level_classification = tf.keras.layers.Dense(num_level_classes, activation='softmax', name='level_classification')(level_hidden)

    # salary bucket
    salary_hidden = tf.keras.layers.Dense(salary_hidden_size, activation='relu', name='salary_hidden_layer_1')(base_out)
    salary_hidden = tf.keras.layers.Dense(salary_hidden_size, activation='relu', name='salary_hidden_layer_2')(salary_hidden)
    salary_hidden = tf.keras.layers.Dropout(dropout, name='salary_dropout_1')(salary_hidden)
    salary_classification = tf.keras.layers.Dense(num_salary_classes, activation='softmax', name='salary_classification')(salary_hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask],
                                          outputs=[level_classification, salary_classification])

    # Loss, loss weights and metrics are all keyed by output-layer name so
    # they cannot drift out of step with the order of `outputs` above.
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss={'level_classification': masked_loss_function,
                                       'salary_classification': masked_loss_function},
                                 loss_weights={'level_classification': LEVEL_WEIGHT,
                                               'salary_classification': SALARY_WEIGHT},
                                 metrics={'level_classification': 'accuracy',
                                          'salary_classification': 'accuracy'})

    return classification_model




In [None]:
#@title Define xlm-roberta Model

# Masked Loss Function
def masked_loss_function(y_true, y_pred):
    """Sparse categorical cross-entropy that blanks out entries labelled -1.

    A 0/1 float mask is built from ``y_true`` (0 where the label equals -1,
    1 elsewhere). Labels and predictions are both cast to float32 and
    multiplied by that mask before being handed to
    ``tf.keras.losses.sparse_categorical_crossentropy``.

    NOTE(review): masked entries are not dropped; they presumably reach the
    loss as label 0 paired with a zeroed prediction row. Whether that adds a
    constant to the reported loss value should be confirmed.
    """
    present = tf.cast(tf.math.not_equal(y_true, -1), tf.float32)  # -1 marks a missing label

    labels = tf.cast(y_true, tf.float32) * present
    probabilities = tf.cast(y_pred, tf.float32) * present

    return tf.keras.losses.sparse_categorical_crossentropy(labels, probabilities)




# XLM-RoBERTa Model
def create_xlm_roberta_multi_output_model(xlm_roberta_model,
                                          trainable=True,
                                          max_sequence_length=MAX_SEQUENCE_LENGTH,
                                          num_level_classes=7,
                                          num_salary_classes=6,
                                          base_hidden_size=200,
                                          level_hidden_size=200,
                                          salary_hidden_size=200,
                                          dropout=0.3,
                                          learning_rate=0.00001,
                                          LEVEL_WEIGHT=0.5,
                                          SALARY_WEIGHT=0.5):
    """Build and compile a two-headed classifier on top of an XLM-RoBERTa encoder.

    Position 0 of the encoder's first output is passed through two shared
    Dense layers and a Dropout, then fanned out into an experience-level head
    and a salary-bucket head. Each head is two Dense layers, a Dropout and a
    softmax Dense layer. This builder feeds the encoder only ``input_ids`` and
    ``attention_mask``; it creates no ``token_type_ids`` input.

    Args:
        xlm_roberta_model: Encoder that is called with a dict holding
            ``input_ids`` and ``attention_mask``. Its ``trainable`` attribute
            is overwritten on the caller's object.
        trainable: Value assigned to ``xlm_roberta_model.trainable``.
        max_sequence_length: Length of each of the two int64 input layers.
        num_level_classes: Width of the level softmax.
        num_salary_classes: Width of the salary softmax.
            NOTE(review): both defaults are one larger than the number of
            class names listed elsewhere in this notebook; presumably they
            match the saved checkpoint. Confirm before changing.
        base_hidden_size: Units in each of the two shared Dense layers.
        level_hidden_size: Units in each Dense layer of the level head.
        salary_hidden_size: Units in each Dense layer of the salary head.
        dropout: Rate used by all three Dropout layers.
        learning_rate: Learning rate of the Adam optimizer.
        LEVEL_WEIGHT: Loss weight of the ``level_classification`` output.
        SALARY_WEIGHT: Loss weight of the ``salary_classification`` output.

    Returns:
        A compiled ``tf.keras.Model`` with inputs
        ``[input_ids, attention_mask]`` and outputs
        ``[level_classification, salary_classification]``.
    """
    xlm_roberta_model.trainable = trainable

    # Input layers
    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    xlm_roberta_inputs = {
        'input_ids': input_ids,
        'attention_mask': attention_mask
    }

    xlm_roberta_out = xlm_roberta_model(xlm_roberta_inputs)

    cls_token = xlm_roberta_out[0][:, 0, :]  # Extract CLS token

    # Shared trunk feeding both heads
    base_hidden = tf.keras.layers.Dense(base_hidden_size, activation='relu', name='base_hidden_layer_1')(cls_token)
    base_hidden = tf.keras.layers.Dense(base_hidden_size, activation='relu', name='base_hidden_layer_2')(base_hidden)
    base_out = tf.keras.layers.Dropout(dropout, name='base_dropout_1')(base_hidden)

    # job experience level
    level_hidden = tf.keras.layers.Dense(level_hidden_size, activation='relu', name='level_hidden_layer_1')(base_out)
    level_hidden = tf.keras.layers.Dense(level_hidden_size, activation='relu', name='level_hidden_layer_2')(level_hidden)
    level_hidden = tf.keras.layers.Dropout(dropout, name='level_dropout_1')(level_hidden)
    level_classification = tf.keras.layers.Dense(num_level_classes, activation='softmax', name='level_classification')(level_hidden)

    # salary bucket
    salary_hidden = tf.keras.layers.Dense(salary_hidden_size, activation='relu', name='salary_hidden_layer_1')(base_out)
    salary_hidden = tf.keras.layers.Dense(salary_hidden_size, activation='relu', name='salary_hidden_layer_2')(salary_hidden)
    salary_hidden = tf.keras.layers.Dropout(dropout, name='salary_dropout_1')(salary_hidden)
    salary_classification = tf.keras.layers.Dense(num_salary_classes, activation='softmax', name='salary_classification')(salary_hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, attention_mask],
                                          outputs=[level_classification, salary_classification])

    # Loss, loss weights and metrics are all keyed by output-layer name so
    # they cannot drift out of step with the order of `outputs` above.
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss={'level_classification': masked_loss_function,
                                       'salary_classification': masked_loss_function},
                                 loss_weights={'level_classification': LEVEL_WEIGHT,
                                               'salary_classification': SALARY_WEIGHT},
                                 metrics={'level_classification': 'accuracy',
                                          'salary_classification': 'accuracy'})

    return classification_model




In [None]:
# Rebuild the BERT-base classifier around the pretrained encoder, then restore its saved weights
bert_base_encoder = TFBertModel.from_pretrained('bert-base-cased')
bert_base_model = create_bert_multi_output_model(bert_base_encoder)

bert_base_model.load_weights('./Model Objects/model_2_bert/model_2_bert_base_weights')



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

NameError: name 'create_xlm_roberta_multi_output_model' is not defined

In [None]:
# Rebuild the XLM-RoBERTa classifier around the pretrained encoder, then restore its saved weights
xlm_roberta_encoder = TFXLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
xlm_roberta_model = create_xlm_roberta_multi_output_model(xlm_roberta_encoder)

xlm_roberta_model.load_weights('./Model Objects/model_2_xlm_roberta_5_epochs/model_2_xlm_roberta_weights')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x78d230dd0610>

In [None]:
#@title Define JobBert Model

# Masked Loss Function
def masked_loss_function(y_true, y_pred):
    """Sparse categorical cross-entropy that blanks out entries labelled -1.

    A 0/1 float mask is built from ``y_true`` (0 where the label equals -1,
    1 elsewhere). Labels and predictions are both cast to float32 and
    multiplied by that mask before being handed to
    ``tf.keras.losses.sparse_categorical_crossentropy``.

    NOTE(review): masked entries are not dropped; they presumably reach the
    loss as label 0 paired with a zeroed prediction row. Whether that adds a
    constant to the reported loss value should be confirmed.
    """
    present = tf.cast(tf.math.not_equal(y_true, -1), tf.float32)  # -1 marks a missing label

    labels = tf.cast(y_true, tf.float32) * present
    probabilities = tf.cast(y_pred, tf.float32) * present

    return tf.keras.losses.sparse_categorical_crossentropy(labels, probabilities)




# JobBERT Model
def create_jobbert_multi_output_model(bert_base_model,
                                      trainable=True,
                                      max_sequence_length=MAX_SEQUENCE_LENGTH,
                                      num_level_classes=7,
                                      num_salary_classes=6,
                                      base_hidden_size=200,
                                      level_hidden_size=200,
                                      salary_hidden_size=200,
                                      dropout=0.3,
                                      learning_rate=0.00005,
                                      LEVEL_WEIGHT=0.5,
                                      SALARY_WEIGHT=0.5):
    """Build and compile a two-headed classifier on top of a BERT-style encoder.

    Position 0 of the encoder's first output (the CLS token) feeds both heads
    directly; this builder creates no shared hidden layers. Each head is two
    Dense layers, a Dropout and a softmax Dense layer.

    Args:
        bert_base_model: Encoder that is called with a dict holding
            ``input_ids``, ``token_type_ids`` and ``attention_mask``. Its
            ``trainable`` attribute is overwritten on the caller's object.
        trainable: Value assigned to ``bert_base_model.trainable``.
        max_sequence_length: Length of each of the three int64 input layers.
        num_level_classes: Width of the level softmax.
        num_salary_classes: Width of the salary softmax.
            NOTE(review): both defaults are one larger than the number of
            class names listed elsewhere in this notebook; presumably they
            match the saved checkpoint. Confirm before changing.
        base_hidden_size: Currently unused, because this builder has no
            shared base layers. Kept so the signature is unchanged.
        level_hidden_size: Units in each Dense layer of the level head.
        salary_hidden_size: Units in each Dense layer of the salary head.
        dropout: Rate used by both Dropout layers.
        learning_rate: Learning rate of the Adam optimizer.
        LEVEL_WEIGHT: Loss weight of the ``level_classification`` output.
        SALARY_WEIGHT: Loss weight of the ``salary_classification`` output.

    Returns:
        A compiled ``tf.keras.Model`` with inputs
        ``[input_ids, token_type_ids, attention_mask]`` and outputs
        ``[level_classification, salary_classification]``.
    """
    bert_base_model.trainable = trainable

    # Input layers
    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask
    }

    bert_out = bert_base_model(bert_inputs)

    cls_token = bert_out[0][:, 0, :]  # Extract CLS token

    # job experience level (attached straight to the CLS token)
    level_hidden = tf.keras.layers.Dense(level_hidden_size, activation='relu', name='level_hidden_layer_1')(cls_token)
    level_hidden = tf.keras.layers.Dense(level_hidden_size, activation='relu', name='level_hidden_layer_2')(level_hidden)
    level_hidden = tf.keras.layers.Dropout(dropout, name='level_dropout_1')(level_hidden)
    level_classification = tf.keras.layers.Dense(num_level_classes, activation='softmax', name='level_classification')(level_hidden)

    # salary bucket (attached straight to the CLS token)
    salary_hidden = tf.keras.layers.Dense(salary_hidden_size, activation='relu', name='salary_hidden_layer_1')(cls_token)
    salary_hidden = tf.keras.layers.Dense(salary_hidden_size, activation='relu', name='salary_hidden_layer_2')(salary_hidden)
    salary_hidden = tf.keras.layers.Dropout(dropout, name='salary_dropout_1')(salary_hidden)
    salary_classification = tf.keras.layers.Dense(num_salary_classes, activation='softmax', name='salary_classification')(salary_hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask],
                                          outputs=[level_classification, salary_classification])

    # Loss, loss weights and metrics are all keyed by output-layer name so
    # they cannot drift out of step with the order of `outputs` above.
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss={'level_classification': masked_loss_function,
                                       'salary_classification': masked_loss_function},
                                 loss_weights={'level_classification': LEVEL_WEIGHT,
                                               'salary_classification': SALARY_WEIGHT},
                                 metrics={'level_classification': 'accuracy',
                                          'salary_classification': 'accuracy'})

    return classification_model




In [None]:
# Rebuild the JobBERT classifier around the pretrained encoder, then restore its saved weights
jobbert_encoder = TFBertModel.from_pretrained('jjzha/jobbert-base-cased')
jobbert_model = create_jobbert_multi_output_model(jobbert_encoder)

jobbert_model.load_weights('./Model Objects/model_3_jobbert/model_3_jobbert_weights')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x78d230cf8e80>

In [None]:
# Display names for the level and salary classes, listed in the same order
# as the string keys of level_key / salary_key.
target_names = [
    "student_intern",
    "entry",
    "junior",
    "mid",
    "senior",
    "executive",
]
target_salary_names = ["<50k", "50-100k", "100-150k", "150-200k", ">200k"]




In [None]:
# Run each restored classifier over the test inputs; each model was built with two
# outputs, in the order [level_classification, salary_classification].
predictions_bert_base = bert_base_model.predict(bert_base_test_inputs)
predictions_xlm_roberta = xlm_roberta_model.predict(xlm_roberta_test_inputs)
# NOTE(review): JobBERT is scored on the BERT-base tokenized inputs — presumably the
# two share a tokenizer/vocabulary; confirm.
predictions_jobbert = jobbert_model.predict(bert_base_test_inputs)






In [None]:
# Collapse each head's class probabilities to the index of the most likely class.
# Every predictions_* object holds the level head first and the salary head second.
level_predictions_bert_base, salary_predictions_bert_base = (
    tf.argmax(head_probs, axis=-1) for head_probs in predictions_bert_base
)

level_predictions_xlm_roberta, salary_predictions_xlm_roberta = (
    tf.argmax(head_probs, axis=-1) for head_probs in predictions_xlm_roberta
)

level_predictions_jobbert, salary_predictions_jobbert = (
    tf.argmax(head_probs, axis=-1) for head_probs in predictions_jobbert
)


In [None]:
# Append one column per (task, model) pair to a copy of the test set.
# Level columns come first, then salary columns.
prediction_columns = {
    "level_predictions_bert_base": level_predictions_bert_base,
    "level_predictions_xlm_roberta": level_predictions_xlm_roberta,
    "level_predictions_jobbert": level_predictions_jobbert,
    "salary_predictions_bert_base": salary_predictions_bert_base,
    "salary_predictions_xlm_roberta": salary_predictions_xlm_roberta,
    "salary_predictions_jobbert": salary_predictions_jobbert,
}

test_with_predictions = test_data.copy()
for column_name, predicted_classes in prediction_columns.items():
    test_with_predictions[column_name] = predicted_classes

In [None]:
# Preview the first rows, including the six appended prediction columns.
test_with_predictions.head()

Unnamed: 0,index,title,text,level,salary,salary_bucket,level_labels,salary_labels,level_predictions_bert_base,level_predictions_xlm_roberta,level_predictions_jobbert,salary_predictions_bert_base,salary_predictions_xlm_roberta,salary_predictions_jobbert
0,17953,Real Estate Fund Accounting Senior Client Mana...,Credit Suisse is a leading global wealth manag...,senior,,,4,-1,1,5,4,1,2,1
1,97031,Investment Analyst,<p><strong><u>INVESTMENT ANALYST in Luxembourg...,junior,,,2,-1,3,2,3,2,1,1
2,64933,Accounts payable accountant Fr-En (M/F),Job Description LE POSTE : For one of our clie...,,55000.0,50-100k,-1,1,2,1,2,1,0,1
3,79240,Concepteur d'offres de formation - Division du...,Missions Le concepteur d’offre sera en charge ...,entry,,,1,-1,1,4,1,0,1,1
4,89706,SC Analyst,"Description At Amazon Logistics (AMZL), we str...",junior,,,2,-1,4,4,3,1,1,2


In [None]:
# Write the test set plus predictions to a CSV in the current working directory,
# without the DataFrame index.
test_with_predictions.to_csv('test_data_with_predictions.csv', index=False)

In [None]:
# test_level_analysis_df = test_with_predictions.copy()
# test_level_analysis_df["level_predictions_bert_base"] = level_predictions_bert_base
# test_level_analysis_df["level_predictions_xlm_roberta"] = level_predictions_xlm_roberta
# test_level_analysis_df = test_level_analysis_df[test_level_analysis_df["test_level"] != 6]