<a href="https://colab.research.google.com/github/Datasci266-Final-Project/Job-Listings/blob/main/Analysis%20Notebooks/Synthetic_Data_Testing_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Installs

!pip install pydot --quiet
!pip install transformers==4.37.2 --quiet
!pip install -U imbalanced-learn --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.0/258.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#@title Imports
import pandas as pd

import numpy as np
import random
import torch

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda, Dropout
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt
import seaborn as sns

import pickle

import re
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, TFBertModel, AutoTokenizer, TFXLMRobertaModel #"FacebookAI/xlm-roberta-base"

from transformers import logging
logging.set_verbosity_error()

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Seed Python, NumPy, PyTorch (CPU and all CUDA devices) and TensorFlow
# with one shared value.
seed_value = 10

for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
    tf.random.set_seed,
):
    _seed_rng(seed_value)

In [None]:
# Mount Google Drive (Colab only) and make the shared project folder the
# working directory; relative paths used later in the notebook, such as
# './Model Objects/...', are resolved from here.
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/My Drive/DataSci 266 Project

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1V3QooLePiHR_DaZhbXQhsjmP1Ez5fv5F/DataSci 266 Project


In [None]:
# Label encodings: each category name maps to an integer class id, numbered
# from 0 in the order listed; a missing label (NaN) maps to -1.
level_key = {
    name: code
    for code, name in enumerate(
        ["student_intern", "entry", "junior", "mid", "senior", "executive"]
    )
}
level_key[np.nan] = -1

# Salary buckets currently in use. An earlier, finer 7-bucket scheme
# (<45k, 45-65k, 65-85k, 85-110k, 110-150k, 150-200k, >200k) was replaced
# by the one below.
salary_key = {
    name: code
    for code, name in enumerate(
        ["<50k", "50-100k", "100-150k", "150-200k", ">200k"]
    )
}
salary_key[np.nan] = -1

In [None]:
# Number of tokens per example: the tokenizer pads/truncates every text to this
# length, and it is the default width of the model's three input layers.
MAX_SEQUENCE_LENGTH = 512

In [None]:
#@title Define JobBert Model

# Masked Loss Function
def masked_loss_function(y_true, y_pred):
    """Sparse categorical cross-entropy that skips examples labelled -1.

    A label of -1 marks a missing target. Such examples contribute exactly 0
    to the returned per-example loss (and therefore no gradient). Previously
    they were scored as class 0 against an all-zero prediction instead of
    being dropped.

    Args:
        y_true: Integer class ids, with -1 for "missing". Must hold one label
            per prediction, i.e. be reshapeable to `y_pred`'s shape without
            its last axis (a trailing axis of size 1 is fine).
        y_pred: Class probabilities along the last axis (the heads that use
            this loss end in a softmax).

    Returns:
        A float tensor of per-example losses shaped like `y_pred` without its
        last axis.
    """
    y_pred = tf.cast(y_pred, tf.float32)

    # Align labels with the per-example loss shape so the mask below lines up
    # element-wise whether y_true arrives as (batch,) or (batch, 1).
    labels = tf.reshape(y_true, tf.shape(y_pred)[:-1])
    is_labelled = tf.math.not_equal(labels, -1)

    # -1 is not a valid class id; substitute class 0 for missing rows so the
    # cross-entropy is well defined, then zero those rows out afterwards.
    safe_labels = tf.where(is_labelled, labels, tf.zeros_like(labels))
    per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(safe_labels, y_pred)

    return per_example_loss * tf.cast(is_labelled, per_example_loss.dtype)




# BERT Model
def create_bert_multi_output_model(bert_base_model,
                                   trainable=True,
                                   max_sequence_length=MAX_SEQUENCE_LENGTH,
                                   num_level_classes=7,
                                   num_salary_classes=6,
                                   base_hidden_size=200,
                                   level_hidden_size=200,
                                   salary_hidden_size=200,
                                   dropout=0.3,
                                   learning_rate=0.00005,
                                   LEVEL_WEIGHT=0.5,
                                   SALARY_WEIGHT=0.5):
    """Build and compile a two-headed classifier on top of a BERT encoder.

    The first-position (CLS) vector of the encoder's first output feeds a
    shared two-layer ReLU trunk. Two parallel heads (experience level and
    salary bucket) each apply two ReLU layers, dropout and a softmax layer.

    Args:
        bert_base_model: Encoder called with a dict of `input_ids`,
            `token_type_ids` and `attention_mask`. Its `trainable` attribute
            is overwritten as a side effect.
        trainable: Value assigned to `bert_base_model.trainable`.
        max_sequence_length: Length of each of the three input sequences.
        num_level_classes: Width of the level softmax output.
        num_salary_classes: Width of the salary softmax output.
        base_hidden_size: Units in each shared trunk layer.
        level_hidden_size: Units in each hidden layer of the level head.
        salary_hidden_size: Units in each hidden layer of the salary head.
        dropout: Dropout rate applied inside each head.
        learning_rate: Adam learning rate.
        LEVEL_WEIGHT: Loss weight of the level head.
        SALARY_WEIGHT: Loss weight of the salary head.

    Returns:
        A compiled `tf.keras.Model` with inputs
        `[input_ids, token_type_ids, attention_mask]` and outputs
        `[level_classification, salary_classification]`.
    """
    # NOTE(review): the default class counts (7 and 6) are larger than the
    # 6 level names / 5 salary names this notebook uses to decode predictions.
    # Confirm against the training setup before changing them: weights saved
    # for this architecture depend on these sizes.

    bert_base_model.trainable = trainable

    # Input layers
    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask
    }

    bert_out = bert_base_model(bert_inputs)

    # Position 0 of the first encoder output is used as the text summary.
    cls_token = bert_out[0][:, 0, :]  # Extract CLS token

    # Shared trunk. Both heads read `base_hidden` directly, so no dropout is
    # applied between trunk and heads. A trunk Dropout layer used to be
    # instantiated here but its output was never connected to either head;
    # that dead layer (and an unused pooled-output local) has been removed,
    # which leaves the connected graph and its weights unchanged.
    base_hidden = tf.keras.layers.Dense(base_hidden_size, activation='relu', name='base_hidden_layer_1')(cls_token)
    base_hidden = tf.keras.layers.Dense(base_hidden_size, activation='relu', name='base_hidden_layer_2')(base_hidden)

    # job experience level
    level_hidden = tf.keras.layers.Dense(level_hidden_size, activation='relu', name='level_hidden_layer_1')(base_hidden)
    level_hidden = tf.keras.layers.Dense(level_hidden_size, activation='relu', name='level_hidden_layer_2')(level_hidden)
    level_hidden = tf.keras.layers.Dropout(dropout, name='level_dropout_1')(level_hidden)
    level_classification = tf.keras.layers.Dense(num_level_classes, activation='softmax', name='level_classification')(level_hidden)

    # salary bucket
    salary_hidden = tf.keras.layers.Dense(salary_hidden_size, activation='relu', name='salary_hidden_layer_1')(base_hidden)
    salary_hidden = tf.keras.layers.Dense(salary_hidden_size, activation='relu', name='salary_hidden_layer_2')(salary_hidden)
    salary_hidden = tf.keras.layers.Dropout(dropout, name='salary_dropout_1')(salary_hidden)
    salary_classification = tf.keras.layers.Dense(num_salary_classes, activation='softmax', name='salary_classification')(salary_hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask],
                                          outputs=[level_classification, salary_classification])

    # Losses and metrics are keyed by output-layer name; the loss weights are
    # positional and follow the order of `outputs` above (level, salary).
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss={'level_classification': masked_loss_function,
                                       'salary_classification': masked_loss_function},
                                 loss_weights=[LEVEL_WEIGHT,
                                               SALARY_WEIGHT],
                                 metrics={'level_classification': 'accuracy',
                                          'salary_classification': 'accuracy'})

    return classification_model




In [None]:
# Load the 'jjzha/jobbert-base-cased' encoder, wrap it in the two-headed
# classifier, then restore the classifier weights saved on disk.
jobbert_encoder = TFBertModel.from_pretrained('jjzha/jobbert-base-cased')
jobbert_model = create_bert_multi_output_model(jobbert_encoder)

jobbert_model.load_weights('./Model Objects/model_3b_jobbert_5_epochs/model_3b_jobbert_weights')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/603 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x789070612fe0>

In [None]:
# The tokenizer is loaded from 'bert-base-cased', not from the
# 'jjzha/jobbert-base-cased' checkpoint that provides the encoder.
# NOTE(review): presumably both checkpoints share the same vocabulary — confirm.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

https://www.linkedin.com/jobs/view/3882260535

In [None]:
# Three variants of the same job posting. The posting text is defined once and
# reused, so the variants are guaranteed to differ only in the company
# description placed in front of it.

# Variant 1: the job description on its own.
example_1 = """We Are Seeking a Proactive And Detail-oriented Individual To Join Our Team And Assist With Various Data-related Projects. The Primary Focus Will Be On Enhancing Our Database Resources To Facilitate Effective Territory Planning And Sales Strategies. The Individual Will Work Closely With GTM Team Members On a Variety Of Areas Including

Analyzing, prioritizing, and cleansing account-level data collection and review
Supporting the construction of a central dataset to support rigorous account-level TAM calculations
Providing recommendations on account-level data hygiene best practices.


The ideal candidate will possess strong analytical skills, attention to detail, and the ability to work efficiently both independently and as part of a team. This internship offers valuable hands-on experience within a dynamic business environment, with opportunities for growth and professional development."""

# Short company description used as the preamble of variant 2.
company_description = """Skydio is the leading US drone company and the world leader in autonomous flight, the key technology for the future of drones and aerial transportation. The Skydio team combines deep expertise in artificial intelligence, best-in-class hardware and software product development, and operational excellence to empower a broader, more diverse audience of drone users - from first responders to insurance claims adjusters, utilities, and more!"""

# Longer, embellished company description used as the preamble of variant 3.
# The leading newline is kept so the text matches the original literal.
synthetic_company_description = """
Skydio is not merely a company; it is the epitome of aerial innovation and the undisputed titan of the American drone industry. As the world leader in autonomous flight technology, Skydio is shaping the very fabric of the future of drones and aerial transportation. With a team that is nothing short of legendary, Skydio blends profound expertise in artificial intelligence with cutting-edge hardware and software product development, embodying perfection in every flight.

This remarkable synergy of brilliance and operational mastery allows Skydio to transcend traditional boundaries, empowering an expansive and eclectic array of drone users. From heroic first responders navigating critical missions to meticulous insurance claims adjusters and vigilant utility companies, Skydio is not just a part of the industry—it is the industry. Every Skydio drone is a masterstroke of innovation, a beacon of possibilities, turning the skies into a canvas of limitless potential.

Skydio is revolutionizing the way we perceive and utilize the aerial dimension, transforming every challenge into an opportunity for greatness. With each launch, Skydio doesn’t just send drones into the sky; it sends expectations soaring to new heights, redefining what is possible and heralding a new era of aerial excellence. This is not just leadership; this is aerial supremacy, crafted to perfection by Skydio."""

# Variant 2: company description, blank line, job description.
example_2 = company_description + "\n\n" + example_1

# Variant 3: embellished company description, blank line, job description.
example_3 = synthetic_company_description + "\n\n" + example_1

In [None]:
# Texts to score, kept in the same order as the labels in `example_descriptions`
# (the report loop pairs the two lists by index).
analysis_texts = [example_1, example_2, example_3]

In [None]:
# Encode every text to exactly MAX_SEQUENCE_LENGTH tokens (longer texts are
# truncated, shorter ones padded) and return TensorFlow tensors.
analysis_inputs_tokenized = bert_tokenizer(
    analysis_texts,
    padding='max_length',
    truncation=True,
    max_length=MAX_SEQUENCE_LENGTH,
    return_tensors='tf',
)

# The model takes its inputs positionally: ids, token types, attention mask.
analysis_inputs = [
    analysis_inputs_tokenized[field]
    for field in ('input_ids', 'token_type_ids', 'attention_mask')
]

In [None]:
# Human-readable names for predicted class ids: position i in each list is
# the name printed for class id i.
target_names = [
    "student_intern",
    "entry",
    "junior",
    "mid",
    "senior",
    "executive",
]
target_salary_names = [
    "<50k",
    "50-100k",
    "100-150k",
    "150-200k",
    ">200k",
]




In [None]:
# Score the tokenized texts. The model is built with its outputs ordered
# [level, salary], so element 0 holds level scores and element 1 salary scores.
predictions_jobbert = jobbert_model.predict(analysis_inputs)






In [None]:
# Split the two heads' outputs, then take the highest-scoring class id per text.
level_scores_jobbert, salary_scores_jobbert = predictions_jobbert

level_predictions_jobbert = tf.argmax(level_scores_jobbert, axis=-1)
salary_predictions_jobbert = tf.argmax(salary_scores_jobbert, axis=-1)



In [None]:
# Predicted experience-level class id for each text, in `analysis_texts` order.
print(level_predictions_jobbert)

tf.Tensor([0 0 3], shape=(3,), dtype=int64)


In [None]:
# Predicted salary-bucket class id for each text, in `analysis_texts` order.
print(salary_predictions_jobbert)

tf.Tensor([0 1 2], shape=(3,), dtype=int64)


In [None]:
# One display label per entry of `analysis_texts`, in the same order.
example_descriptions = [
    'Job Description Only',
    'Job Desc + Company Description',
    'Job Desc + Synthetic (GPT-4) Enhanced Company Desc',
]

In [None]:
def _class_label(names, class_index):
    """Return the display name for a predicted class id.

    The classifier heads can emit more classes than there are names in the
    lookup lists, so an id without a name yields a placeholder string instead
    of raising IndexError.
    """
    position = int(class_index)
    if 0 <= position < len(names):
        return names[position]
    return f'<unnamed class {position}>'


# Report the predicted level and salary bucket for each variant of the posting.
print('Analysis of Actual Intern Level LinkedIn Job Posting')
print('')
for i in range(len(analysis_texts)):
    print(example_descriptions[i])
    print(_class_label(target_names, level_predictions_jobbert[i]))
    print(_class_label(target_salary_names, salary_predictions_jobbert[i]))
    print('')

Analysis of Actual Intern Level LinkedIn Job Posting

Job Description Only
student_intern
<50k

Job Desc + Company Description
student_intern
50-100k

Job Desc + Synthetic (GPT-4) Enhanced Company Desc
mid
100-150k

