In [1]:
import pandas as pd
import os
import glob
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import numpy as np
from numpy.linalg import norm
import matplotlib.pyplot as plt
pd.set_option('display.width', 1000)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
skills_df = pd.read_csv('skillgroups_df.csv').fillna('')
skills_df['tk_count'] = skills_df['description'].str.split(' ').str.len()
skills = skills_df['description'].tolist()
# len(skills)
# skills
skills_df['tk_count'].describe()
# plt.plot(skills_df['tk_count'])

count    640.000000
mean      24.910937
std       35.790468
min        1.000000
25%        9.000000
50%       15.000000
75%       23.000000
max      346.000000
Name: tk_count, dtype: float64

In [4]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
# sentences = skills

def get_embeds(sentences):

    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert-base-cased")
    model = AutoModel.from_pretrained("jjzha/jobbert-base-cased")

    # Tokenize sentences
    encoded_input = tokenizer(sentences, max_length=512, padding=True, truncation=True, return_tensors='pt')
    # print(encoded_input)
    # print(encoded_input['input_ids'].shape)
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings

print("Sentence embeddings:")
print(get_embeds(skills))


Sentence embeddings:


Some weights of the model checkpoint at jjzha/jobbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at jjzha/jobbert-base-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.

KeyboardInterrupt: 

In [None]:
len(skills)
skill_embeds = get_embeds(skills)

torch.Size([2279, 768])

In [None]:
def get_cos_sim(a, b) -> float:
    return np.dot(a, b) / (norm(a) * norm(b))

In [None]:
# job = ['Operating and monitoring production machinery', 
#         'Maintaining equipment and machinery in good working order',
#         'Ensuring compliance with safety regulations',
#         'Keeping accurate records of production output',
#         'Troubleshooting and resolving equipment and machinery issues',
#         'High school diploma or equivalent',
#         '1-2 years of experience in a manufacturing environment',
#         'Ability to operate machinery and equipment',
#         'Strong attention to detail and problem-solving skills',
#         'Physical ability to lift up to 50 pounds and stand for long periods of time',
#         'Strong communication and teamwork skills']

job = ["We are looking for an experienced software engineer to join our team.",
        "The ideal candidate will have experience with Python, JavaScript, and React.",
        "They will also have experience with AWS and be familiar with agile development methodologies."]

job_embeds = get_embeds(job)

Some weights of the model checkpoint at jjzha/jobbert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at jjzha/jobbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.we

In [None]:
sent_sim = dict()
for i, sent in enumerate(job_embeds):
    sk = []
    for _, skill in enumerate(sentence_embeddings):
        sim = get_cos_sim(sent,skill)
        sk.append(sim)
        # print(sim)
    max_sim = np.where(np.array([0 if x <= 0.75 else 1 for x in sk])==1)[0]
    sent_sim[i] = [skills[i] for i in max_sim]
print(len(sk))
# print(sk.sort(reverse=True))
print(sent_sim)

2279
{0: [], 1: ['Cain and Abel (penetration testing tool)', 'Java (computer programming)', 'decentralized application frameworks', 'JavaScript', 'John The Ripper (penetration testing tool)', 'use back-up and recovery tools', 'Microsoft Visual C++', 'JavaScript Framework', 'Python (computer programming)', 'Pascal (computer programming)', 'Jenkins (tools for software configuration management)'], 2: ['Cain and Abel (penetration testing tool)', 'transportation software related to an ERP system', 'Agile project management', 'use ICT resources to solve work related tasks', 'solve location and navigation problems by using GPS tools', 'use methods of logistical data analysis', 'John The Ripper (penetration testing tool)', 'use back-up and recovery tools', 'ICT project management methodologies', 'guide learners in using assistive technologies', 'apply operations for an ITIL-based environment', 'Agile development', 'perform a feasibility study for building management systems', 'oversee wetlands

In [None]:
job

['Operating and monitoring production machinery',
 'Maintaining equipment and machinery in good working order',
 'Ensuring compliance with safety regulations',
 'Keeping accurate records of production output',
 'Troubleshooting and resolving equipment and machinery issues',
 'High school diploma or equivalent',
 '1-2 years of experience in a manufacturing environment',
 'Ability to operate machinery and equipment',
 'Strong attention to detail and problem-solving skills',
 'Physical ability to lift up to 50 pounds and stand for long periods of time',
 'Strong communication and teamwork skills']