DistilBERT Tokenization and Embedding Extraction

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.0 MB/s[0m eta [36m0:00:0

In [2]:
import re
import numpy as np
import pandas as pd

In [3]:
from transformers import DistilBertTokenizerFast
from transformers import DistilBertModel

In [4]:
import torch

In [5]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
  device = 'cuda'
else:
  device = 'cpu'

In [6]:
# Report the device string that tensors and the model are moved to.
print(device)

cuda


In [7]:
# Mount Google Drive so files under /content/drive become reachable.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [8]:
# Load the pretrained tokenizer for the uncased DistilBERT checkpoint.
checkpoint_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [9]:
def tokenize(text, tokenizer, max_length):
  """Encode ``text`` with ``tokenizer`` as a fixed-length sequence.

  The tokenizer is asked to pad to ``max_length``, to truncate longer input,
  to return an attention mask, and to omit token type ids. No tensor type is
  requested.

  Args:
    text: Input forwarded as the tokenizer's first positional argument.
    tokenizer: Callable tokenizer accepting the keyword options below.
    max_length: Target sequence length passed through to the tokenizer.

  Returns:
    Whatever ``tokenizer`` returns for these options.
  """
  encode_options = dict(
    max_length=max_length,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
  )
  return tokenizer(text, **encode_options)

In [19]:
def get_embeddings_and_masks(tokens, model):
  """Mean-pool the model's last hidden state for each tokenized text.

  Despite the name, only embeddings are returned; the attention masks are used
  internally to keep padded positions out of the average.

  Args:
    tokens: Iterable of mappings with ``'input_ids'`` and ``'attention_mask'``
      entries, each holding the ids/mask of a single text.
    model: Callable invoked as ``model(input_ids, attention_mask=...)`` whose
      result exposes ``last_hidden_state``.

  Returns:
    A list with one CPU tensor of shape ``(1, hidden_size)`` per entry of
    ``tokens``, in the same order.
  """
  # Take the device from the model itself instead of a notebook-level global,
  # so the inputs always land where the weights are.
  try:
    model_device = next(model.parameters()).device
  except (AttributeError, StopIteration):
    model_device = torch.device('cpu')

  final_embeddings = []

  for token in tokens:
    # Add a leading batch dimension: (seq_len,) -> (1, seq_len).
    token_ids = torch.tensor(token['input_ids'], device=model_device).unsqueeze(0)
    attention_masks = torch.tensor(token['attention_mask'], device=model_device).unsqueeze(0)

    with torch.no_grad():
      output = model(token_ids, attention_mask=attention_masks)
      embedding = output.last_hidden_state

      # Zero out padded positions, then divide by the number of real tokens.
      mask = attention_masks.unsqueeze(-1).expand(embedding.shape).float()
      summed = torch.sum(embedding * mask, 1)
      # The clamp guards against division by zero for an all-padding mask.
      counts = torch.clamp(mask.sum(1), min=1e-9)
      mean_pooled_embedding = summed / counts

    # Only the pooled vector is copied back to the CPU.
    final_embeddings.append(mean_pooled_embedding.cpu())

  return final_embeddings

In [20]:
def save_vectors(embeddings, path):
  """Serialize ``embeddings`` to ``path`` with ``torch.save``.

  Args:
    embeddings: Object to persist.
    path: Destination handed to ``torch.save``.
  """
  torch.save(obj=embeddings, f=path)

In [12]:
# Locations of the resume and job-description CSVs inside the Drive project folder.
base_folder = "/content/drive/MyDrive/Projects/Resume_Matching"
resume_all_path = f"{base_folder}/Resumes_Extracted_Sectionized_False.csv"
resume_sectionized_path = f"{base_folder}/Resumes_Extracted_Sectionized_True.csv"
selected_jds_path = f"{base_folder}/job_descriptions_selected_cleaned.csv"

# Read each CSV into its own DataFrame, in the order listed.
resume_all_data, resume_sectionized_data, jd_data = (
    pd.read_csv(csv_path)
    for csv_path in (resume_all_path, resume_sectionized_path, selected_jds_path)
)

In [13]:
# Drop the 'Unnamed: 0' column from every frame (presumably an index column
# written out when the CSVs were saved -- confirm against the files).
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the column is already gone.
jd_data = jd_data.drop(columns=['Unnamed: 0'], errors='ignore')
resume_all_data = resume_all_data.drop(columns=['Unnamed: 0'], errors='ignore')
resume_sectionized_data = resume_sectionized_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [14]:
# Load the pretrained DistilBERT encoder and place it on the selected device.
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [15]:
# Sequence length handed to tokenize(); texts are padded/truncated to it.
max_length = 128

Resume Tokens

In [16]:
# Join the chosen columns of every resume row into one string and tokenize it.
cols = ["cleaned_text"]
df = resume_all_data
tokens_resume_all = []
for i in range(len(df)):
    # A missing cell (stringified as "nan") adds no text but keeps its separator.
    s = ' '.join(
        '' if str(df[col][i]) == "nan" else str(df[col][i])
        for col in cols
    )
    tokens = tokenize(s, tokenizer, max_length)
    tokens_resume_all.append(tokens)
# Destination for the vectors computed from these tokens.
path = f"{base_folder}/Resumes_all_vectors.pt"

In [21]:
# Embed every tokenized text, stack the per-text vectors into one matrix, then save it.
embeddings = torch.vstack(get_embeddings_and_masks(tokens_resume_all, model))
save_vectors(embeddings, path)

In [29]:
# Join title, education and skills of every sectionized resume and tokenize the result.
cols = ["title", "education", "skills"]
df = resume_sectionized_data
tokens_resume_title_skills_edu = []
for i in range(len(df)):
    # A missing cell (stringified as "nan") adds no text but keeps its separator.
    s = ' '.join(
        '' if str(df[col][i]) == "nan" else str(df[col][i])
        for col in cols
    )
    tokens = tokenize(s, tokenizer, max_length)
    tokens_resume_title_skills_edu.append(tokens)
# Destination for the vectors computed from these tokens.
path = f"{base_folder}/Resumes_title_skills_edu_vectors.pt"

In [30]:
# Embed every tokenized text, stack the per-text vectors into one matrix, then save it.
embeddings = torch.vstack(get_embeddings_and_masks(tokens_resume_title_skills_edu, model))
save_vectors(embeddings, path)

Job Description (JD) Tokens

In [31]:
# Join the selected job-description columns of every row and tokenize the result.
cols = ["Title", "Core Responsibilities", "Educational Requirements", "Required Skills"]
df = jd_data
tokens_jd_sans_exp_desc = []
for i in range(len(df)):
    # A missing cell (stringified as "nan") adds no text but keeps its separator.
    s = ' '.join(
        '' if str(df[col][i]) == "nan" else str(df[col][i])
        for col in cols
    )
    tokens = tokenize(s, tokenizer, max_length)
    tokens_jd_sans_exp_desc.append(tokens)
# Destination for the vectors computed from these tokens.
path = f"{base_folder}/JDs_sans_exp_desc_vectors.pt"

In [32]:
# Embed every tokenized text, stack the per-text vectors into one matrix, then save it.
embeddings = torch.vstack(get_embeddings_and_masks(tokens_jd_sans_exp_desc, model))
save_vectors(embeddings, path)

In [33]:
# Join the description plus the selected job-description columns of every row and tokenize.
cols = ["Job Description", "Title", "Core Responsibilities", "Educational Requirements", "Required Skills"]
df = jd_data
tokens_jd_with_desc_sans_exp = []
for i in range(len(df)):
    # A missing cell (stringified as "nan") adds no text but keeps its separator.
    s = ' '.join(
        '' if str(df[col][i]) == "nan" else str(df[col][i])
        for col in cols
    )
    tokens = tokenize(s, tokenizer, max_length)
    tokens_jd_with_desc_sans_exp.append(tokens)
# Destination for the vectors computed from these tokens.
path = f"{base_folder}/JDs_with_desc_sans_exp_desc_vectors.pt"

In [34]:
# Embed every tokenized text, stack the per-text vectors into one matrix, then save it.
embeddings = torch.vstack(get_embeddings_and_masks(tokens_jd_with_desc_sans_exp, model))
save_vectors(embeddings, path)

In [35]:
# Join title, educational requirements and required skills of every row and tokenize.
cols = ["Title", "Educational Requirements", "Required Skills"]
df = jd_data
tokens_jd_title_skills_edu = []
for i in range(len(df)):
    # A missing cell (stringified as "nan") adds no text but keeps its separator.
    s = ' '.join(
        '' if str(df[col][i]) == "nan" else str(df[col][i])
        for col in cols
    )
    tokens = tokenize(s, tokenizer, max_length)
    tokens_jd_title_skills_edu.append(tokens)
# Destination for the vectors computed from these tokens.
path = f"{base_folder}/JDs_title_skills_edu_desc_vectors.pt"

In [36]:
# Embed every tokenized text, stack the per-text vectors into one matrix, then save it.
embeddings = torch.vstack(get_embeddings_and_masks(tokens_jd_title_skills_edu, model))
save_vectors(embeddings, path)

In [37]:
# Join every column of each job-description row into one string and tokenize it.
df = jd_data
cols = list(df.columns)
tokens_jd_all = []
for i in range(len(df)):
    # A missing cell (stringified as "nan") adds no text but keeps its separator.
    s = ' '.join(
        '' if str(df[col][i]) == "nan" else str(df[col][i])
        for col in cols
    )
    tokens = tokenize(s, tokenizer, max_length)
    tokens_jd_all.append(tokens)
# Destination for the vectors computed from these tokens.
path = f"{base_folder}/JDs_all_desc_vectors.pt"

In [38]:
# Embed every tokenized text, stack the per-text vectors into one matrix, then save it.
embeddings = torch.vstack(get_embeddings_and_masks(tokens_jd_all, model))
save_vectors(embeddings, path)

## Scratch: step-by-step walkthrough for a single resume

In [9]:
def resume_cleaning(text, tokenizer, max_length):
  """Normalize a raw resume string and tokenize it to a fixed length.

  Cleaning steps, in order:
    1. URLs (``http`` followed by any run of non-whitespace) are removed.
    2. Every character other than an ASCII letter becomes a space, which
       also removes digits, punctuation, underscores and non-ASCII text.
    3. The text is lower-cased and split on whitespace.
    4. Boilerplate words listed in ``extra_words`` are dropped.

  Args:
    text: Raw resume text. ``None`` or a float NaN is treated as empty text.
    tokenizer: Callable tokenizer accepting the keyword options used below.
    max_length: Target sequence length passed through to the tokenizer.

  Returns:
    Whatever ``tokenizer`` returns for the cleaned text.
  """
  # A missing value would otherwise make re.sub raise TypeError.
  if text is None or (isinstance(text, float) and text != text):
    text = ''

  # URLs have to go first: once punctuation is replaced by spaces they are
  # split into ordinary-looking words and can no longer be recognized.
  cleaned_text = re.sub(r'http\S+', ' ', text)
  cleaned_text = re.sub(r'[^a-zA-Z]', ' ', cleaned_text)

  # Convert to lowercase
  cleaned_text = cleaned_text.lower()

  # Remove Extra Words
  extra_words = {'company', 'name', 'city', 'state', 'work', 'profession', 'professional', 'detail', 'details', 'profile', 'summary', 'highly', 'very', 'education', 'educational'}
  # split() with no argument also collapses runs of whitespace.
  filtered_words = [word for word in cleaned_text.split() if word not in extra_words]

  filtered_text = ' '.join(filtered_words)

  tokens = tokenizer(filtered_text, max_length = max_length, padding = "max_length", truncation=True, return_attention_mask=True, return_token_type_ids=False)
  return tokens

In [10]:
# Load the pretrained DistilBERT encoder and place it on the selected device.
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [11]:
# Sequence length handed to the tokenizer for this walkthrough.
max_token_length = 128

# Raw resumes CSV on the mounted Drive.
resume_csv_path = "/content/drive/MyDrive/Projects/Resume_Matching/Resume.csv"
resume_data = pd.read_csv(resume_csv_path)

In [12]:
# Clean and tokenize the first resume, then build batched (1, seq_len) tensors on `device`.
token = resume_cleaning(resume_data["Resume_str"][0], tokenizer, max_token_length)

token_ids = torch.tensor(token['input_ids']).unsqueeze(0).to(device)
attention_masks = torch.tensor(token['attention_mask']).unsqueeze(0).to(device)

In [13]:
# One sequence with a leading batch dimension added by unsqueeze(0).
token_ids.shape

torch.Size([1, 128])

In [14]:
# The mask was built the same way as token_ids, so the shapes should match.
attention_masks.shape

torch.Size([1, 128])

In [15]:
# Inference only: no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
  # .to(device) keeps this cell re-runnable even though attention_masks is
  # rebound to a CPU tensor a few lines below.
  output = model(token_ids, attention_mask = attention_masks.to(device))
embedding = output.last_hidden_state

# The pooling cells that follow operate on CPU tensors.
attention_masks = attention_masks.cpu()
embedding = embedding.cpu()

embedding.shape

torch.Size([1, 128, 768])

In [19]:
# Check whether either tensor still lives on the GPU.
print(embedding.is_cuda)
print(attention_masks.is_cuda)

False
False


In [18]:
# Make sure the hidden states are on the CPU (no copy happens if they already are).
embedding = embedding.to('cpu')

In [20]:
# Broadcast the attention mask over the hidden dimension so padded positions become zero.
mask = attention_masks[..., None].expand_as(embedding).float()
masked_embedding = embedding * mask

# Sum over the sequence axis; the clamp guards against dividing by zero later.
summed = masked_embedding.sum(dim=1)
counts = mask.sum(dim=1).clamp(min=1e-9)

In [36]:
# Masking is element-wise, so the shape should equal embedding.shape.
print(masked_embedding.shape)

torch.Size([1, 128, 768])


In [21]:
# Average over the unmasked tokens: summed hidden states divided by their count.
mean_pooled = torch.div(summed, counts)

In [22]:
# The sequence axis was summed out, leaving one vector per input text.
print(mean_pooled.shape)

torch.Size([1, 768])
