# Fine-tuning a question-answering model to extract salary information from job posting descriptions

## Loading libraries & data

In [1]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader

from torch.optim import AdamW
from tqdm import tqdm

import pandas as pd
import numpy as np

import unicodedata
import requests
import json

import random

# Public URL of the raw job-postings CSV export.
path = 'https://media.githubusercontent.com/media/Axelrda/gg_job_search/master/data/gg_job_search_all_RAW.csv'

# Read the raw postings straight from the URL into a DataFrame.
data = pd.read_csv(path)  # a local copy was previously read from '/home/axel/ds_projects/projects/gg_job_search/data/gg_job_search_all_RAW.csv'


In [None]:
# Display the raw DataFrame for a quick visual check.
data

## Removing duplicates & null values

In [2]:
# Keep a single row per distinct description, then a single row per job_id.
data.drop_duplicates(['description'], inplace=True)
data.drop_duplicates(['job_id'], inplace=True)

# Renumber the rows 0..n-1 (the previous index is kept as an 'index' column).
# The hand-picked row labels used later in the notebook refer to this numbering.
data.reset_index(inplace=True)

# Drop rows that have no description. This must be done on the DataFrame:
# calling dropna(inplace=True) on `data.description` only alters that Series
# object and leaves the rows in `data`. Row labels are deliberately NOT
# renumbered afterwards so that label-based selections stay valid.
data.dropna(subset=['description'], inplace=True)

# NOTE(review): row 221 is excluded by hand; the notebook does not record why
# (possibly it was the row with a missing description - confirm).
# errors='ignore' keeps the cell working if that row is already gone.
data.drop(index=221, inplace=True, errors='ignore')

## Quick text preprocessing

In [3]:
# Lower-case every description (the keyword patterns used further down are lower-case).
data['description'] = data['description'].map(lambda text: text.lower())

# decode data (removing accents ... )
def remove_accents(text):
    """Return `text` with its combining marks (accents, cedillas, ...) removed.

    The string is first NFKD-normalised, which decomposes accented characters
    into a base character followed by combining marks; every combining mark
    is then dropped and the remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(filter(lambda char: not unicodedata.combining(char), decomposed))
# Strip accents from every description so that keyword searches can use plain ASCII patterns.
data['description'] = data['description'].map(remove_accents)


## Selecting rows containing salary information in description

Goal: create a labeled dataset for fine-tuning the QA model. To do so, select the rows whose description contains salary information and filter out the rows without any.

In [4]:
# Display settings: show up to 500 characters per cell and at most 50 rows.
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 50)

# Boolean masks over the descriptions, used to explore which rows carry salary information.
# NOTE(review): despite their names, the first two masks are negated - they are
# True for rows that do NOT contain the word. Confirm this is intended.
salaire_word = ~data.description.str.contains('salaire')
remuneration_word = ~data.description.str.contains('remuneration')

# True when a digit is immediately followed by "k" (e.g. "45k").
jusqua_word = data.description.str.contains(r"[0-9]k")

# True for rows containing none of the phrases below; the phrases are joined
# with "|" into a single alternation pattern.
stop_words = ~data.description.str.contains('|'.join([
    'selon experience',
    'salaire fixe',
    'remuneration en fonction',
    'remuneration attractive',
    'selon conditions',
    'grille de salaire',
    'a negocier',
    'selon votre experience',
    'remuneration fixe',
    'selon profil',
    'selon votre profil',
]))

#data.description.loc[(salaire_word | remuneration_word |jusqua_word ) ]
#data.description.loc[jusqua_word  & salaire_word & remuneration_word ]
#indew_w_salary_word = set(data.description.loc[(salaire_word | remuneration_word) & stop_words].index)
#len(set(index_w_salary_data).intersection(indew_w_salary_word))

After a thorough investigation, I've come up with a small batch of 159 rows containing salary information.

In [None]:
# Hand-picked row labels of the postings whose description contains salary information.
# These are labels of `data`'s index (used with .loc below), so they depend on the
# row numbering produced by the de-duplication / reset_index step earlier in the notebook.
index_w_salary_data = [25, 58, 75, 129, 216, 218, 219, 220, 237, 278, 285, 424, 503, 508, 510, 512, 518, 528, 537, 547, 550, 560, 565, 598, 602, 621, 
                       671, 673, 682, 709, 742, 744, 745, 764, 808, 812, 857, 871, 873, 876, 940, 954, 970, 1007, 1013, 1030, 1040, 1048, 1080, 1139, 
                       1140, 1146, 1240, 1255, 1269, 1325, 1331, 1334, 1337, 1348, 1397, 1413, 1415, 1473, 1477, 1480, 1486, 1488, 1536, 1562, 1565, 
                       1577, 1599, 1651, 1681, 1691, 1721, 1867, 2027, 2030, 2053, 2066, 2215, 2273, 2360, 2380, 2389, 2399, 2460, 2479, 2481, 2486, 
                       2491, 2508, 2514, 2516, 2517, 2529, 2545, 2591, 2620, 2700, 2709, 2710, 2719, 2776, 2789, 2805, 2809, 2814, 2829, 2838, 2853, 
                       2870, 2871, 2892, 2925, 2927, 2937, 2947, 2999, 3003, 3062, 3071, 3074, 3113, 3164, 3197, 3202, 3258, 3261, 3291, 3295, 3330, 
                       3331, 3339, 3342, 3348, 3376, 3385, 3388, 3457, 3486, 3597, 3611, 3638, 3687, 3798, 3834, 3843, 3864, 3901, 3908, 3912, 3922, 
                       3978, 3994, 4019, 4024]

# Single-column DataFrame holding the descriptions of the selected rows.
descriptions_w_salary = data.loc[index_w_salary_data,['description']]

## Exporting selected rows for further labeling

Workflow: export the selected rows to CSV, label them with Haystack, and save the annotations as a SQuAD-like JSON file (answer start/end positions, ids, ...), plus some additional processing of the answers to get neat predictions.

In [None]:
# Hand-normalised salary labels in the form "<amount>€ [a <amount>€] par <period>".
# Presumably one label per selected row, in the same order as the annotated
# examples - TODO confirm (the only consumer is the commented-out loop below).
# NOTE(review): two entries ("2294€ a 4 847€ par mois", "3000€ a 4 500€ par mois")
# contain a space inside the number, unlike the others - confirm whether intended.
labels = ["70000€ par an","50000€ a 65000€ par an","40000€ a 65000€ par an","35000€ a 38000€ par an","45000€ par an","45000€ par an","40000€ a 55000€ par an","34000€ a 50000€ par an","45000€ a 55000€ par an",
"2104€ par mois","70000€ par an","70000€ par an","950€ a 1400€ par mois","40000€ a 45000€ par an","70000€ par an","90000€ par an","50000€ a 70000€ par an","48000€ a 52000€ par an","55000€ a 65000€ par an",
"65000€ par an","40000€ a 45000€ par an","40000€ a 45000€ par an","60000€ par an","35000€ a 45000€ par an","35000€ a 45000€ par an","1000€ par mois","60000€ a 90000€ par an","60000€ par an","50000€ a 70000€ par an",
"50000€ a 65000€ par an","45000€ a 60000€ par an","99€ a 999€ par mois","1160€ a 1500€ par mois","85000€ par an","60000€ a 80000€ par an","75000€ a 90000€ par an","50000€ a 60000€ par an","32000€ a 37000€ par an",
"35000€ a 40000€ par an","1709.28€ par mois","3000€ par mois","1000€ par mois","2304€ par mois","50000€ a 65000€ par an","55000€ a 65000€ par an","65000€ a 80000€ par an","80000€ a 90000€ par an","85000€ par an",
"55000€ a 65000€ par an","600€ a 1200€ par mois","45000€ par an","20000€ a 22000€ par an","60000€ par an","33336.19€ a 57356.85€ par an","40000€ a 50000€ par an","2289€ a 3750€ par mois","1200€ par mois","60000€ a 65000€ par an",
"50000€ a 65000€ par an","50000€ a 57000€ par an","2292€ a 4774€ par mois","36000€ par an","1795€ par mois","58000€ par an","32000€ par an","500€ a 570€ par jour","70000€ par an","45000€ par an","45000€ par an","2294€ a 4 847€ par mois",
"30000€ a 33000€ par an","50000€ par an","500€ a 570€ par jour","50000€ par an","40000€ a 70000€ par an","25000€ a 45000€ par an","50000€ a 70000€ par an","45000€ a 55000€ par an","45000€ a 55000€ par an","50000€ a 57000€ par an",
"52000€ a 80000€ par an","60000€ par an","1200€ par mois","39000€ a 55000€ par an","50000€ a 60000€ par an","40000€ a 60000€ par an","45000€ a 57000€ par an","38000€ a 45000€ par an","1000€ par mois","45000€ a 55000€ par an",
"3000€ par mois","35000€ a 60000€ par an","30000€ par an","43000€ par an","1709.28€ par mois","45000€ a 55000€ par an","38000€ a 45000€ par an","100000€ par an","62000€ a 68000€ par an","2785.91€ par mois","43000€ par an",
"38000€ par an","45000€ a 60000€ par an","700€ par mois","22000€ par an","40000€ a 45000€ par an","35000€ a 45000€ par an","35000€ a 37000€ par an","60000€ par an","45000€ a 50000€ par an","50000€ a 63000€ par an","45000€ a 60000€ par an",
"60000€ par an","40000€ a 45000€ par an","55000€ a 65000€ par an","1100€ par mois","734.99€ par mois","45000€ a 55000€ par an","40000€ a 48000€ par an","55000€ a 65000€ par an","45000€ par an","45000€ a 55000€ par an","35000€ par an",
"4000€ a 7000€ par mois","25000€ a 27000€ par an","45000€ a 55000€ par an","40000€ a 50000€ par an","40000€ a 60000€ par an","40000€ a 70000€ par an","50000€ a 60000€ par an","23218.75€ a 46342.92€ par an","75000€ a 90000€ par an",
"4.05€ par heure","60000€ a 90000€ par an","35000€ a 45000€ par an","38000€ a 48000€ par an","75000€ a 90000€ par an","800€ a 950€ par mois","40000€ a 55000€ par an","34000€ par an","3000€ a 4 500€ par mois","40000€ a 75000€ par an",
"2241.51€ par mois","45000€ a 60000€ par an","36000€ par an","50000€ a 70000€ par an","70000€ par an","36000€ par an","50000€ a 65000€ par an","40000€ a 62000€ par an","50000€ par an","2000€ par mois","35000€ a 60000€ par an",
"50000€ par an","21000€ par an","50000€ a 70000€ par an","50000€ par an","65000€ a 80000€ par an","48000€ a 57000€ par an"]

# replace answers annotated with haystack with processed & normalized labels (by hand + ChatGPT)
#for i in range(159):
    
    #df['data'][i]['paragraphs'][0]['qas'][0]['answers'][0]['text'] = labels[i]
    #print(answer['data'][i]['paragraphs'][0]['qas'][0]['answers'][0]['text'])

In [5]:
def read_train_test_split_df(url, seed=None):
    """Download a SQuAD-style JSON file and split its records 80 / 10 / 10.

    Parameters
    ----------
    url : str
        Address of a JSON document whose top-level ``'data'`` key holds the
        list of labeled records.
    seed : int or None, optional
        Seed used for the shuffle. With ``None`` (the default) the global
        ``random`` state is used, exactly as before; pass an integer to get
        the same split on every call.

    Returns
    -------
    tuple of list
        ``(train_data, val_data, test_data)``: the first 80 % of the shuffled
        records, the next 10 %, and whatever remains.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (otherwise an error page
        would surface later as an obscure JSON decoding failure).
    """
    # Fetch the labeled data; fail fast on HTTP errors and never hang forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    records = json.loads(response.text)['data']

    # Shuffle the records in place, with a dedicated RNG when a seed is given.
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(records)

    # Sizes of the first two subsets; the test set takes the remainder.
    train_size = int(0.8 * len(records))
    val_size = int(0.1 * len(records))

    train_data = records[:train_size]
    val_data = records[train_size:train_size + val_size]
    test_data = records[train_size + val_size:]

    return train_data, val_data, test_data

url = 'https://raw.githubusercontent.com/Axelrda/gg_job_search/master/data/labeled_data_qa_fine-tuning.json'

# Seed the global RNG before splitting: the split relies on a random shuffle,
# and without a seed the train / validation / test membership changes on every run.
random.seed(42)
train_data, val_data, test_data = read_train_test_split_df(url)

# Print the number of examples in each subset
print(f"Number of examples in training set: {len(train_data)}")
print(f"Number of examples in validation set: {len(val_data)}")
print(f"Number of examples in testing set: {len(test_data)}")

Number of examples in training set: 127
Number of examples in validation set: 15
Number of examples in testing set: 17


In [6]:
def extract_context_question_answer(df):
    '''Flatten SQuAD-style records into parallel context / question / answer lists.

    Parameters
    ----------
    df : list of dict
        Records shaped like ``{'paragraphs': [{'context': ..., 'qas':
        [{'question': ..., 'answers': [{...}, ...]}, ...]}, ...]}``.

    Returns
    -------
    tuple of list
        ``(contexts, questions, answers)``. One entry is emitted per answer,
        so the three lists always have the same length and stay aligned even
        when a paragraph holds several questions or a question several answers.
        Each answer is a copy of the original dict without the annotation
        metadata keys (question_id, answer_id, document_id, answer_category).

    Notes
    -----
    The input is not modified, so the function can safely be called again on
    the same records (e.g. when a notebook cell is re-run).
    '''
    # Annotation-tool bookkeeping that is not needed for training.
    metadata_keys = {'question_id', 'answer_id', 'document_id', 'answer_category'}

    contexts = []
    questions = []
    answers = []

    for group in df:
        for passage in group['paragraphs']:
            for qas in passage['qas']:
                for qa in qas['answers']:
                    contexts.append(passage['context'])
                    questions.append(qas['question'])
                    # Build a filtered copy rather than deleting keys in place.
                    answers.append({key: value for key, value in qa.items()
                                    if key not in metadata_keys})

    return contexts, questions, answers

# Flatten each split into parallel lists of contexts, questions and answers.
train_contexts, train_questions, train_answers = extract_context_question_answer(df=train_data)
val_contexts, val_questions, val_answers = extract_context_question_answer(df=val_data)
test_contexts, test_questions, test_answers = extract_context_question_answer(df=test_data)

## Tokenizer / Encode

In [7]:
# Identifier of the pretrained checkpoint; the same value is reused later to load the model.
model_checkpoint = 'CATIE-AQ/QAmembert'
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


## Mapping characters positions in original context to tokens positions in tokenized input

For each feature, get the token positions of the answer: if the answer is not inside a given feature, return (0, 0); otherwise return (token_answer_start_position, token_answer_end_position).

In [8]:
# Passed to the tokenizer as `max_length`: size, in tokens, of every feature (padded / truncated to it).
max_length = 512
# Passed to the tokenizer as `stride`: overlap, in tokens, between consecutive windows of a long input.
stride = 128

def prepro_qa_examples(questions, contexts, answers):
    """Tokenize question / context pairs and label each feature with the answer's token span.

    A context that does not fit in `max_length` tokens is split into several
    overlapping features (overlap of `stride` tokens), so one example may
    produce more than one feature. The module-level `tokenizer`, `max_length`
    and `stride` are used.

    Parameters
    ----------
    questions : list of str
        One question per example.
    contexts : list of str
        The matching contexts, in the same order as `questions`.
    answers : list of dict
        One answer per example, in the same order; each dict must provide the
        character offsets 'answer_start' and 'answer_end' within its context.

    Returns
    -------
    The tokenizer output, extended with 'start_positions' and 'end_positions':
    for every feature, the token indices of the answer, or 0 and 0 when the
    answer is not fully contained in that feature's part of the context.
    """
    # truncation="only_second" restricts truncation (and therefore windowing)
    # to the context, so the question is never cut, whatever its length.
    inputs = tokenizer(text=questions,
                       text_pair=contexts,
                       max_length=max_length,
                       padding="max_length",
                       truncation="only_second",
                       stride=stride,
                       return_overflowing_tokens=True,
                       return_offsets_mapping=True)

    start_positions = []
    end_positions = []

    for i, offset in enumerate(inputs['offset_mapping']):

        # Index of the example this feature was cut from
        sample_idx = inputs["overflow_to_sample_mapping"][i]

        # Answer of that example
        answer = answers[sample_idx]

        # Character span of the answer inside the context
        start_char = answer["answer_start"]
        end_char = answer["answer_end"]

        # Sequence ids tell which tokens belong to the question and which to
        # the context (1); other positions have a different value
        sequence_ids = inputs.sequence_ids(i)

        # Locate the first and last context tokens of this feature
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside this feature's context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)

        else:
            # Walk forward to the last token starting at or before the answer start
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # Walk backward to the first token ending at or after the answer end
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    # Attach the labels to the tokenizer output and return it
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

# Tokenize the training split and attach the answer token positions.
train_inputs = prepro_qa_examples(
    questions=train_questions,
    contexts=train_contexts,
    answers=train_answers,
)


## Creating a DataLoader object from a custom PyTorch Dataset

A DataLoader allows iterating through a dataset in an organized and efficient manner. It significantly speeds up the training process and makes better use of the available resources (e.g. GPU memory) by batching, shuffling and pre-fetching the data.

My training data being relatively small, it'll allow me to apply data augmentation to improve the performance of the model.

In [10]:
class SquadDataset(torch.utils.data.Dataset):

    """
    A custom PyTorch dataset that takes a dictionary of encodings as input and returns a dictionary of PyTorch tensors
    when indexed.

    `encodings` can be any mapping from a field name to a list with one entry per
    feature (a plain dict or a tokenizer output); it must contain 'input_ids',
    which defines the length of the dataset.
    """

    def __init__(self, encodings):
        # Kept as is; values are converted to tensors lazily, item by item.
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return the idx-th feature as a {field name: tensor} dictionary."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of features."""
        # Key access (rather than attribute access) also works for plain dicts.
        return len(self.encodings["input_ids"])

# Wrap the tokenized training features (train_inputs) in a SquadDataset
train_dataset = SquadDataset(encodings=train_inputs)

# Serve the dataset in shuffled mini-batches of 8 features
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)

## Model 

In [11]:
# Load the pretrained question-answering model from the same checkpoint as the tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

## Fine-tuning

In [None]:
# Training runs on the CPU: move the model there and switch it to training mode.
device = torch.device('cpu')
model.to(device).train()

# AdamW optimizer over all model parameters.
optim = AdamW(params=model.parameters(), lr=5e-5)

In [None]:
# Fine-tune for 3 epochs over the training features.
for epoch in range(3):
    loop = tqdm(train_loader)
    for batch in loop:
        # Clear the gradients left over from the previous step.
        optim.zero_grad()

        # Only the tensors the model consumes are moved to the device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)

        # When start / end positions are supplied, the first output is the loss.
        loss = outputs[0]

        # Back-propagate, then apply the parameter update.
        loss.backward()
        optim.step()

        # Show progress in the tqdm bar.
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

# Sanity check on one training feature: decode the labeled token span and
# compare it with the annotated answer text. The module-level names are
# train_inputs / train_answers (`inputs` and `answers` only exist inside
# prepro_qa_examples).
idx = 7
sample_idx = train_inputs['overflow_to_sample_mapping'][idx]
answer = train_answers[sample_idx]["text"]
start = train_inputs["start_positions"][idx]
end = train_inputs["end_positions"][idx]

labeled_answer = tokenizer.decode(train_inputs["input_ids"][idx][start : end + 1])

print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")



In [None]:
# Inspect the tokenized training features (train_inputs is the module-level
# name; `inputs` only exists inside prepro_qa_examples).
print('input_ids :',len(train_inputs['input_ids'][0]))
print('overflow to sample mapping :', train_inputs['overflow_to_sample_mapping'])
print('offset mapping :', len(train_inputs['offset_mapping']))
print('offset mapping :', train_inputs['offset_mapping'][0])