# SET-UP FOR QA

## Download libraries

In [None]:
!pip install transformers
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Loading libraries

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

import torch
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

import pandas as pd
import numpy as np

import unicodedata
import requests
import json
import collections

import random

## Mount Drive

In [1]:
import os
import sys
from google.colab import drive
drive.mount('/content/drive')
package_path = ['/content/drive/MyDrive/Packages', '/content/drive/MyDrive/github/gg_job_search', '/content/drive/MyDrive/github/gg_job_search/src/']
sys.path.extend(package_path)
#!pip install --target=$package_path cupy-cuda102

Mounted at /content/drive


In [18]:
!git config --global user.email "axelruffierdesaimes@gmail.com"
!git config --global user.name "Axelrda"

In [3]:
!git clone https://github.com/Axelrda/gg_job_search

Cloning into 'gg_job_search'...
remote: Enumerating objects: 254, done.[K
remote: Counting objects: 100% (254/254), done.[K
remote: Compressing objects: 100% (161/161), done.[K
remote: Total 254 (delta 128), reused 211 (delta 85), pack-reused 0[K
Receiving objects: 100% (254/254), 4.62 MiB | 9.53 MiB/s, done.
Resolving deltas: 100% (128/128), done.


In [4]:
%cd /content/drive/MyDrive/github/gg_job_search

/content/drive/MyDrive/github/gg_job_search


In [5]:
!git init

Reinitialized existing Git repository in /content/drive/MyDrive/github/gg_job_search/.git/


In [9]:
!git add .

In [14]:
!git commit -m "."

fatal: cannot exec '.git/hooks/post-commit': Permission denied
[master c3d528b] .
 20 files changed, 1803653 insertions(+), 7249 deletions(-)
 create mode 100644 data/best_predicted_answers_df.csv
 create mode 100644 data/dates.csv
 create mode 100644 data/lang_labels
 create mode 100644 data/predicted_answers_df.csv
 create mode 100644 data/qa_pred_salary.csv
 create mode 100644 data/sal_data.csv
 create mode 100644 notebooks/cleaning.ipynb
 create mode 100644 notebooks/exploration.ipynb
 delete mode 100644 notebooks/notebooks_exploration/gg_job_search_prepro.ipynb
 create mode 100644 notebooks/notebooks_exploration/gg_job_search_preprocessing.ipynb
 delete mode 100644 notebooks/notebooks_models/Untitled.ipynb
 delete mode 100644 notebooks/notebooks_models/Untitled1.ipynb
 create mode 100644 notebooks/notebooks_models/extractive_qa_llm.ipynb
 create mode 100644 notebooks/notebooks_models/language_identification_model.ipynb
 delete mode 100644 notebooks/notebooks_models/qa_model_extrac

In [15]:
!git status

On branch master
Your branch is ahead of 'origin/master' by 1 commit.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   notebooks/notebooks_models/extractive_qa_llm.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [21]:
!git push https://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@github.com/Axelrda/gg_job_search

fatal: cannot exec '.git/hooks/pre-push': Permission denied
^C


In [None]:
import preprocessing.preprocess as pp

## Load data

In [None]:
# Get data
data = pd.read_csv('/content/drive/MyDrive/github/gg_job_search/data/gg_job_search_all_RAW.csv')
df = data.copy()

## Basic cleaning + prepro

In [None]:
df = pp.lowercase_and_remove_accents(df)
df = pp.basic_cleaning(df)
df = pp.matching_cols(df)

The extracted salary information out of job descriptions will be used to filter out near-duplicated data in the cleaning notebook (as a conditionnal filtering). To avoid using unnecessary RAM i will perform a basic clean / prepro.

## Concat language labels

In [None]:
# get lang labels 
lang_labels = pd.read_csv('/content/drive/MyDrive/github/gg_job_search/data/lang_labels')

df = pd.concat([df, lang_labels], axis=1)

df.shape

(17189, 15)

## Get french and english records

In [None]:
fr_df = df.copy()[df['lang']== 'fr']
en_df = df.copy()[df['lang']== 'en']

print("Shape of fr_df : ", fr_df.shape)
print("Shape of en_df : ", en_df.shape)

Shape of fr_df :  (15035, 15)
Shape of en_df :  (2129, 15)


In [None]:
fr_df['description'] += ' Pas de salaire précisé' #'salaire nc'
en_df['description'] += ' salary nc' 

## Get last scrapes

In [None]:
def get_last_scrapes(df):

  last_data = df[pd.to_datetime(df.date_time).dt.date == pd.to_datetime(df.date_time).dt.date.max()]

  return last_data


fr_df = get_last_scrapes(fr_df)
en_df = get_last_scrapes(en_df)

## Get contexts, questions, ids

In [None]:
fr_df_contexts = fr_df.description.to_list()
fr_df_questions = ["Quel est le salaire proposé ?"] * len(fr_df)
fr_df_ids = fr_df.index

en_df_contexts = en_df.description.to_list()
en_df_questions = ["What is the proposed salary ?"] * len(en_df)
en_df_ids = en_df.index

## Load tokenizer

In [None]:
model_checkpoint_fr = 'CATIE-AQ/QAmembert'#'etalab-ia/camembert-base-squadFR-fquad-piaf'#'timpal0l/mdeberta-v3-base-squad2'
tokenizer_fr = AutoTokenizer.from_pretrained(model_checkpoint_fr)

model_checkpoint_en = 'deepset/roberta-large-squad2'
tokenizer_en = AutoTokenizer.from_pretrained(model_checkpoint_en)

## Preprocessing for QA

In [None]:
def prepro_qa(questions, contexts, ids, tokenizer):

    max_length = 512
    stride = 128  

    inputs = tokenizer(text=questions, 
                       text_pair=contexts,
                       max_length=max_length,
                       padding="max_length", 
                       truncation=True, 
                       stride=stride, 
                       return_overflowing_tokens=True, 
                       return_offsets_mapping=True)

    sample_map = inputs["overflow_to_sample_mapping"]
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(ids[sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

fr_df_inputs = prepro_qa(fr_df_questions, fr_df_contexts, fr_df_ids, tokenizer_fr)
en_df_inputs = prepro_qa(en_df_questions, en_df_contexts, en_df_ids, tokenizer_en)

# INFERENCE

## Convert encodings to torch tensor

In [None]:
class TorchDataset(torch.utils.data.Dataset):
    
    """
    A custom PyTorch dataset that takes a dictionary of encodings as input and returns a dictionary of PyTorch tensors 
    when indexed.
    """
    
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx): 
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items() if key in ['input_ids', 'attention_mask']}
    def __len__(self):
        return len(self.encodings.input_ids)

# Instantiate a TorchDataset object with train_encodings as input
torch_fr = TorchDataset(fr_df_inputs)
torch_en = TorchDataset(en_df_inputs)

## Dataloader

In [None]:
# create a data loader 
batch_size = 60

dataloader_fr = DataLoader(torch_fr, batch_size=batch_size)
dataloader_en = DataLoader(torch_en, batch_size=batch_size)

## Load model to device

In [None]:
# Instantiate the pre-trained model
model_fr = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint_fr)
model_en = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint_en)

# Set up device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Set model to device
model_fr.to(device)
model_en.to(device)

RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
      

## Predict logits

In [None]:
def get_logits(dataloader, model):

  # Create an empty list to store the outputs
  start_logits_list = []
  end_logits_list = []

  # iterate over the data loader
  with torch.no_grad():
      for batch in tqdm(dataloader):

        # load tensors to device
        batch = {k:v.to(device) for k,v in batch.items()}

        # get prediction
        prediction = model(**batch)

        # Get start/end logits to numpy arrays
        start_logits, end_logits = prediction.start_logits.cpu().numpy(), prediction.end_logits.cpu().numpy()
        
        # append logits
        start_logits_list.append(start_logits)
        end_logits_list.append(end_logits)
    
  # concatenate start and end logits into a single list
  start_logits_concat = np.concatenate(start_logits_list, axis=0)
  end_logits_concat = np.concatenate(end_logits_list, axis=0)

  return start_logits_concat, end_logits_concat
  
start_logits_fr, end_logits_fr = get_logits(dataloader_fr, model_fr)
start_logits_en, end_logits_en = get_logits(dataloader_en, model_en)

100%|██████████| 5/5 [00:08<00:00,  1.74s/it]
100%|██████████| 2/2 [00:07<00:00,  3.90s/it]


## Mapping each example to its corresponding feature

In [None]:
def map_to_feature(df_inputs):

  # Creating an empty defaultdict with a list as the default value
  example_to_features = collections.defaultdict(list)

  for idx, feature in enumerate(df_inputs['example_id']):

    # Append feature indice to each corresponding example
    example_to_features[feature].append(idx)

  return example_to_features


example_to_features_fr = map_to_feature(fr_df_inputs)
example_to_features_en = map_to_feature(en_df_inputs)

## Post-processing 

In [None]:
def post_processing(df_contexts, df_ids, df_inputs, example_to_features, start_logits, end_logits):

  n_best = 20
  max_answer_length = 30

  # list of best predicted answer for each feature
  predicted_answers = []

  for context, example_id in zip(df_contexts, df_ids):
      answers = []

      # get each features index of corresponding example id
      for feature_index in example_to_features[example_id]:
          
          start_logit = start_logits[feature_index]
          end_logit = end_logits[feature_index]

          offsets = df_inputs['offset_mapping'][feature_index]

          start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
          end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()

          for start_index in start_indexes:
              for end_index in end_indexes:

                  # Skip answers that are not fully in the context
                  if offsets[start_index] is None or offsets[end_index] is None:
                      continue
                  # Skip answers with a length that is either < 0 or > max_answer_length.
                  if (
                      end_index < start_index
                      or end_index - start_index + 1 > max_answer_length
                  ):
                      continue                 

                  answers.append(
                      {
                          "context": context,
                          "text": context[offsets[start_index][0] : offsets[end_index][1]],
                          "logit_score": start_logit[start_index] + end_logit[end_index],
                      }
                  )
                  
      try:
          best_answer = max(answers, key=lambda x: x["logit_score"])

      except ValueError:
          print(f"No answers found for example id: {example_id}")
          continue
      
      predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"], "logit_score": best_answer['logit_score'], "context": best_answer['context']})

  return predicted_answers

In [None]:
predicted_answers_fr = post_processing(fr_df_contexts, fr_df_ids, fr_df_inputs, example_to_features_fr, start_logits_fr, end_logits_fr)
predicted_answers_en = post_processing(en_df_contexts, en_df_ids, en_df_inputs, example_to_features_en, start_logits_en, end_logits_en)

## french results

In [None]:
pred_frdf = pd.DataFrame(data=predicted_answers_fr, index=fr_df.index)
pred_frdf

Unnamed: 0,id,prediction_text,logit_score,context
17022,17022,jusqu'a 70k,7.335764,data engineer senior avec une coloration machi...
17023,17023,Pas de salaire précisé,9.602474,publicis groupe recherche ...\n\npublicis sapi...
17024,17024,Pas de salaire précisé,12.005893,les responsabilites de l’equipe sont les suiva...
17026,17026,Pas de salaire précisé,12.526582,data engineer/data scientist – startup innovan...
17027,17027,Pas de salaire,0.883867,softeam est la marque du conseil et des servic...
...,...,...,...,...
17180,17180,jusqu'a 60k,3.181781,data engineer dans une edtech - cdi - paris - ...
17181,17181,600 millions d'euros,3.439713,talan est un cabinet de conseil en innovation ...
17182,17182,Pas de salaire précisé,13.730790,environnement de travail au sein d’une equipe ...
17185,17185,h/f,-6.563137,thales bordeaux recherche un data engineer (h/...


## english results

In [None]:
pred_endf = pd.DataFrame(data=predicted_answers_en, index=en_df.index)
pred_endf

Unnamed: 0,id,prediction_text,logit_score,context
17019,17019,not required,-1.905568,full-time position:\n\nwe are looking for a de...
17020,17020,we are looking\nfor our lead deep learning eng...,1.91596,"about norbert\n\nin a utopian future, our heal..."
17021,17021,paris / engineering / permanent / hybrid salary,-4.461411,"ð« about the team at equativ, we’re on a missi..."
17025,17025,nc,-0.727593,leader in the development and publishing of ga...
17029,17029,competitive salary,4.339551,"research engineer, bioai - machine learning / ..."
17031,17031,salary nc,-4.13254,numberly puts technology to work for brands an...
17032,17032,\n\nautonomous vehicles,-0.331745,job description\n\nbuild your brand. tell your...
17033,17033,"working with a kind, dynamic and diverse team ...",-1.37554,job description\n\nbuild your brand. tell your...
17035,17035,nc,-1.34925,"with 7 million users and an impressive 5,000 n..."
17036,17036,euroclear,-3.187822,division : group digital capabilities (gdc). i...


## Final tweaks

In [None]:
# concat french and english df
pred_df = pd.concat([pred_frdf, pred_endf], axis=0)

# add lang labels to predictions df
pred_df['lang_labels'] = df.loc[pred_df.index, 'lang']

# add dates to predictions df
pred_df = pd.merge(pred_df, df[['schedule_type', 'date_time']], left_index=True, right_index=True)

In [None]:
pred_df

Unnamed: 0,id,prediction_text,logit_score,context,lang_labels,schedule_type,date_time
17022,17022,jusqu'a 70k,7.335764,data engineer senior avec une coloration machi...,fr,a plein temps,2023-05-18 22:27:17.887107
17023,17023,Pas de salaire précisé,9.602474,publicis groupe recherche ...\n\npublicis sapi...,fr,a plein temps,2023-05-18 22:27:17.887107
17024,17024,Pas de salaire précisé,12.005893,les responsabilites de l’equipe sont les suiva...,fr,a plein temps,2023-05-18 22:27:17.887107
17026,17026,Pas de salaire précisé,12.526582,data engineer/data scientist – startup innovan...,fr,a plein temps,2023-05-18 22:27:20.413839
17027,17027,Pas de salaire,0.883867,softeam est la marque du conseil et des servic...,fr,a plein temps,2023-05-18 22:27:20.413839
...,...,...,...,...,...,...,...
17168,17168,"$91,587 to $111,812",0.147631,job description:\ndata engineer\n\nwelcome to ...,en,a plein temps,2023-05-18 22:28:20.808374
17174,17174,salary nc,-2.053096,eb partners aims strengthening his solutions r...,en,a plein temps,2023-05-18 22:28:22.947335
17183,17183,salary nc,-2.119184,about unilever unilever is one of the world`s ...,en,a plein temps,2023-05-18 22:28:30.440350
17184,17184,you will work in multi-disciplinary environmen...,-4.312231,qualifications master’s degree in quantitative...,en,a plein temps,2023-05-18 22:28:30.440350


## Export predictions

In [None]:
pred_df.to_csv('/content/drive/MyDrive/github/gg_job_search/data/qa_pred_salary.csv', index=False, header=False, mode='a')