## Load libraries

In [1]:
import pandas as pd
import unicodedata

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

from tqdm import tqdm

from jobsearch.cleaning import drop_columns, perform_data_casting
from jobsearch.utils import fetch_table_data

from jobsearch.params import DB_PATH, LANG_CLASSIF_MODEL

  from .autonotebook import tqdm as notebook_tqdm
2024-01-13 11:17:06.016490: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-13 11:17:06.016647: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-13 11:17:06.057960: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-13 11:17:06.142518: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
print(LANG_CLASSIF_MODEL)

papluca/xlm-roberta-base-language-detection


## Load data

In [3]:
data = fetch_table_data(DB_PATH)

## Clean data

In [4]:
df = data.copy()
df = drop_columns(df)
df = perform_data_casting(df)

In [9]:
df.description.sample(1000).to_csv('../db/1000_descriptions.csv', index=False)

## Encoding data

In [23]:
descriptions = df.description.to_list()

tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection
")

inputs = tokenizer(descriptions,
                   truncation=True,
                   padding=True,
                   max_length=512, #512 tokens would be enough for the model to classify correctly
                   return_tensors="pt")


: 

In [None]:
def tokenizing_data(LANG_CLASSIF_MODEL):
    descriptions = data.description.to_list()

    tokenizer = AutoTokenizer.from_pretrained(LANG_CLASSIF_MODEL)

    inputs = tokenizer(descriptions,
                    truncation=True,
                    padding=True,
                    max_length=512,
                    return_tensors="pt")

## Encogings to torch dataset

In [19]:
class PyTorchEncodedDataset(torch.utils.data.Dataset):

    """
    A custom PyTorch dataset that takes a dictionary of encodings as input and returns a dictionary of PyTorch tensors
    when indexed.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items() if key in ['input_ids', 'attention_mask']}

    def __len__(self):
        return len(self.encodings.input_ids)

torch_descriptions = PyTorchEncodedDataset(inputs)

## Dataloader

In [20]:
# create a data loader with batch size 32
batch_size = 50
dataloader = DataLoader(torch_descriptions, batch_size=batch_size)

## Load model to device

In [21]:
# set up device
device = torch.device('cuda') if torch.cuda.is_available else torch.device('cpu')

# set up model and move it to device
lg_classifier = AutoModelForSequenceClassification.from_pretrained(LANG_CLASSIF_MODEL)
lg_classifier.to(device)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

## Inference

In [22]:
logit_list = []

with torch.no_grad():

  # iterate over dataloader
  for batch in tqdm(dataloader):
    # Move batch to device
    batch = {k: v.to(device) for k, v in batch.items()}

    # get logits
    logits = lg_classifier(**batch).logits

    # append list
    logit_list.append(logits)

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items() if key in ['input_ids', 'attention_mask']}
100%|██████████| 4/4 [00:04<00:00,  1.13s/it]


## Post-processing

In [23]:
# get best predicted ids (language)
id_list = []
for batch in logit_list:
  for line in batch:
    id_list.append(line.argmax().item())

# assign ids to pd.Series
id_col = pd.Series(id_list)

# map ids to corresponding labels for easier readability
lang_labels = id_col.map(lg_classifier.config.id2label)

# rename
lang_labels.rename('lang_labels', inplace=True)

lang_labels.value_counts()

fr    139
en     30
it      1
Name: lang_labels, dtype: int64

## Export

In [24]:
# export
lang_labels.to_csv('/content/drive/MyDrive/github/gg_job_search/data/lang_labels', mode='a', index=False, header=False)