In [1]:
!pip install transformers
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m77.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2
Looking in in

## Load libraries

In [2]:
import pandas as pd
import unicodedata

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

## Mount Drive

In [3]:
import os
import sys
from google.colab import drive
# Mount Google Drive so the folders listed below are reachable from this runtime.
drive.mount('/content/drive')

# Drive folders that hold extra packages and the project's source tree.
package_path = [
    '/content/drive/MyDrive/Packages',
    '/content/drive/MyDrive/github/gg_job_search',
    '/content/drive/MyDrive/github/gg_job_search/src/',
]

# Add them to the module search path so code stored there can be imported.
sys.path.extend(package_path)
#!pip install --target=$package_path cupy-cuda102

Mounted at /content/drive


In [4]:
# custom functions
import preprocessing.preprocess  as pp

## Load data

In [15]:
# Read the RAW CSV from Drive into `data`, then continue on an independent
# copy (`df`) so the frame exactly as loaded stays available.
data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/github/gg_job_search/data/gg_job_search_all_RAW.csv'
)
df = data.copy(deep=True)

## Preparing data

In [16]:
# Run the project's preprocessing helpers one after another; each one receives
# the frame produced by the previous step and its result replaces `df`.
for cleaning_step in (
    pp.lowercase_and_remove_accents,
    pp.basic_cleaning,
    pp.matching_cols,
):
    df = cleaning_step(df)

## Get last scraped data

In [17]:
# Replace `df` with the result of the project helper pp.get_last_records.
# NOTE(review): presumably keeps only the most recently scraped rows (per the
# section title) — confirm against preprocessing.preprocess.
df = pp.get_last_records(df)

## Encoding data

In [18]:
# Checkpoint name shared by the tokenizer here and the classifier loaded later.
model_checkpoint = 'papluca/xlm-roberta-base-language-detection'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# One entry per row of the `description` column.
descriptions = df["description"].tolist()

# Tokenize every description in one call: sequences are truncated to at most
# 512 tokens, padded, and returned as PyTorch tensors.
inputs = tokenizer(
    descriptions,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

## Encodings to torch dataset

In [19]:
class PyTorchEncodedDataset(torch.utils.data.Dataset):

    """
    A custom PyTorch dataset that wraps a mapping of tokenizer encodings and
    returns a dictionary of PyTorch tensors when indexed.

    Parameters
    ----------
    encodings : mapping
        Any object offering ``.items()`` and ``['input_ids']`` (a plain dict or
        a tokenizer output). Each value must be indexable by example position
        and may hold either tensors or nested lists.

    Indexing returns ``{'input_ids': tensor, 'attention_mask': tensor}`` for
    the requested example (only the keys actually present in ``encodings``);
    every other key is ignored. ``len()`` is the number of ``input_ids`` rows.
    """

    # Only these fields are returned by __getitem__.
    _MODEL_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, encodings):
        self.encodings = encodings

    @staticmethod
    def _as_fresh_tensor(value):
        """Return ``value`` as a tensor that shares no memory with the source."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning;
            # detach().clone() is the equivalent, warning-free form.
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        return {
            key: self._as_fresh_tensor(val[idx])
            for key, val in self.encodings.items()
            if key in self._MODEL_KEYS
        }

    def __len__(self):
        # Key access (not attribute access) so plain dicts work too, matching
        # the mapping-style access used in __getitem__.
        return len(self.encodings['input_ids'])

# Wrap the tokenizer output so it can be indexed one example at a time.
torch_descriptions = PyTorchEncodedDataset(inputs)

## Dataloader

In [20]:
# Number of encoded descriptions served per batch.
batch_size = 50

# Unshuffled loader: batches come out in dataset order, so predictions
# collected later line up with the order of the descriptions.
dataloader = DataLoader(
    dataset=torch_descriptions,
    batch_size=batch_size,
    shuffle=False,
)

## Load model to device

In [21]:
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
# is_available must be *called*: the bare function object is always truthy,
# which would select 'cuda' even on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the sequence-classification model stored under the same checkpoint as
# the tokenizer, then move it to `device`.
lg_classifier = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
# Kept as the cell's last expression, so the notebook displays the model's repr.
lg_classifier.to(device)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

## Inference

In [22]:
# One logits tensor is collected per batch, in dataloader order.
logit_list = []

# Inference only: gradient tracking is switched off for the whole loop.
with torch.no_grad():
    for batch in tqdm(dataloader):
        # Put every tensor of the batch on the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; only the raw class scores are kept.
        logits = lg_classifier(**batch).logits
        logit_list.append(logits)

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items() if key in ['input_ids', 'attention_mask']}
100%|██████████| 4/4 [00:04<00:00,  1.13s/it]


## Post-processing

In [23]:
# Index of the highest logit for every description, flattened across batches
# in the order the batches were produced.
id_list = [
    row.argmax().item()
    for batch_logits in logit_list
    for row in batch_logits
]

# Wrap the ids in a Series, translate them through the model's id -> label
# table for readability, and name the resulting Series.
id_col = pd.Series(id_list)
lang_labels = id_col.map(lg_classifier.config.id2label).rename('lang_labels')

# Number of descriptions per predicted language.
lang_labels.value_counts()

fr    139
en     30
it      1
Name: lang_labels, dtype: int64

## Export

In [24]:
# Append the predicted labels to the labels file on Drive: one label per line,
# without index or header. Append mode means re-running this cell adds the
# same rows again.
lang_labels.to_csv(
    path_or_buf='/content/drive/MyDrive/github/gg_job_search/data/lang_labels',
    header=False,
    index=False,
    mode='a',
)