## Load libraries

In [54]:
import pandas as pd
import unicodedata

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

from tqdm import tqdm

from jobsearch.cleaning import drop_columns, perform_data_casting
from jobsearch.utils import get_dataframe_memory_usage
from jobsearch.database import *
from jobsearch.ml_logic.lang_classifier import *
from jobsearch.params import DB_PATH, LANG_CLASSIF_MODEL

## Load data

In [55]:
import os

# Retrieve each component of the PostgreSQL connection string from environment variables
POSTGRES_USER = os.environ.get('POSTGRES_USER', 'default_user')
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD', 'default_password')
POSTGRES_GCS_HOST = os.environ.get('POSTGRES_GCS_HOST', 'localhost')
POSTGRES_PORT = os.environ.get('POSTGRES_PORT', '5432')
POSTGRES_DATABASE = os.environ.get('POSTGRES_DATABASE', 'default_database')

# Construct the db_path string
db_path = f'postgresql://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_GCS_HOST}:{POSTGRES_PORT}/{POSTGRES_DATABASE}'
engine = create_engine(db_path)
data = fetch_data_from_postgresql(import_from_cloud=True)

#data = fetch_data_from_postgresql(import_from_cloud=True)
#data = pd.read_csv("../db/jobs_02-02-2024.csv")


Using CLOUD postgres database ...


ERROR:root:An error occurred: Not an executable object: 'SELECT * FROM raw_data'


## Clean data

In [3]:
df = data.copy()

get_dataframe_memory_usage(df).sum()

262.97

## Encoding data

In [15]:
model_path = "../models/xlm-roberta-base-language-detection/"
LANG_CLASSIF_MODEL = "../models/xlm-roberta-base-language-detection/"

In [5]:
n = 1000
df = df.sample(n)

descriptions = df.description.to_list()

tokenizer = AutoTokenizer.from_pretrained(LANG_CLASSIF_MODEL)

inputs = tokenizer(descriptions,
                truncation=True,
                padding=True,
                max_length=512,
                return_tensors="pt")

## Encogings to torch dataset

In [6]:

class PyTorchEncodedDataset(torch.utils.data.Dataset):

    """
    A custom PyTorch dataset that takes a dictionary of encodings as input and returns a dictionary of PyTorch tensors
    when indexed.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items() if key in ['input_ids', 'attention_mask']}

    def __len__(self):
        return len(self.encodings.input_ids)

torch_descriptions = PyTorchEncodedDataset(inputs)

## Dataloader

In [8]:
# create a data loader with batch size 16
batch_size = 16
dataloader = DataLoader(torch_descriptions, batch_size=batch_size)

## Load model to device

In [10]:
# set up device
device = torch.device('cuda') if torch.cuda.is_available else torch.device('cpu')

# set up model and move it to device
lg_classifier = AutoModelForSequenceClassification.from_pretrained(LANG_CLASSIF_MODEL)
lg_classifier.to(device)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

## Inference

In [11]:
logit_list = []

with torch.no_grad():

  # iterate over dataloader
  for batch in tqdm(dataloader):
    # Move batch to device
    batch = {k: v.to(device) for k, v in batch.items()}

    # get logits
    logits = lg_classifier(**batch).logits

    # append list
    logit_list.append(logits)

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items() if key in ['input_ids', 'attention_mask']}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items() if key in ['input_ids', 'attention_mask']}
100%|██████████| 20/20 [00:26<00:00,  1.34s/it]


## Post-processing

In [13]:
# get best predicted ids (language)
id_list = []
for batch in logit_list:
  for line in batch:
    id_list.append(line.argmax().item())

# assign ids to pd.Series
id_col = pd.Series(id_list)

# map ids to corresponding labels for easier readability
lang_labels = id_col.map(lg_classifier.config.id2label)

# rename
lang_labels.rename('lang_labels', inplace=True)

lang_labels.value_counts()

fr    859
en    140
de      1
Name: lang_labels, dtype: int64

In [46]:
df = df.reset_index()
df['lang'] = lang_labels
df.lang.value_counts()

fr    872
en    125
es      1
hi      1
de      1
Name: lang, dtype: int64

## Export

In [24]:
# export
lang_labels.to_csv('/content/drive/MyDrive/github/gg_job_search/data/lang_labels', mode='a', index=False, header=False)