In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# pip install transformers
!pip install transformers==4.11.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.11.3
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install jsonlines

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jsonlines
  Downloading jsonlines-3.1.0-py3-none-any.whl (8.6 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.1.0


In [None]:
pip install -U sacremoses

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [None]:
import torch
from torch.utils import data as t_data
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from multiprocessing.pool import ThreadPool

In [None]:
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

#gensim
import gensim
from gensim import corpora

#Visualization
from spacy import displacy
#import pyLDAvis.gensim_models
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt

#Data loading/ Data manipulation
import pandas as pd
import numpy as np
import json
import jsonlines
from collections import Counter
from tqdm.notebook import tqdm

#nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet', 'omw-1.4'])

#warning
import warnings 
#from PyPDF2 import PdfFileReader
from os.path import join
from tqdm.notebook import tqdm
import os
tqdm.pandas()

warnings.filterwarnings('ignore')

ACC_DIR = 'data/transformed_data/acceptability'
SKILL_OUT = 'data/transformed_data/extracted_skills'


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


## Data

In [None]:
def write_json_lines(file_name, dict_data):
    """Append ``dict_data`` to ``file_name`` as one JSON-lines record.

    The object is serialised before the file is opened, so a value that
    ``json`` cannot encode raises without touching the file. The file is
    opened in append mode, which creates it when it does not exist yet.
    """
    record = "{}\n".format(json.dumps(dict_data))
    with open(file_name, mode='a') as sink:
        sink.write(record)
        
def read_json_lines(file_name):
    """Load a JSON-lines file into a list, one decoded object per line.

    Blank or whitespace-only lines are skipped: ``json.loads`` raises on
    them, so a single stray empty line would otherwise make the whole file
    unreadable.

    Returns:
        The decoded records, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(file_name) as file_in:
        return [json.loads(line) for line in file_in if line.strip()]

## Translation

In [None]:
# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class MtTransformers():
    """Batched translation wrapper around a Hugging Face seq2seq model.

    The model and tokenizer named by ``src_model_name`` are loaded with
    ``AutoModelForSeq2SeqLM`` / ``AutoTokenizer``; the model is put in eval
    mode and moved to the notebook-level ``device``.
    """

    def __init__(self, src_model_name='facebook/wmt19-en-de', silence=True, batch_size=32, max_length=None):
        """Load the model and tokenizer.

        Args:
            src_model_name: Identifier passed to ``from_pretrained``.
            silence: Accepted for backward compatibility; not used.
            batch_size: Number of sentences per ``generate`` call.
            max_length: Forwarded unchanged to ``model.generate``.
        """
        self.device = device
        self.src_model_name = src_model_name
        self.src_model = AutoModelForSeq2SeqLM.from_pretrained(self.src_model_name)
        self.src_model.eval()
        self.src_model.to(self.device)
        self.src_tokenizer = AutoTokenizer.from_pretrained(self.src_model_name)

        self.batch_size = batch_size
        self.max_length = max_length

    def get_device(self):
        """Return the device the model currently lives on, as a string."""
        return str(self.src_model.device)

    def predict(self, texts, target_words=None, n=1):
        """Translate ``texts`` with the loaded model and tokenizer.

        ``target_words`` and ``n`` are accepted but not used.

        Returns:
            A list of translated strings, in input order.
        """
        return self.translate_one_step_batched(texts, self.src_tokenizer, self.src_model)

    def translate_one_step_batched(
            self, data, tokenizer, model
    ):
        """Tokenize ``data``, run ``model.generate`` batch by batch and decode.

        Args:
            data: Sentences to translate.
            tokenizer: Callable tokenizer that also provides ``batch_decode``.
            model: Model exposing ``generate``.

        Returns:
            A list of decoded strings, in input order; ``[]`` for an empty
            list or tuple.
        """
        # Nothing to translate: do not hand an empty batch to the tokenizer.
        if isinstance(data, (list, tuple)) and len(data) == 0:
            return []

        tokenized_texts = tokenizer(data, padding=True, truncation=True, return_tensors='pt')
        # Pick the two tensors by name rather than relying on how many
        # entries the tokenizer returns and in which order.
        tokenized_dataset = t_data.TensorDataset(
            tokenized_texts['input_ids'], tokenized_texts['attention_mask']
        )
        # The tensors are already in memory, so load in the main process
        # instead of starting a worker process on every call.
        tokenized_dataloader = t_data.DataLoader(
            tokenized_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=0
        )

        all_translated_texts = []
        with torch.no_grad():
            for input_ids, attention_mask in tokenized_dataloader:
                translated_ids_batch = model.generate(
                    input_ids=input_ids.to(self.device),
                    attention_mask=attention_mask.to(self.device),
                    max_length=self.max_length
                )
                all_translated_texts.extend(
                    tokenizer.batch_decode(
                        translated_ids_batch.detach().cpu().numpy(),
                        skip_special_tokens=True
                    )
                )

        return all_translated_texts

## English to French test

In [None]:
# trans_model = MtTransformers(src_model_name='Helsinki-NLP/opus-mt-en-fr', silence=True, batch_size=32, max_length=None)

In [None]:
# test_sentences = [
#     'hello how are you', 'all good in the hood'
# ]
# trans_model.predict(test_sentences)

## French to English test
Other model options for this direction:

Helsinki-NLP/opus-mt-tc-big-fr-en


In [None]:
# French -> English translator; the translate_skills* helpers below read this global `trans_model`.
trans_model = MtTransformers(src_model_name='Helsinki-NLP/opus-mt-fr-en', silence=True, batch_size=100, max_length=None)

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/287M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/784k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

In [None]:
# Smoke test: translate two French sentences and display the English output.
test_sentences = ['Bonjour, comment allez-vous?', 'Tout bon dans le capot']
trans_model.predict(test_sentences)

['Hello, how are you?', 'All right in the hood']

## Translating Skills

In [None]:
# id
def translate_skills(df, file_name, id_col = 'job_offer_id'):
    """Translate each row's ``skills`` and append the result to ``file_name``.

    Every row with a non-empty ``skills`` string is split on ``'|'``,
    translated with the global ``trans_model`` and written as one JSON line
    ``{"id": ..., "en_skills": [...]}``. If ``file_name`` already exists,
    rows whose ``id_col`` value is already recorded there are skipped, so an
    interrupted run can be resumed.

    Args:
        df: Frame with a ``skills`` column and an ``id_col`` column.
        file_name: JSON-lines output path (appended to).
        id_col: Name of the identifier column in ``df``.
    """
    if os.path.exists(file_name):
        exist_df = pd.DataFrame(read_json_lines(file_name))
        print('all records: ', df.shape[0])
        # An existing file without records yields a frame with no 'id'
        # column; there is nothing to filter out in that case.
        if 'id' in exist_df.columns:
            df = df[~df[id_col].isin(exist_df['id'])]
        print('rest of records: ', df.shape[0])
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        raw_skills = row['skills']
        # Skip missing values (None/NaN) as well as empty strings.
        if not isinstance(raw_skills, str) or not raw_skills:
            continue
        rows = {
            'id': row[id_col],
            'en_skills': trans_model.predict(raw_skills.split('|')),
        }
        write_json_lines(file_name, rows)
   #row['skills_en'] = en_skills
   #print(en_skills)
#skills_english.append('|'.join(en_skills))
#df['skills_en'] = skills_english
def process_skills(file_name, pbar, id_col='job_offer_id'):
    """Build a per-record worker that translates skills into ``file_name``.

    The returned callable takes one record (a dict). When its ``skills``
    value is a non-empty string, it is split on ``'|'``, translated with the
    global ``trans_model`` and appended to ``file_name`` as one JSON line
    ``{"id": ..., "en_skills": [...]}``.

    Args:
        file_name: JSON-lines output path (appended to).
        pbar: Progress object exposing ``update()``.
        id_col: Key of the identifier in each record.

    Returns:
        The worker callable; it returns ``None``.
    """
    def process_skills_(rec):
        raw_skills = rec['skills']
        # Skip missing values (None/NaN) as well as empty strings.
        if isinstance(raw_skills, str) and raw_skills:
            rows = {
                'id': rec[id_col],
                'en_skills': trans_model.predict(raw_skills.split('|')),
            }
            write_json_lines(file_name, rows)
        # Count every handled record, including skipped ones, so the bar
        # can reach its total.
        pbar.update()
    return process_skills_
def translate_skills_parallel(df, file_name, n_threads=10):
    """Translate the ``skills`` column of ``df`` using a pool of threads.

    Records whose ``job_offer_id`` is already present in ``file_name`` are
    skipped. The remaining records are handed to the worker built by
    ``process_skills``, which appends its results to ``file_name``.

    Args:
        df: Frame with ``job_offer_id`` and ``skills`` columns.
        file_name: JSON-lines output path (appended to).
        n_threads: Size of the thread pool.
    """
    if os.path.exists(file_name):
        exist_df = pd.DataFrame(read_json_lines(file_name))
        # An existing file without records yields a frame with no 'id'
        # column; there is nothing to filter out in that case.
        if 'id' in exist_df.columns:
            df = df[~df['job_offer_id'].isin(exist_df['id'])]
    pbar = tqdm(total = df.shape[0])
    p_skills = process_skills(file_name, pbar)
    try:
        # The context manager releases the pool's threads once map() is done.
        with ThreadPool(n_threads) as p:
            p.map(p_skills, df.to_dict('records'))
    finally:
        pbar.close()
    # NOTE(review): the original carried the bare numbers 122555 and 59735
    # here, presumably record counts from earlier runs; confirm.

## Job Offers

### Extracted Using ESCO Rules

In [None]:
# df = pd.DataFrame(read_json_lines(join(SKILL_OUT, 'fr_extracted_skills_v3.jsonl')))

In [None]:
#translate_skills(df, join(ACC_DIR, 'en_skills_ESCO_transformers.jsonl'))
# translate_skills_parallel(df, join(ACC_DIR, 'en_skills_ESCO_transformers.jsonl'))

### Extracted Using Pole Emploi Rules

In [None]:
# df_pe = pd.DataFrame(read_json_lines(join(SKILL_OUT, 'extracted_skills_pole_emploi_v2.jsonl')))

In [None]:
# translate_skills(df_pe, join(ACC_DIR, 'en_skills_PE_transformers.jsonl'))

## Coursera Course Skills

### Extracted Using ESCO Rules

In [None]:
# Load the Coursera skills extracted with the ESCO rules from the JSON-lines file.
df_co = pd.DataFrame(read_json_lines(join(SKILL_OUT, 'fr_coursera_extracted_skills_v2.jsonl')))

In [None]:
df_co

Unnamed: 0,id,skills
0,NJSdGN71Eeq4CApSN3OTvQ,c#
1,DMkcgX7LEeyRTg6FtAvfBw,compétences de base|différents types
2,YLO0oGSUEeyIUg4Qv2RsBQ,
3,69Bku0KoEeWZtA4u62x6lQ,
4,0HiU7Oe4EeWTAQ4yevf_oQ,
...,...,...
10175,QJlWBW30Eeq2hwr9iuARBQ,
10176,XSLeWQ6pEeu9ZBLzQTJEhw,
10177,7ErrkakwEeyxDg4ukgkVlw,
10178,xcqAU7oaEeq1DQ4cuiU-Sw,


In [None]:
# Translate the ESCO-rule Coursera skills; this frame's identifier column is 'id', not the default 'job_offer_id'.
translate_skills(df_co, join(ACC_DIR, 'en_skills_coursera_transformers.jsonl'), 'id')

  0%|          | 0/10180 [00:00<?, ?it/s]

### Extracted Using Pole Emploi Rules

In [None]:
# Load the Coursera skills extracted with the Pole Emploi rules.
# NOTE(review): this rebinds `df_co`, replacing the ESCO-rule frame loaded earlier.
df_co = pd.DataFrame(read_json_lines(join(SKILL_OUT, 'fr_coursera_extracted_skills_pole_emploi_v1.jsonl')))

In [None]:
df_co

Unnamed: 0,id,skills
0,NJSdGN71Eeq4CApSN3OTvQ,
1,DMkcgX7LEeyRTg6FtAvfBw,différents types|intégrer des applications|tab...
2,YLO0oGSUEeyIUg4Qv2RsBQ,
3,69Bku0KoEeWZtA4u62x6lQ,business
4,0HiU7Oe4EeWTAQ4yevf_oQ,
...,...,...
10175,QJlWBW30Eeq2hwr9iuARBQ,
10176,XSLeWQ6pEeu9ZBLzQTJEhw,
10177,7ErrkakwEeyxDg4ukgkVlw,
10178,xcqAU7oaEeq1DQ4cuiU-Sw,


In [None]:
# Translate the Pole-Emploi-rule Coursera skills; this frame's identifier column is 'id', not the default 'job_offer_id'.
translate_skills(df_co, join(ACC_DIR, 'en_skills_coursera_PE_transformers.jsonl'), 'id')

  0%|          | 0/10180 [00:00<?, ?it/s]