In [21]:
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

#gensim
import gensim
from gensim import corpora

#Visualization
from spacy import displacy
#import pyLDAvis.gensim_models
#from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt

#Data loading/ Data manipulation
import pandas as pd
import numpy as np
import json
import jsonlines
from collections import Counter
from tqdm.notebook import tqdm
import gc

#nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet', 'omw-1.4'])

#warning
import warnings 
#from PyPDF2 import PdfFileReader
from os.path import join
from tqdm.notebook import tqdm
import os

import torch
from torch.utils import data as t_data
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from multiprocessing.pool import ThreadPool
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

tqdm.pandas()

warnings.filterwarnings('ignore')

ACC_DIR = 'data/transformed_data/acceptability'
SKILL_OUT = 'data/transformed_data/extracted_skills'


[nltk_data] Downloading package stopwords to /home/vortex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vortex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vortex/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [22]:

classifier = pipeline(task="text-classification", model='textattack/albert-base-v2-CoLA', device = 0, padding='max_length', truncation=True)


In [23]:
classifier.device

device(type='cuda', index=0)

In [24]:
def write_json_lines(file_name,dict_data):
    json_string = json.dumps(dict_data)
    with open(file_name, 'a') as f:
        f.write(json_string+"\n")
        
def read_json_lines(file_name):
    lines = []
    with open(file_name) as file_in:
        for line in file_in:
            lines.append(json.loads(line))
    return lines

def get_clean_transformer_data(df):
    df = df.explode('en_skills').reset_index(drop=True)
    df['en_skills'] = df['en_skills'].str.replace("I'm sorry, but I don't know.", '')
    return df

def infer_(df, classfier):
    all_data = []
    for k,g in tqdm(df.groupby(np.arange(len(df))//2000)):
#         return g
        group_skills = list(g['en_skills'])
        ids = list(g['id'])
#         print(group_skills)
        all_classes = classfier(group_skills)
        for ix, i in enumerate(all_classes):
            all_classes[ix]['id'] = ids[ix]
        all_data += all_classes
    return all_data

def process_acceptability_df(acc_df):
    labels_series = pd.DataFrame(acc_df).groupby('id')['label'].apply(list)
    score_series = pd.DataFrame(acc_df).groupby('id')['score'].apply(list)
    return labels_series, score_series
    
def concate_tables(acc_df):
    labels_series, score_series = process_acceptability_df(acc_df)
    original_df = pd.DataFrame(read_json_lines(join(SKILL_OUT, 'fr_extracted_skills_v3.jsonl')))
    original_df = original_df.set_index('job_offer_id')
    translated_ = pd.DataFrame(read_json_lines(join(ACC_DIR, 'en_skills_ESCO_transformers.jsonl')))
    translated_ = translated_.set_index('id')
    original_df['en_skills'] = translated_['en_skills']
    original_df['labels'] = labels_series
    original_df['scores'] = score_series
    return original_df
#         return all_classes

## Transformer Acceptability

### Job Offers: Translating skills extracted using ESCO extracted patterns

In [5]:
df = pd.DataFrame(read_json_lines(join(ACC_DIR, 'en_skills_ESCO_transformers.jsonl')))

In [6]:
df

Unnamed: 0,id,en_skills
0,0,"[use of equipment, s--------------------------..."
1,1,"[I'm sorry, but I don't know.]"
2,2,"[I'm sorry, but I don't know.]"
3,110,"[I'm sorry, but I don't know.]"
4,1015,[mechanical]
...,...,...
115717,83520,"[improve assortment, analyse indicators on mar..."
115718,83519,"[respond to requests, Technical Experts, defin..."
115719,71931,"[close collaboration, propose a publication sc..."
115720,62300,[s--------------------------------------------...


In [7]:
df_ = get_clean_transformer_data(df)

In [8]:
df_.head()

Unnamed: 0,id,en_skills
0,0,use of equipment
1,0,s---------------------------------------------...
2,0,resolve incidents
3,0,physical architectures
4,0,working environment


In [9]:
all_extracted_acceptability = infer_(df_, classifier)

  0%|          | 0/492 [00:00<?, ?it/s]

In [15]:
df

Unnamed: 0,id,en_skills
0,0,"[use of equipment, s--------------------------..."
1,1,"[I'm sorry, but I don't know.]"
2,2,"[I'm sorry, but I don't know.]"
3,110,"[I'm sorry, but I don't know.]"
4,1015,[mechanical]
...,...,...
115717,83520,"[improve assortment, analyse indicators on mar..."
115718,83519,"[respond to requests, Technical Experts, defin..."
115719,71931,"[close collaboration, propose a publication sc..."
115720,62300,[s--------------------------------------------...


In [22]:
pd.DataFrame(all_extracted_acceptability).to_csv(join(ACC_DIR, 'en_skills_ESCO_transformers_raw_acceptability.csv'), index = False)

### Job Offers: Translating skills extracted using Pole Emploi extracted patterns

In [5]:
df = pd.DataFrame(read_json_lines(join(ACC_DIR, 'en_skills_PE_transformers.jsonl')))

In [6]:
df

Unnamed: 0,id,en_skills
0,0,"[Customer Officer, use of equipment, Technical..."
1,2,[support a team]
2,1015,"[production management, mechanical]"
3,1016,[conduct of operations]
4,1017,"[Management information technology, managing d..."
...,...,...
116628,83520,"[development strategy, analyse indicators on m..."
116629,83519,"[validate the solution, Project management, fo..."
116630,71931,"[social networks, propose a publication schedu..."
116631,62300,"[implementation, Social balance sheet, dashboa..."


In [7]:
df_ = get_clean_transformer_data(df)

In [8]:
df_.shape

(784099, 2)

In [9]:
all_extracted_acceptability = infer_(df_, classifier)

  0%|          | 0/393 [00:00<?, ?it/s]

In [11]:
pd.DataFrame(all_extracted_acceptability).to_csv(join(ACC_DIR, 'en_skills_PE_transformers_raw_acceptability.csv'), index = False)

### Coursera: Translating skills extracted using  ESCO extracted patterns

In [36]:
df = pd.DataFrame(read_json_lines(join(ACC_DIR, 'en_skills_coursera_transformers.jsonl')))

In [37]:
df

Unnamed: 0,id,en_skills
0,NJSdGN71Eeq4CApSN3OTvQ,[c#]
1,DMkcgX7LEeyRTg6FtAvfBw,"[basic skills, different types]"
2,rrZX06kxEeyjPxJkzIUqCw,[self-]
3,B6LZjKkwEeyjPxJkzIUqCw,[self-]
4,RdmZEakyEeyjPxJkzIUqCw,[self-]
...,...,...
1266,04e4_kIxEeuVrwrlQzvd6w,[Basic principles]
1267,N7ghLqkpEeyjPxJkzIUqCw,[self-]
1268,Zz0lopT9EeyHuA5cIqn4NQ,[test components]
1269,3bL_2qkxEeyjPxJkzIUqCw,[self-]


In [38]:
df_ = get_clean_transformer_data(df)

In [39]:
all_extracted_acceptability = infer_(df_, classifier)

  0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
pd.DataFrame(all_extracted_acceptability).to_csv(join(ACC_DIR, 'en_skills_coursera_transformers_raw_acceptability.csv'), index = False)

In [41]:
pd.read_csv(join(ACC_DIR, 'en_skills_coursera_transformers_raw_acceptability.csv'))

Unnamed: 0,label,score,id
0,LABEL_1,0.929184,NJSdGN71Eeq4CApSN3OTvQ
1,LABEL_1,0.866787,DMkcgX7LEeyRTg6FtAvfBw
2,LABEL_1,0.916817,DMkcgX7LEeyRTg6FtAvfBw
3,LABEL_0,0.915861,rrZX06kxEeyjPxJkzIUqCw
4,LABEL_0,0.915861,B6LZjKkwEeyjPxJkzIUqCw
...,...,...,...
3612,LABEL_1,0.881153,04e4_kIxEeuVrwrlQzvd6w
3613,LABEL_0,0.915861,N7ghLqkpEeyjPxJkzIUqCw
3614,LABEL_1,0.701536,Zz0lopT9EeyHuA5cIqn4NQ
3615,LABEL_0,0.915861,3bL_2qkxEeyjPxJkzIUqCw


### Coursera: Translating skills extracted using  Pol Emploi extracted patterns

In [18]:
df = pd.DataFrame(read_json_lines(join(ACC_DIR, 'en_skills_coursera_PE_transformers.jsonl')))

In [19]:
df_ = get_clean_transformer_data(df)

In [20]:
all_extracted_acceptability = infer_(df_, classifier)

  0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
pd.DataFrame(all_extracted_acceptability).to_csv(join(ACC_DIR, 'en_skills_coursera_PE_transformers_raw_acceptability.csv'), index = False)

## Argos Acceptability

### Job Offers: Translating skills extracted using ESCO extracted patterns

In [5]:
df = pd.DataFrame(read_json_lines(join(ACC_DIR, 'en_skills_ESCO_argos.jsonl')))

In [6]:
df_ = get_clean_transformer_data(df)

In [7]:
all_extracted_acceptability = infer_(df_, classifier)

  0%|          | 0/497 [00:00<?, ?it/s]

In [8]:
pd.DataFrame(all_extracted_acceptability).to_csv(join(ACC_DIR, 'en_skills_ESCO_argos_raw_acceptability.csv'), index = False)

### Job Offers: Translating skills extracted using Pole Emploi extracted patterns

In [9]:
df = pd.DataFrame(read_json_lines(join(ACC_DIR, 'en_skills_pole_emploi_argos.jsonl')))


In [10]:
df_ = get_clean_transformer_data(df)

In [11]:
all_extracted_acceptability = infer_(df_, classifier)

  0%|          | 0/397 [00:00<?, ?it/s]

In [12]:
pd.DataFrame(all_extracted_acceptability).to_csv(join(ACC_DIR, 'en_skills_pole_emploi_argos_raw_acceptability.csv'), index = False)

In [31]:
pd.read_csv(join(ACC_DIR, 'en_skills_pole_emploi_argos_raw_acceptability.csv'))

Unnamed: 0,label,score,id
0,LABEL_1,0.581924,0
1,LABEL_1,0.841982,0
2,LABEL_1,0.756916,0
3,LABEL_1,0.687168,0
4,LABEL_1,0.765600,0
...,...,...,...
792483,LABEL_1,0.910732,51718
792484,LABEL_1,0.842067,51718
792485,LABEL_1,0.604587,51718
792486,LABEL_1,0.933235,51718


### Coursera: Translating skills extracted using  ESCO extracted patterns

In [27]:
df = pd.DataFrame(read_json_lines(join(ACC_DIR, 'en_skills_coursera_argos.jsonl')))


In [28]:
df

Unnamed: 0,id,en_skills
0,NJSdGN71Eeq4CApSN3OTvQ,[c#]
1,DMkcgX7LEeyRTg6FtAvfBw,"[core competencies, different types]"
2,YLO0oGSUEeyIUg4Qv2RsBQ,[]
3,69Bku0KoEeWZtA4u62x6lQ,[]
4,0HiU7Oe4EeWTAQ4yevf_oQ,[]
...,...,...
10175,QJlWBW30Eeq2hwr9iuARBQ,[]
10176,XSLeWQ6pEeu9ZBLzQTJEhw,[]
10177,7ErrkakwEeyxDg4ukgkVlw,[]
10178,xcqAU7oaEeq1DQ4cuiU-Sw,[]


In [14]:
df_ = get_clean_transformer_data(df)

In [15]:
all_extracted_acceptability = infer_(df_, classifier)

  0%|          | 0/7 [00:00<?, ?it/s]

In [16]:
pd.DataFrame(all_extracted_acceptability).to_csv(join(ACC_DIR, 'en_skills_coursera_argos_raw_acceptability.csv'), index = False)

In [29]:
pd.read_csv(join(ACC_DIR, 'en_skills_coursera_argos_raw_acceptability.csv'))

Unnamed: 0,label,score,id
0,LABEL_1,0.929184,NJSdGN71Eeq4CApSN3OTvQ
1,LABEL_1,0.886631,DMkcgX7LEeyRTg6FtAvfBw
2,LABEL_1,0.916817,DMkcgX7LEeyRTg6FtAvfBw
3,LABEL_1,0.743641,YLO0oGSUEeyIUg4Qv2RsBQ
4,LABEL_1,0.743641,69Bku0KoEeWZtA4u62x6lQ
...,...,...,...
12521,LABEL_1,0.743641,QJlWBW30Eeq2hwr9iuARBQ
12522,LABEL_1,0.743641,XSLeWQ6pEeu9ZBLzQTJEhw
12523,LABEL_1,0.743641,7ErrkakwEeyxDg4ukgkVlw
12524,LABEL_1,0.743641,xcqAU7oaEeq1DQ4cuiU-Sw


### Coursera: Translating skills extracted using  Pol Emploi extracted patterns

In [25]:
df = pd.DataFrame(read_json_lines(join(ACC_DIR, 'en_skills_coursera_PE_argos.jsonl')))

In [26]:
df

Unnamed: 0,id,en_skills
0,NJSdGN71Eeq4CApSN3OTvQ,[]
1,DMkcgX7LEeyRTg6FtAvfBw,"[different types, integrate applications, dash..."
2,YLO0oGSUEeyIUg4Qv2RsBQ,[]
3,69Bku0KoEeWZtA4u62x6lQ,[business]
4,0HiU7Oe4EeWTAQ4yevf_oQ,[]
...,...,...
10175,QJlWBW30Eeq2hwr9iuARBQ,[]
10176,XSLeWQ6pEeu9ZBLzQTJEhw,[]
10177,7ErrkakwEeyxDg4ukgkVlw,[]
10178,xcqAU7oaEeq1DQ4cuiU-Sw,[]


In [18]:
df_ = get_clean_transformer_data(df)

In [19]:
all_extracted_acceptability = infer_(df_, classifier)

  0%|          | 0/6 [00:00<?, ?it/s]

In [20]:
pd.DataFrame(all_extracted_acceptability).to_csv(join(ACC_DIR, 'en_skills_coursera_PE_argos_raw_acceptability.csv'), index = False)

In [30]:
pd.read_csv(join(ACC_DIR, 'en_skills_coursera_PE_argos_raw_acceptability.csv'))

Unnamed: 0,label,score,id
0,LABEL_1,0.743641,NJSdGN71Eeq4CApSN3OTvQ
1,LABEL_1,0.916817,DMkcgX7LEeyRTg6FtAvfBw
2,LABEL_1,0.782994,DMkcgX7LEeyRTg6FtAvfBw
3,LABEL_1,0.688642,DMkcgX7LEeyRTg6FtAvfBw
4,LABEL_1,0.743641,YLO0oGSUEeyIUg4Qv2RsBQ
...,...,...,...
11458,LABEL_1,0.743641,QJlWBW30Eeq2hwr9iuARBQ
11459,LABEL_1,0.743641,XSLeWQ6pEeu9ZBLzQTJEhw
11460,LABEL_1,0.743641,7ErrkakwEeyxDg4ukgkVlw
11461,LABEL_1,0.743641,xcqAU7oaEeq1DQ4cuiU-Sw


## Process Acceptability

In [24]:
accept_df = pd.read_csv(join(ACC_DIR, 'en_skills_ESCO_transformers_raw_acceptability.csv'))

In [25]:
pd.DataFrame(accept_df).groupby('id')['label'].apply(list)


id
0         [LABEL_1, LABEL_0, LABEL_1, LABEL_1, LABEL_1, ...
1                                                 [LABEL_1]
2                                                 [LABEL_1]
110                                               [LABEL_1]
1015                                              [LABEL_1]
                                ...                        
142148                 [LABEL_1, LABEL_1, LABEL_0, LABEL_1]
142149                                            [LABEL_1]
142150    [LABEL_1, LABEL_1, LABEL_0, LABEL_1, LABEL_1, ...
142151    [LABEL_1, LABEL_1, LABEL_1, LABEL_1, LABEL_1, ...
142152    [LABEL_0, LABEL_1, LABEL_0, LABEL_1, LABEL_1, ...
Name: label, Length: 115722, dtype: object

In [26]:
pd.DataFrame(accept_df).groupby('id')['score'].apply(list)


id
0         [0.8419815301895142, 0.7044380307197571, 0.815...
1                                      [0.7436407804489136]
2                                      [0.7436407804489136]
110                                    [0.7436407804489136]
1015                                   [0.7579735517501831]
                                ...                        
142148    [0.5893144011497498, 0.6201562285423279, 0.561...
142149                                 [0.5525694489479065]
142150    [0.8487229347229004, 0.8083083033561707, 0.737...
142151    [0.958845555782318, 0.6138927936553955, 0.9559...
142152    [0.7044380307197571, 0.7916160225868225, 0.511...
Name: score, Length: 115722, dtype: object

In [36]:
concate_tables(accept_df).to_csv(join(ACC_DIR,'full_accept_results_en_skills_ESCO.csv'))

In [10]:
# padding='max_length', truncation=True

In [22]:
pd.read_csv(join(ACC_DIR,'full_accept_results_en_skills_ESCO.csv'))

Unnamed: 0,job_offer_id,skills,en_skills,labels,scores
0,0,utilisation de matériel|s’|résoudre des incide...,"['use of equipment', 's-----------------------...","['LABEL_1', 'LABEL_0', 'LABEL_1', 'LABEL_1', '...","[0.8419815301895142, 0.7044380307197571, 0.815..."
1,1,,"[""I'm sorry, but I don't know.""]",['LABEL_1'],[0.7436407804489136]
2,2,,"[""I'm sorry, but I don't know.""]",['LABEL_1'],[0.7436407804489136]
3,110,,"[""I'm sorry, but I don't know.""]",['LABEL_1'],[0.7436407804489136]
4,1015,mécanique,['mechanical'],['LABEL_1'],[0.7579735517501831]
...,...,...,...,...,...
123986,52062,bonne connaissance|assurer la surveillance des...,"['good knowledge', 'ensure monitoring of appli...","['LABEL_0', 'LABEL_1', 'LABEL_1', 'LABEL_0', '...","[0.5611529350280762, 0.8978632688522339, 0.676..."
123987,51878,assurer le contrôle de la disponibilité au niv...,"['ensuring control of stock availability', 'go...","['LABEL_1', 'LABEL_0', 'LABEL_0', 'LABEL_1', '...","[0.9361907243728638, 0.5611529350280762, 0.682..."
123988,51877,s’|maîtriser les règles de sécurité|suivre des...,['s-------------------------------------------...,"['LABEL_0', 'LABEL_1', 'LABEL_1', 'LABEL_1', '...","[0.7044380307197571, 0.8789429068565369, 0.831..."
123989,51765,management de la qualité|outil informatique|as...,"['quality management', 'IT tool', 'ensuring th...","['LABEL_1', 'LABEL_0', 'LABEL_1', 'LABEL_1']","[0.742351770401001, 0.9530846476554872, 0.8749..."


In [12]:
classify_acceptability(df, join(ACC_DIR, 'en_skills_ESCO_transformers_acceptability.jsonl'))

  0%|          | 0/115722 [00:00<?, ?it/s]

In [27]:
pd.read_csv(join(ACC_DIR,'full_accept_results_en_skills_ESCO.csv'))['en_skills'].map(json.loads)

JSONDecodeError: Expecting value: line 1 column 2 (char 1)