In [1]:
#spacy
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

#gensim
import gensim
from gensim import corpora

#Visualization
from spacy import displacy
import pyLDAvis.gensim_models
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt

#Data loading/ Data manipulation
import pandas as pd
import numpy as np
import json
import jsonlines
from collections import Counter
from tqdm.notebook import tqdm

#nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet', 'omw-1.4'])

#warning
import warnings 
#from PyPDF2 import PdfFileReader
from os.path import join
from tqdm.notebook import tqdm
tqdm.pandas()

warnings.filterwarnings('ignore')
DATA_FILES = 'data/rules'
PRO_FILES = 'data/source_data/SkillDB'
SKILL_OUT = 'data/transformed_data/extracted_skills'
JO_OUT = 'data/transformed_data/job_offers_out'

2023-02-09 13:34:23.911708: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-09 13:34:24.454248: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/pc/anaconda3/envs/testenv1/lib
2023-02-09 13:34:24.454290: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/pc/anaconda3/envs/testenv1/lib
2023-02-09 13:34:25.175956: I tensorflow/compiler/xla/

In [2]:
def write_json_lines(file_name,dict_data):
    json_string = json.dumps(dict_data)
    with open(file_name, 'a') as f:
        f.write(json_string+"\n")
        
def read_json_lines(file_name):
    lines = []
    with open(file_name) as file_in:
        for line in file_in:
            lines.append(json.loads(line))
    return lines



In [3]:
fr_nlp = spacy.load('fr_core_news_sm')  # download model first
en_nlp = spacy.load('en_core_web_sm')
# fr_core_news_sm

## Rule Creation for French Skills

In [4]:
def continued_sequence(tst):
    not_all = True
    prev_len = len(tst)
    new_len = len(tst)
    while not_all:
      for i in range(len(tst)):
          if i < len(tst)-1:
              if (i>1) & (tst[i+1]-tst[i]==1):
                  # print(tst[i])
                  # tst[i] = tst[i+1]
                  del tst[i]
                  not_all = True
                  break
      new_len = len(tst)
      if new_len == prev_len:
          break
      else:
          prev_len = new_len
    return tst

In [5]:
def get_sent_meta(sent):
    all_sents = []
    old_sent = ''
    for idx, token in enumerate(sent):
        old_sent += ' '+ token.text
        sent_meta = {}
        sent_meta['pos'] = token.pos_
        sent_meta['text'] = token.text
        sent_meta['lemma'] = token.lemma_
        sent_meta['i'] = token.i
        all_sents.append(sent_meta)
    return pd.DataFrame(all_sents)

def get_sub_phrases(sent):
    sent_df = get_sent_meta(sent)
    df_meta_ = sent_df[sent_df['pos'].isin(['NOUN','ADJ'])]
    df_meta_i = continued_sequence(list(df_meta_['i']))
    res = []
    for i in range(len(df_meta_i)):
        if i < len(df_meta_i)-1:
          phrase = ' '.join(list(sent_df[(sent_df['i']>=df_meta_i[i])&(sent_df['i']<=df_meta_i[i+1])]['text']))
          # print(phrase)
          res.append(phrase)
          phrase = ' '.join(list(sent_df[(sent_df['i']>=df_meta_i[i])&(sent_df['i']<=df_meta_i[i+1])]['lemma']))
          # print(phrase)
          res.append(phrase)
    return res

In [6]:
def segment_skill(sent):
    all_sents = []
    all_sents_rep = []
    old_sent = ''
    old_phrase_sent = ''
    first_noun = False
    first_noun_ = True
    for idx, token in enumerate(sent):
        old_sent += ' '+ token.text
        if first_noun:
            old_phrase_sent += ' '+ token.text
        if (idx == 0) & (token.pos_ == 'VERB'):
            all_sents.append(old_sent.strip())
        elif (token.pos_ == 'NOUN') | (token.pos_ == 'ADJ'):
            first_noun  = True
            if len(old_sent.strip().split())>1:
                all_sents.append(old_sent.strip())
            if first_noun & first_noun_:
                old_phrase_sent += ' '+ token.text
                first_noun_ = False
            if len(old_phrase_sent.strip().split())>1:
                all_sents_rep.append(old_phrase_sent.strip())
    return list(set(all_sents + all_sents_rep))

def segment_skill_rules(sent):
    all_sents = {}
    all_sents_rep = {}
    all_sents_text = []
    all_sents_rep_text = []
    old_sent = ''
    old_phrase_sent = ''
    first_noun = False
    first_noun_ = True
    prev_pos = []
    all_pos = [token.pos_ for token in sent]
    
    for idx, token in enumerate(sent):
        old_sent += ' '+ token.text
        prev_pos.append(token.pos_)
        if first_noun:
            old_phrase_sent += ' '+ token.text
        if (idx == 0) & (token.pos_ == 'VERB'):
            rule_form = {}
            skill = old_sent.strip()
#             rule_form['label'] = 'SKILL'
#             rule_form['pattern'] = [{'LOWER':word.lower()} for word in skill.split(' ')]
#             all_sents.append(rule_form)
            rule_form = {}
            rule_form['label'] = 'SKILL'
            rule_form['pattern'] = [{'LOWER':word.lower()} for word in skill.split(' ')]
            first_ = True
            for idx_, token_ in enumerate(sent):
                if idx_> idx:
#                     if first_:
#                         first_ = False
#                         rule_form['pattern'][0].update({'POS':token_.pos_})
#                     else:
                    rule_form['pattern'].append({'POS':token_.pos_})
            all_sents[skill] = rule_form
            all_sents_text.append(skill)
#             all_sents.append(old_sent.strip())
        elif (token.pos_ == 'NOUN') | (token.pos_ == 'ADJ'):
            first_noun  = True
            if len(old_sent.strip().split())>1:
                skill = old_sent.strip()
                rule_form = {}
                rule_form['label'] = 'SKILL'
                rule_form['pattern'] = [{'LOWER':word.lower()} for word in skill.split(' ')]
                all_sents[skill] = rule_form
                all_sents_text.append(skill)
#                 all_sents.append(rule_form)
            if first_noun & first_noun_:
                old_phrase_sent += ' '+ token.text
                first_noun_ = False
            if len(old_phrase_sent.strip().split())>1:
                skill = old_phrase_sent.strip()
                rule_form = {}
                rule_form['label'] = 'SKILL'
                rule_form['pattern'] = [{'LOWER':word.lower()} for word in skill.split(' ')]
                all_sents_rep[skill] = rule_form
#                 all_sents_rep.append(rule_form)
                all_sents_rep_text.append(skill)
    all_sents.update(all_sents_rep)
    return all_sents, list(set(all_sents_text + all_sents_rep_text))

def get_rules(skills_file, file_name):
  for ind, row in skills_file.iterrows():
      skill = row['name']
      rule_form = {}
      rule_form['label'] = 'SKILL'
      rule_form['pattern'] = [{'LOWER':word.lower()} for word in skill.split(' ')]
      write_json_lines(file_name,rule_form)

def get_skill_patterns(skill_name):
    all_skills = []
    skills = fr_nlp(skill_name)
    skills = list(skills.sents)[0]
    print('-'*12)
    rules_, text_ = segment_skill_rules(sent = skills)
    print(rules_)
    print('/'*12)
    print(text_)
    all_skill_pat = get_sub_phrases(skills)
    all_skills += all_skill_pat
    print('-'*12)
    print(all_skill_pat)
    all_skill_pat = segment_skill(skills)
    all_skills += all_skill_pat
    print('-'*12)
    print(all_skill_pat)
    all_skills = list(set(all_skills))
    return all_skills

def get_skill_patterns_v2(skill_name):
    all_rules = {}
    skills = fr_nlp(skill_name)
    skills = list(skills.sents)[0]
    rules_, text_ = segment_skill_rules(sent = skills)
#     print(rules_)
    all_rules.update(rules_)
    all_skill_pat = get_sub_phrases(skills)
    for skill in all_skill_pat:
        rule_form = {}
        rule_form['label'] = 'SKILL'
        rule_form['pattern'] = [{'LOWER':word.lower()} for word in skill.split(' ')]
        all_rules[skill] = rule_form
    return all_rules

def get_rules_v2(skills_file, file_name):
  pbar = tqdm(total = skills_file.shape[0])
  for ind, row in skills_file.iterrows():
      skills = row['preferredLabel']
      all_skills = get_skill_patterns(skill_name = skills)
      for skill in all_skills:
          rule_form = {}
          rule_form['label'] = 'SKILL'
          rule_form['pattern'] = [{'LOWER':word.lower()} for word in skill.split(' ')]
          # print(rule_form)
          write_json_lines(file_name,rule_form)
      pbar.update()

def get_rules_v3(skills_file, file_name):
  pbar = tqdm(total = skills_file.shape[0])
  for ind, row in skills_file.iterrows():
      skills = row['preferredLabel']
      all_rules = list(get_skill_patterns_v2(skill_name = skills).values())
      for rule_form in all_rules:
          write_json_lines(file_name,rule_form)
      pbar.update()

def get_rules_v4(skills_list, file_name):
  pbar = tqdm(total = len(skills_list))
  for skill in skills_list:
      all_rules = list(get_skill_patterns_v2(skill_name = skill).values())
      for rule_form in all_rules:
          write_json_lines(file_name,rule_form)
      pbar.update()

def unique_skills(x):
    return list(set(x))

In [7]:
df = pd.read_csv(join(PRO_FILES, 'ESCO_FR', 'skills_fr.csv'))

In [56]:
get_rules_v3(df, join(DATA_FILES, 'fr_skill_patterns_final.jsonl'))

  0%|          | 0/13896 [00:00<?, ?it/s]

In [None]:
df

In [10]:
get_skill_patterns(skill_name = list(df['preferredLabel'])[0])

------------
{'gérer': {'label': 'SKILL', 'pattern': [{'LOWER': 'gérer'}, {'POS': 'DET'}, {'POS': 'NOUN'}, {'POS': 'NOUN'}]}, 'gérer le personnel': {'label': 'SKILL', 'pattern': [{'LOWER': 'gérer'}, {'LOWER': 'le'}, {'LOWER': 'personnel'}]}, 'gérer le personnel musical': {'label': 'SKILL', 'pattern': [{'LOWER': 'gérer'}, {'LOWER': 'le'}, {'LOWER': 'personnel'}, {'LOWER': 'musical'}]}, 'personnel musical': {'label': 'SKILL', 'pattern': [{'LOWER': 'personnel'}, {'LOWER': 'musical'}]}}
////////////
['gérer le personnel', 'gérer le personnel musical', 'gérer', 'personnel musical']
------------
['personnel musical', 'personnel musical']
------------
['gérer le personnel', 'gérer le personnel musical', 'gérer', 'personnel musical']


['personnel musical',
 'gérer le personnel musical',
 'gérer le personnel',
 'gérer']

In [45]:
list(get_skill_patterns_v2(skill_name = list(df['preferredLabel'])[0]).values())

{'gérer': {'label': 'SKILL', 'pattern': [{'LOWER': 'gérer'}, {'POS': 'DET'}, {'POS': 'NOUN'}, {'POS': 'NOUN'}]}, 'gérer le personnel': {'label': 'SKILL', 'pattern': [{'LOWER': 'gérer'}, {'LOWER': 'le'}, {'LOWER': 'personnel'}]}, 'gérer le personnel musical': {'label': 'SKILL', 'pattern': [{'LOWER': 'gérer'}, {'LOWER': 'le'}, {'LOWER': 'personnel'}, {'LOWER': 'musical'}]}, 'personnel musical': {'label': 'SKILL', 'pattern': [{'LOWER': 'personnel'}, {'LOWER': 'musical'}]}}


[{'label': 'SKILL',
  'pattern': [{'LOWER': 'gérer'},
   {'POS': 'DET'},
   {'POS': 'NOUN'},
   {'POS': 'NOUN'}]},
 {'label': 'SKILL',
  'pattern': [{'LOWER': 'gérer'}, {'LOWER': 'le'}, {'LOWER': 'personnel'}]},
 {'label': 'SKILL',
  'pattern': [{'LOWER': 'gérer'},
   {'LOWER': 'le'},
   {'LOWER': 'personnel'},
   {'LOWER': 'musical'}]},
 {'label': 'SKILL', 'pattern': [{'LOWER': 'personnel'}, {'LOWER': 'musical'}]}]

In [23]:
list(df['preferredLabel'])[0]

'gérer le personnel musical'

In [24]:
df

Unnamed: 0,conceptType,conceptUri,skillType,reuseLevel,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,scopeNote,definition,inScheme,description
0,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/0005c151-5b5a...,skill/competence,sector-specific,gérer le personnel musical,,,released,2016-12-20T17:43:43Z,,,http://data.europa.eu/esco/concept-scheme/skil...,Attribuer et gérer les tâches du personnel dan...
1,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00064735-8fad...,skill/competence,occupation-specific,superviser des procédures correctionnelles,,,released,2016-12-20T20:17:49Z,,,http://data.europa.eu/esco/concept-scheme/memb...,Superviser les opérations d’un établissement c...
2,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/000709ed-2be5...,skill/competence,sector-specific,appliquer des pratiques de travail social anti...,,,released,2016-12-20T19:18:19Z,,,http://data.europa.eu/esco/concept-scheme/skil...,"Identifier l’oppression dans les sociétés, les..."
3,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/0007bdc2-dd15...,skill/competence,sector-specific,contrôler la conformité à la réglementation su...,,,released,2016-12-20T20:02:19Z,,,http://data.europa.eu/esco/concept-scheme/skil...,"Inspecter le matériel roulant, les composants ..."
4,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/00090cc1-1f27...,skill/competence,cross-sector,recenser les services disponibles,,,released,2016-12-20T20:15:17Z,,,http://data.europa.eu/esco/concept-scheme/memb...,Identifier les différents services disponibles...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13891,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/ffef5eb3-a15e...,skill/competence,sector-specific,restaurer les capacités professionnelles d’un ...,,,released,2016-12-20T19:25:53Z,,,http://data.europa.eu/esco/concept-scheme/memb...,Restaurer ou améliorer les composantes cogniti...
13892,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/fff0b074-5a76...,skill/competence,sector-specific,installer l’éclairage dans des engins de trans...,,,released,2016-12-20T20:03:21Z,,,http://data.europa.eu/esco/concept-scheme/skil...,Installer des éléments d’éclairage dans des éq...
13893,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/fff0e2cd-d0bd...,knowledge,sector-specific,traitement automatique du langage naturel,traitement automatique des langages naturels\n...,,released,2022-07-05T14:34:09.904Z,,,http://data.europa.eu/esco/concept-scheme/skil...,Les technologies qui permettent aux dispositif...
13894,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/fff5bc45-b506...,skill/competence,cross-sector,coordonner des travaux de construction,,,released,2016-12-20T18:22:35Z,,,http://data.europa.eu/esco/concept-scheme/skil...,Coordonner les travaux de plusieurs ouvriers o...


In [57]:
gg = read_json_lines(join(DATA_FILES, 'fr_skill_patterns_final.jsonl'))

In [58]:
len(gg)

77862

### New Rules

In [22]:
def split_skills(skill_concat):
    if isinstance(skill_concat, str):
        skill_list = re.split("(?=,[A-Z])", skill_concat)
        skill_list = [i for i in skill_list if len(i)>0]
#         skill_list = [(i[:-1] if (i[-1]==',') else i).lower() for i in skill_list]
        skill_list = [(i[1:] if (i[0]==',') else i).lower() for i in skill_list]
        return skill_list
    else:
        return ['']
def get_all_pole_emploi_skills():
#     savoir_faire_base 	savoirs_base 	savoir_faire_specifiques 	savoirs_specifiques
    all_skills = []
    df = pd.read_csv(join(PRO_FILES, 'pole_emploi_complete.csv'))
    all_skills += sum([split_skills(i) for i in list(df['savoir_faire_base'])],[])
    all_skills += sum([split_skills(i) for i in list(df['savoirs_base'])],[])
    all_skills += sum([split_skills(i)  for i in list(df['savoir_faire_specifiques'])],[])
    all_skills += sum([split_skills(i)  for i in list(df['savoirs_specifiques'])],[])
    
    all_skills = [i.strip().lower() for i in all_skills if len(i)>0]
    return list(set(all_skills))

In [9]:
df = pd.read_csv(join(PRO_FILES, 'pole_emploi_complete.csv'))

In [10]:
list(df['savoir_faire_base'])

["Préparer le matériel, les matériaux et les outillages,Identifier le type d'intervention,Sécuriser un équipement,Atteler un équipement,Conduire les engins agricoles vers le lieu de production (champs, forêt), de stockage, de livraison (fermes, coopératives),Matérialiser une implantation de parcelle ou de passage,Approvisionner des engins d'exploitation,Réaliser une opération de maintenance",
 "Recenser les arbres à couper ou élaguer,Déterminer l'abattage ou l'élagage selon la trajectoire de chute des arbres en prenant en compte l'environnement et les conditions climatiques,Organiser l'abattage d'un arbre ou d'une branche,Réaliser l'abattage des arbres désignés (positionnement des coins, tronçonnage de la base, ...) selon la trajectoire de chute voulue,Elaguer les branches d'un arbre,Tailler les arbres selon l'orientation de croissance ou la forme recherchée et supprimer les branches indésirables,Débiter un arbre selon son usage,Entretenir des équipements,Assurer une maintenance de pre

In [14]:
df.shape

(521, 18)

In [23]:
pe_skills = get_all_pole_emploi_skills()

In [55]:
len(pe_skills)

11186

In [56]:
pe_skills

["techniques de réglage d'outillage",
 'surveiller ou réguler une installation de conditionnement, palettisation, étiquetage',
 "superviser un projet d'étude et développement",
 "surveiller l'état d'une plantation",
 'techniques de terrassement',
 'réaliser la préparation de pièces par refente',
 'produits frais',
 "délivrer des autorisations d'accès",
 'piloter la réalisation des entretiens annuels du personnel,établir un reporting social',
 'mettre en place des pages de gardes et de couvertures',
 'techniques de marouflage',
 'techniques de formation',
 'réaliser des relevés de sondage, forage, pollution des sols et sous-sols',
 "rédiger un conducteur d'émission tv",
 'poser des couvertures en pierre',
 "déterminer les évolutions des systèmes informatiques de gestion et de suivi d'une chaîne logistique",
 "ajuster la signalisation et actionner les systèmes de manoeuvre d'aiguillage selon la destination des trains",
 'chimie',
 'intervenir en établissement scolaire',
 'entraîneur de v

In [57]:
[i for i in pe_skills if len(i)==10]

['biérologie',
 'géomatique',
 'optométrie',
 'histologie',
 'géographie',
 'orthogénie',
 'géothermie',
 'optronique',
 'régulation',
 'cosmétique',
 'management',
 'radiologie',
 'neurologie',
 'aéraulique',
 'diagraphie',
 'généalogie',
 'argenterie',
 'handisport',
 'sociologie',
 'caséologie',
 'gnomonique',
 'mentalisme',
 'orthopédie',
 'métallerie',
 'tapisserie',
 'repoussage',
 'zootechnie',
 'endoscopie',
 'sismologie',
 'diététique',
 'hydrologie',
 'sportswear',
 'leadership',
 'métrologie',
 'sémiotique',
 'gemmologie',
 'e-business',
 'serrurerie',
 'ethnologie',
 'topométrie',
 'dosimétrie',
 'sténotypie',
 'e-commerce',
 'muséologie',
 'sémiologie']

In [38]:
# list(df['savoir_faire_base'])

In [39]:
# df

In [58]:
# get_rules_v3(df, join(DATA_FILES, 'fr_skill_patterns_pole_emploi_v3.jsonl'))
get_rules_v4(pe_skills, join(DATA_FILES, 'fr_skill_patterns_pole_emploi_v2.jsonl'))

  0%|          | 0/11186 [00:00<?, ?it/s]

## Rule Extraction

In [7]:
jo_df = pd.read_csv(join(DATA_REK, 'all_job_offers.csv'))

In [8]:
jo_df.head()

Unnamed: 0,job_title,city,function,sector,Expérience requise,Région,Niveau d'étude et formation,Type de contrat,Télétravail,soft_skills,...,Qualités requises pour réussir dans ce rôle,Ce que vous pouvez attendre de nous,pub_date,exp_date,pub_date_day,pub_date_month,pub_date_year,exp_date_day,exp_date_month,exp_date_year
0,Chargés de Clientèle en Assistance Technique,Casablanca,Call Centers (métiers de),Informatique / Electronique,De 1 à 3 ans,10 poste(s) sur Casablanca et région - Maroc,Bac +2,CDI,no info,,...,,,05/06/2017,05/08/2017,5,6,2017,5,8,2017
1,1 42 PHP 3,zerzer,Journalisme / Traduction,Administration des ventes / SAV,Débutant,Agadir et région - Afghanistan,Qualification avant Bac,CDD,no info,Rationalisme|Besoin d'autonomie|Recherche de n...,...,,,12/12/2005,12/12/2006,12,12,2005,12,12,2006
2,JAVA,PARIS (75),Communication / Publicité / RP,Audit / Conseil,De 5 à 10 ans,Meknès et région - Maroc,,CDI,no info,Flexibilité|Extraversion|Ambition|Recherche de...,...,,,28/01/2010,04/02/2010,28,1,2010,4,2,2010
3,110 Negotiator,Berlin,Audit / Conseil,Assurance / Courtage,Débutant,Autres régions - Albanie,Autodidacte,,no info,Flexibilité|Organisation|Rationalisme|Besoin d...,...,,,09/11/2005,29/12/2005,9,11,2005,29,12,2005
4,Responsable logistique et prodruction,Mohamédia,Production / Qualité / Sécurité / Maintenance,Aéronautique / Spatial,De 3 à 5 ans,3 poste(s) sur Casablanca et région - Maroc,,CDI,no info,Besoin d'objectivité|Ténacité|Organisation|Rat...,...,,,02/02/2006,02/03/2006,2,2,2006,2,3,2006


In [9]:
jo_df['Poste'] = jo_df['Poste'].replace(np.nan, '')
jo_df['Profil recherché'] = jo_df['Profil recherché'].replace(np.nan, '')
jo_df['Qualités requises pour réussir dans ce rôle'] = jo_df['Qualités requises pour réussir dans ce rôle'].replace(np.nan, '')

In [10]:
jo_df['post_data'] = jo_df['Poste']+jo_df['Profil recherché']+jo_df['Qualités requises pour réussir dans ce rôle']

In [11]:
jo_df.iloc[0,:]

job_title                                           Chargés de Clientèle en Assistance Technique
city                                                                                  Casablanca
function                                                               Call Centers (métiers de)
sector                                                               Informatique / Electronique
Expérience requise                                                                  De 1 à 3 ans
Région                                              10 poste(s) sur Casablanca et région - Maroc
Niveau d'étude et formation                                                               Bac +2
Type de contrat                                                                              CDI
Télétravail                                                                              no info
soft_skills                                                                                  NaN
Entreprise                    

In [12]:
jo_df['post_data'] = jo_df['post_data'].str.lower()

In [13]:
def get_skills(nlp,text):
    doc = fr_nlp(text)
    myset = []
    subset = []
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            subset.append(ent.text)
    myset.append(subset)
    return subset

def get_all_skills(nlp,df, file_name):
    pbar = tqdm(total = df.shape[0])
    for i, rec in df.iterrows():
        added_data = {}
        post_data = rec['post_data']
        job_offer_id = rec['job_offer_id']
        post_data = post_data.lower()
        skills = get_skills(nlp, post_data)
        added_data['job_offer_id'] = job_offer_id
        added_data['skills'] = '|'.join(list(set(skills)))
        write_json_lines(file_name,added_data)
        pbar.update()

### French Rule Extraction

In [12]:
skill_pattern_path = join(DATA_FILES,"fr_skill_patterns_v2.jsonl")
ruler = fr_nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)

<spacy.pipeline.entityruler.EntityRuler at 0x7f184d09e880>

In [13]:
get_all_skills(fr_nlp, jo_df, join(SKILL_OUT, 'fr_extracted_skills_v3.jsonl'))

  0%|          | 0/123991 [00:00<?, ?it/s]

In [14]:
jo_df.loc[0, "post_data"]

"Le chargé de clientèle en assistance technique a la principale mission de Recueillir puis traiter à distance les requêtes d'utilisateurs en difficulté au regard d'une utilisation de matériel informatique. Rattaché au responsable d'équipe, la mission consiste à : Gérer des demandes des utilisateurs par téléphone de manière optimale Diagnostiquer des problèmes système et réseau Prendre en compte et résoudre des incidents Traiter les demandes de changement en provenance des utilisateurs Avant de prendre en charge votre mission, une formation complète est assurée afin de vous préparer aux responsabilités qui vous seront confiées.Compétences: Parfaite maîtrise de la langue française, avec une bonne aisance à l’oral. Sens de l’écoute et de l’orientation. Bonne capacité de diagnostic et d'analyse. Connaissance générale des concepts de base et des techniques d’architecture des systèmes et des réseaux. Connaissance générale des différents systèmes d’exploitation usuels, des architectures matér

In [16]:
read_json_lines(join(SKILL_OUT, 'fr_extracted_skills_v3.jsonl'))

[{'job_offer_id': 0,
  'skills': 'utilisation de matériel|s’|résoudre des incidents|architectures matérielles|environnement de travail|gérer des demandes des utilisateurs par téléphone de manière|traiter à distance|diagnostiquer des problèmes système|traiter les demandes de changement en provenance|systèmes d’exploitation|prendre en charge|prendre en compte'},
 {'job_offer_id': 1, 'skills': ''},
 {'job_offer_id': 2, 'skills': ''},
 {'job_offer_id': 110, 'skills': ''},
 {'job_offer_id': 1015, 'skills': 'mécanique'},
 {'job_offer_id': 1016, 'skills': ''},
 {'job_offer_id': 1017, 'skills': 'gérer différentes tâches en même temps'},
 {'job_offer_id': 1019, 'skills': ''},
 {'job_offer_id': 1020,
  'skills': 'aspects financiers|connaissance des normes'},
 {'job_offer_id': 1021,
  'skills': "résolution des problèmes|veiller à l'atteinte du chiffre d'affaire|satisfaction des clients|avoir une vision globale|optimiser la logistique|autres services|assurer la responsabilité du chiffre|envoyer vo

### New french rule extraction: 

In [14]:
skill_pattern_path = join(DATA_FILES,"fr_skill_patterns_pole_emploi_v2.jsonl")
ruler = fr_nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)

<spacy.pipeline.entityruler.EntityRuler at 0x7fe07ef23cc0>

In [None]:
get_all_skills(fr_nlp, jo_df, join(SKILL_OUT, 'extracted_skills_pole_emploi_v2.jsonl'))

In [17]:
read_json_lines(join(SKILL_OUT, 'extracted_skills_pole_emploi_v2.jsonl'))

[{'job_offer_id': 0,
  'skills': 'chargé de clientèle|utilisation de matériel|assistance technique|gérer des demandes des utilisateurs par téléphone|prendre en charge|traiter les demandes de changement|matérielles, logicielles|architecture des systèmes'},
 {'job_offer_id': 1, 'skills': ''},
 {'job_offer_id': 2, 'skills': 'encadrer une équipe'},
 {'job_offer_id': 110, 'skills': ''},
 {'job_offer_id': 1015, 'skills': 'gestion de production|mécanique'},
 {'job_offer_id': 1016, 'skills': "conduite d'opérations"},
 {'job_offer_id': 1017,
  'skills': 'informatique de gestion|gérer différentes tâches'},
 {'job_offer_id': 1019, 'skills': ''},
 {'job_offer_id': 1020,
  'skills': 'contrôle de gestion|normes comptables|gestion du stress'},
 {'job_offer_id': 1021,
  'skills': 'entretien annuel|gestion du stress|assurer la responsabilité du chiffre|business|différents éléments'},
 {'job_offer_id': 1022,
  'skills': 'entretien annuel|gestion du stress|assurer la responsabilité du chiffre|business|di

### English Skill Extraction

In [None]:
skill_pattern_path = join(DATA_FILES,"jz_skill_patterns.jsonl")
ruler = en_nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)

In [None]:
get_all_skills(en_nlp, jo_df, join(SKILL_OUT, 'jz_extracted_skills_v3.jsonl'))

## Coursera Hard Skill extraction

In [4]:
def get_skills(nlp,text):
    doc = nlp(text)
    myset = []
    subset = []
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            subset.append(ent.text)
    myset.append(subset)
    return subset

def get_all_skills(nlp, df, file_name, id_col, id_text):
    pbar = tqdm(total = df.shape[0])
    for i, rec in df.iterrows():
        added_data = {}
        post_data = rec[id_text]
        job_offer_id = rec[id_col]
        post_data = post_data.lower()
        skills = get_skills(nlp, post_data)
        added_data[id_col] = job_offer_id
        added_data['skills'] = '|'.join(list(set(skills)))
        write_json_lines(file_name,added_data)
        pbar.update()

In [5]:
cours_df = pd.DataFrame(read_json_lines(file_name=join('coursera', 'coursera_courses.jsonl')))

In [6]:
cours_df.head()

Unnamed: 0,courseType,description,id,slug,instructorIds,specializations,partnerIds,name,page
0,v2.ondemand,"In this two-hour, project-based course, we int...",NJSdGN71Eeq4CApSN3OTvQ,make-pick-ups-look-cool-unity-introduction-ani...,[4730641],[],[565],Make Your Pick-Ups Look Cool in Unity (Intro t...,0
1,v2.ondemand,"À la fin de ce projet, vous aurez toutes les c...",DMkcgX7LEeyRTg6FtAvfBw,integrer-applications-dashboard-hootsuite,[26958800],[],[565],Intégrer des applications dans votre Dashboard...,0
2,v2.ondemand,ينشئ العديد من الأشخاص حسابات وسائط اجتماعية م...,YLO0oGSUEeyIUg4Qv2RsBQ,getting-started-with-hootsuite-ar,[76377122],[],[565],كيفية استعمال التطبيق هووتسويت,0
3,v2.ondemand,Gamification is the application of game elemen...,69Bku0KoEeWZtA4u62x6lQ,gamification,[226710],[],[6],Gamification,0
4,v2.ondemand,This course will cover the steps used in weigh...,0HiU7Oe4EeWTAQ4yevf_oQ,missing-data,[8394050],[],[32],Dealing With Missing Data,0


In [7]:
cours_df['all_text'] = cours_df['description']+' '+cours_df['name']

### French Rule Extraction

In [14]:
skill_pattern_path = join(DATA_FILES,"fr_skill_patterns_final.jsonl")
# ruler = fr_nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)

<spacy.pipeline.entityruler.EntityRuler at 0x7f9a41c62980>

In [15]:
get_all_skills(fr_nlp, cours_df, join(SKILL_OUT, 'fr_coursera_extracted_skills_v2.jsonl'), 'id', 'all_text')

  0%|          | 0/10180 [00:00<?, ?it/s]

### New French Rule Extraction

In [8]:
skill_pattern_path = join(DATA_FILES,"fr_skill_patterns_pole_emploi_v2.jsonl")
ruler = fr_nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)

<spacy.pipeline.entityruler.EntityRuler at 0x7f0d96384400>

In [9]:
get_all_skills(fr_nlp, cours_df, join(SKILL_OUT, 'fr_coursera_extracted_skills_pole_emploi_v1.jsonl'), 'id', 'all_text')

  0%|          | 0/10180 [00:00<?, ?it/s]

In [None]:
read_json_lines(join(SKILL_OUT, 'fr_coursera_extracted_skills_pole_emploi_v1.jsonl'))

### English Skill Extraction

In [None]:
skill_pattern_path = join(DATA_FILES,"jz_skill_patterns.jsonl")
ruler = en_nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)

In [None]:
get_all_skills(en_nlp, cours_df, join(SKILL_OUT, 'jz_coursera_extracted_skills_v1.jsonl'), 'id', 'all_text')