In [10]:
import pickle 
import en_core_web_sm
from escopy.text_cleaning_utils import clean_text
from escopy.extract_skills import detect_skills
import pandas as pd 
from tqdm import tqdm

In [3]:

# Load the pickled surface-form matcher and the spaCy English pipeline that
# detect_skills() needs.
path = 'surface_form_matcher_v02_1.pickle'
# NOTE(security): pickle.load can execute arbitrary code; only load a model
# file obtained from a trusted source.
# The context manager closes the file handle as soon as the model is loaded
# (the previous bare open() left it to the garbage collector).
with open(path, "rb") as model_file:
    model = pickle.load(model_file)
nlp = en_core_web_sm.load()


In [4]:
# Demo: run the skill extractor on a single hand-written job description.

# Sample posting text used as the demo input.
job_description = """
Advanced skills in SQL, Python, or similar technologies
Experience with Time Series Analysis
Experience with REST APIs and relational databases
Excellent written and verbal communication skills
Ability to clearly document findings and summarise discussions
Drive for excellence, strong attention to detail
Excellent organizational and follow-up skills
Tech-savvy and passionate about new technologies
Bachelor's or Master's Degree required in Computer Science, Mathematics or other related fields.
European Union’s legal working status.
Exposure to AWS cloud services: S3, Athena, Aurora, Elasticsearch, Dynamodb, etc...
Experience with working with database management systems.
"""

# Clean the text with escopy's clean_text; note this overwrites the raw
# text held in job_description.
job_description = clean_text(job_description)
# Extract skills using the NESTA skill extractor. `model` and `nlp` are the
# pickled surface-form matcher and the en_core_web_sm pipeline loaded in an
# earlier cell, so that cell must have been run first.
annotations = detect_skills(job_description, model, nlp, return_dict=True, debug=False)


[nltk_data] Downloading package wordnet to /Users/mac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Annotation exploration: show how many annotations were produced and what
# a single annotation looks like.
import pprint
print(f'annotations count : {len(annotations)}')
print('annotation example description : ')
# Guard the index: a text in which no skill is detected would make
# annotations[0] raise IndexError.
if len(annotations) > 0:
    pprint.pprint(annotations[0])
else:
    print('no annotations to display')


annotations count : 9
annotation example description : 
{'cluster_0': 0.0,
 'cluster_1': 0.0,
 'cluster_2': 0.0,
 'entity': 1139,
 'label_cluster_0': 'Transversal skills',
 'label_cluster_1': 'General Workplace Skills',
 'label_cluster_2': 'General Workplace Skills',
 'predicted_q': 0.4968150938837877,
 'preferred_label': 'communication',
 'surface_form': 'communication',
 'surface_form_type': 'label_pref'}


In [None]:
# Enrich a CSV file of job postings with the skills extracted from each
# job description.

# Name of the CSV column that holds the job description text.
job_description_col_name = 'job_description'
print(' load file ...')
# Path of the input CSV. It is a placeholder: fill it in before running.
path = ''
if not path:
    # Fail fast with an actionable message instead of the opaque error
    # pd.read_csv raises for an empty path.
    raise ValueError("`path` is empty: set it to the job postings CSV file before running this cell")
data_input = pd.read_csv(path)
if job_description_col_name not in data_input.columns:
    # Same exception type a plain column lookup would raise, but listing the
    # columns that are actually available.
    raise KeyError(
        f"column {job_description_col_name!r} not found in {path!r}; "
        f"available columns: {list(data_input.columns)}"
    )

print('extract skill ...')
# Job description texts as an array, one entry per CSV row.
job_descriptions = data_input[job_description_col_name].values
# create skill extractor util 
def get_skills(text):
    """Return the skill ids detected in one job description.

    Best-effort helper: a description that cannot be processed yields an
    empty list so that one bad row does not abort a whole batch.

    Args:
        text: Raw job description. Non-string values (e.g. the NaN/None that
            stands for a missing CSV cell) are treated as "no description".

    Returns:
        list: The 'entity' value of every annotation returned by
        detect_skills, or [] when the input is not a string or extraction
        fails.
    """
    import warnings

    # Missing cells are not text; skip them quietly instead of relying on
    # the extraction pipeline to raise.
    if not isinstance(text, str):
        return []

    try:
        # Clean the text, then extract skills using the NESTA skill extractor.
        cleaned_text = clean_text(text)
        annotations = detect_skills(cleaned_text, model, nlp, return_dict=True, debug=False)
        # Output is a plain list of skill ids.
        return [annotation['entity'] for annotation in annotations]
    except Exception as exc:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt/SystemExit still stop a long-running loop, and
        # report the failure instead of hiding it.
        warnings.warn(
            f'skill extraction failed, returning no skills: {exc!r}',
            RuntimeWarning,
        )
        return []



    
# Apply the extractor to every job description and store the result as one
# ';'-separated string per row.
skills_list = []
for job_description in tqdm(job_descriptions):
    job_skills = get_skills(job_description)
    # Stringify. Skill ids are integers (e.g. 1139) and str.join only accepts
    # strings, so each id is converted first; joining the raw ids raised
    # TypeError.
    job_skills_str = ';'.join(str(skill_id) for skill_id in job_skills)
    skills_list.append(job_skills_str)

print('merge in input data ...')

# Add the extracted skills as a new column, aligned row by row with the input.
output_col_name ='esco_skills'
data_input[output_col_name] = skills_list


    


In [13]:
# Load the two scraped Rekrute batches from JSON.
import json

# The encoding is given explicitly: without it open() uses the platform
# default (not UTF-8 everywhere), which can corrupt or reject non-ASCII text.
with open('./data/rekrute_batch_1.json', encoding='utf-8') as json_file:
    data_a = json.load(json_file)

with open('./data/rekrute_batch_2.json', encoding='utf-8') as json_file:
    data_b = json.load(json_file)


In [17]:
# Combine both batches into a single dict; when a key appears in both, the
# entry from data_b replaces the one from data_a.
full_data = {**data_a, **data_b}

In [22]:
def get_row_elems(sample):
    """Flatten one posting record into a list of column values.

    Args:
        sample: Mapping with the keys 'title', 'country', 'location', 'date',
            'sectros', 'contract_type', 'exp_text', 'study_level' and
            'skills_text' (an iterable of strings).

    Returns:
        list: The eight scalar fields in the order listed above, followed by
        the 'skills_text' items joined with single spaces.

    Raises:
        KeyError: If any of the expected keys is missing from ``sample``.
    """
    # NOTE(review): 'sectros' looks like a misspelling of 'sectors' -- confirm
    # it matches the key used in the JSON data before changing it.
    scalar_keys = (
        'title',
        'country',
        'location',
        'date',
        'sectros',
        'contract_type',
        'exp_text',
        'study_level',
    )
    values = [sample[field] for field in scalar_keys]
    # The description is stored as text fragments; merge them into one string.
    values.append(' '.join(sample['skills_text']))
    return values

# Flatten every posting in full_data into a row of column values.
rows = []
for key in full_data :
    sample = full_data[key]
    row = get_row_elems(sample)
    # Append the flattened row, not the raw record: appending `sample` left
    # `row` unused and made the DataFrame look columns up by the raw dict
    # keys, so 'sector', 'exp' and 'job_description' received no data.
    rows.append(row)

# Create the DataFrame; the column names follow the order of the values
# returned by get_row_elems.
cols = ['title','country','location','date','sector','contract_type','exp','study_level','job_description']
rekrute_data = pd.DataFrame(rows , columns = cols)
