In [None]:
import pandas as pd 
from bs4 import BeautifulSoup
import re
import langid
import yaml
from typing import List

import spacy
from spacy.matcher import PhraseMatcher
# load default skills data base
from skillNer.general_params import SKILL_DB
# import skill extractor
from skillNer.skill_extractor_class import SkillExtractor

In [None]:
def clean_from_html(s: str) -> str:
    '''
    Strip HTML markup from the given string and flatten it onto one line.

    The input is parsed by BeautifulSoup with the lxml parser, only the text
    content is kept, and every newline character is replaced by a space.
    '''
    plain_text = BeautifulSoup(s, 'lxml').get_text()
    return plain_text.replace('\n', ' ')

def read_csv_to_list(file_name: str) -> List[str]:
    '''
    Read a text file and return its lines as a list of strings.

    The whole file is read at once and split on newline characters, so a
    file that ends with a newline yields a trailing empty string.

    Parameters:
        file_name: path of the file to read (opened in text mode).

    Returns:
        One list element per line, without the line terminators.

    Raises:
        OSError: if the file cannot be opened or read.
    '''
    # The context manager closes the handle even when read() raises.
    with open(file_name, "r") as my_file:
        return my_file.read().split('\n')

def if_en(s: str) -> bool:
    '''
    Tell whether langid identifies English as the language of the text.

    The argument is converted with str() before being classified.
    '''
    top_label = langid.classify(str(s))[0]
    return top_label == 'en'

def extract_languages(s: str, language_list: List[str]) -> List[str]:
    '''
    Return the languages from a given list that are mentioned in the text.

    The text is split on whitespace. Each token is stripped of leading and
    trailing punctuation and compared case-insensitively with the known
    language names, so tokens such as "English," or "(German)" are
    recognised. Only single-word, whole-token matches count.

    Parameters:
        s: text to scan.
        language_list: known language names; blank entries are ignored.

    Returns:
        The matched languages, capitalised, without duplicates and sorted
        alphabetically so the result is the same on every run.
    '''
    known_languages = {
        name.strip().lower()
        for name in language_list
        if name and name.strip()
    }
    found = set()
    for token in s.split():
        # Drop punctuation glued to the word by ordinary sentence structure.
        word = re.sub(r'^\W+|\W+$', '', token)
        if word and word.lower() in known_languages:
            found.add(word.capitalize())
    return sorted(found)

# Shared SkillExtractor, created on first use. Loading the spaCy model and
# building the extractor is expensive, and extract_skills is called once per
# job description, so the instance is reused across calls.
_SKILL_EXTRACTOR = None


def _get_skill_extractor():
    '''Return the shared SkillExtractor, building it on the first call.'''
    global _SKILL_EXTRACTOR
    if _SKILL_EXTRACTOR is None:
        nlp = spacy.load("en_core_web_lg")
        _SKILL_EXTRACTOR = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
    return _SKILL_EXTRACTOR


def extract_skills(s: str, language_list: List[str]) -> tuple:
    '''
    Analyse a text and return the soft skills, hard skills and human
    languages mentioned in it.

    Skills are taken from the 'full_matches' and 'ngram_scored' results of
    the SkillExtractor annotation. Soft skills whose name contains
    'Language' are left out; human languages are instead detected by
    extract_languages using language_list.

    Parameters:
        s: text to analyse.
        language_list: known language names, passed to extract_languages.

    Returns:
        A 3-tuple of strings (soft_skills, hard_skills, languages). Each
        string holds the distinct names joined by ', ' in alphabetical
        order, or is empty when nothing was found.
    '''
    skill_extractor = _get_skill_extractor()
    results = skill_extractor.annotate(s)['results']
    soft_skills = set()
    hard_skills = set()
    for match in results['full_matches'] + results['ngram_scored']:
        skill = skill_extractor.skills_db[match['skill_id']]
        skill_type = skill['skill_type']
        skill_name = skill['skill_name']
        if skill_type == 'Hard Skill':
            hard_skills.add(skill_name)
        elif skill_type == 'Soft Skill' and 'Language' not in skill_name:
            soft_skills.add(skill_name)
    # Sorting makes the joined strings reproducible; set order is not.
    return (
        ', '.join(sorted(soft_skills)),
        ', '.join(sorted(hard_skills)),
        ', '.join(sorted(extract_languages(s, language_list))),
    )
    

In [None]:
# Read the run configuration, then the two inputs it points to:
# the list of known languages and the job table (restricted to the
# columns named as keys of COLS).
with open('job_parsing.yaml') as f:
    conf_dict = yaml.safe_load(f)

language_list = read_csv_to_list(conf_dict['LANGUAGE_PATH'])
df = pd.read_csv(
    conf_dict['TABLE_PATH'],
    usecols=conf_dict['COLS'].keys(),
)

In [None]:
# --- Flag every posting ---------------------------------------------------
# .str.lower() tolerates missing titles, unlike calling x.lower() on each value.
df['header.jobTitle'] = df['header.jobTitle'].str.lower()
job_titles = df['header.jobTitle']

# The keywords are joined into one alternation that str.contains interprets
# as a regular expression. na=False makes missing titles count as "no match"
# so the results stay boolean and can be negated safely.
tech_pattern = '|'.join(conf_dict['TECH_KEYWORDS'])
senior_pattern = '|'.join(conf_dict['SENIOR_KEYWORDS'])

# Plain column assignment keeps this cell re-runnable (DataFrame.insert raises
# when the column already exists). Assigning in this order yields the same
# column layout as three successive insert(n_cols, ...) calls on a first run.
df['Swiss_based'] = (
    df['map.country']
    .map(lambda x: x in conf_dict['SWISS_LOCATION'])
    .astype(bool)
)
df['entry_job'] = ~job_titles.str.contains(senior_pattern, na=False)
df['IT_job'] = job_titles.str.contains(tech_pattern, na=False)

# Position of the first flag column, i.e. the number of columns read from
# the input table; stays the same when the cell is run again.
n_cols = df.columns.get_loc('Swiss_based')

# --- Swiss, IT, entry-level postings --------------------------------------
# .copy() gives an independent frame, so adding columns below does not
# write into a view of df.
is_target_job = df['Swiss_based'] & df['IT_job'] & df['entry_job']
df_Swiss_IT = df.loc[is_target_job].copy()
df_Swiss_IT.insert(
    n_cols,
    'job_description',
    df_Swiss_IT['job.description'].map(lambda x: clean_from_html(str(x))),
)
df_Swiss_IT.insert(
    n_cols,
    'english',
    df_Swiss_IT['job_description'].map(lambda x: if_en(str(x))).astype(bool),
)

# --- English postings only, helper columns removed ------------------------
df_Swiss_IT_en = (
    df_Swiss_IT.loc[df_Swiss_IT['english']]
    .drop(columns=['english', 'job.description', 'IT_job', 'Swiss_based', 'entry_job'])
    .reset_index(drop=True)
)

# --- Skill extraction -------------------------------------------------------
# extract_skills returns (soft_skills, hard_skills, languages) per description.
extracted_skills = df_Swiss_IT_en['job_description'].map(
    lambda x: extract_skills(str(x), language_list)
)
df_Swiss_IT_en['soft_skills'] = extracted_skills.map(lambda x: x[0])
df_Swiss_IT_en['hard_skills'] = extracted_skills.map(lambda x: x[1])
df_Swiss_IT_en['languages'] = extracted_skills.map(lambda x: x[2])

# --- Final column names and export ------------------------------------------
df_Swiss_IT_en = df_Swiss_IT_en.rename(columns=conf_dict['COLS'])
df_Swiss_IT_en.to_csv('IT_entry_swiss_jobs.csv')

import csv

# NOTE(review): `fields` is not used anywhere in the visible cells; kept in
# case a later cell reads it.
fields = ['langugages']

# NOTE(review): `data_into_list` is not defined in any cell visible here, so
# this fails with NameError on a fresh kernel unless it is created elsewhere.
# Confirm where it should come from before relying on this cell.
#
# newline='' is what the csv module requires for files handed to csv.writer;
# without it extra blank rows appear on platforms that translate line endings.
with open('languages.csv', 'w', newline='') as f:
    write = csv.writer(f)
    # Each item becomes its own single-field row.
    write.writerows([item] for item in data_into_list)