In [None]:
!pip install pyyaml

In [None]:
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup
import re
import langid
import yaml
from typing import List

import spacy
from spacy.matcher import PhraseMatcher
# load default skills data base
from skillNer.general_params import SKILL_DB
# import skill extractor
from skillNer.skill_extractor_class import SkillExtractor

In [None]:
def clean_from_html(s: str) -> str:
    """Return the text content of an HTML string on a single line.

    The markup is parsed with BeautifulSoup's 'lxml' parser, the text is
    extracted, and every newline character is replaced by a space.
    """
    plain_text = BeautifulSoup(s, 'lxml').text
    return plain_text.replace('\n', ' ')

def if_en(s: str) -> bool:
    """Tell whether langid classifies the given value as English.

    The argument is converted with str() before classification, so
    non-string values are accepted.
    """
    detected = langid.classify(str(s))
    # The first element of the classification result is the language code.
    language_code = detected[0]
    return language_code == 'en'

# Shared SkillExtractor instance, created lazily on the first extract_skills call.
_SKILL_EXTRACTOR = None


def _get_skill_extractor():
    """Return the shared SkillExtractor, building it on first use.

    Loading the spaCy model and constructing the extractor is expensive, so
    it is done once and the instance is reused by every later call.
    """
    global _SKILL_EXTRACTOR
    if _SKILL_EXTRACTOR is None:
        nlp = spacy.load("en_core_web_lg")
        _SKILL_EXTRACTOR = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
    return _SKILL_EXTRACTOR


def extract_skills(s: str) -> List[str]:
    """Extract the skill mentions found in a text.

    Args:
        s: Text to annotate (e.g. a cleaned job description).

    Returns:
        The 'doc_node_value' of every entry in the annotation's
        'full_matches' followed by those in 'ngram_scored'.
    """
    annotations = _get_skill_extractor().annotate(s)
    results = annotations['results']
    matches = results['full_matches'] + results['ngram_scored']
    return [match['doc_node_value'] for match in matches]
    

In [None]:
# Load the notebook configuration. The encoding is given explicitly so the
# file is decoded the same way on every platform instead of relying on the
# locale's default encoding.
with open('job_parsing.yaml', encoding='utf-8') as f:
    conf_dict = yaml.safe_load(f)

In [None]:
# Read the configured CSV, keeping only the columns that appear as keys of
# the COLS mapping.
df = pd.read_csv(
    conf_dict['FILE_PATH'],
    usecols=conf_dict['COLS'].keys(),
)

In [None]:
# Lower-case the titles; .str.lower() leaves missing titles as NaN instead of
# raising on non-string values.
df['header.jobTitle'] = df['header.jobTitle'].str.lower()

# The keyword lists are joined into a single regex alternation, so each
# keyword is interpreted as a regular expression by str.contains.
tech_pattern = '|'.join(conf_dict['TECH_KEYWORDS'])
senior_pattern = '|'.join(conf_dict['SENIOR_KEYWORDS'])

# na=False makes rows without a title count as "no match" so the masks stay
# strictly boolean and can be negated / combined safely.
is_tech_title = df['header.jobTitle'].str.contains(tech_pattern, na=False)
is_senior_title = df['header.jobTitle'].str.contains(senior_pattern, na=False)

# An entry-level IT job has a tech keyword AND no seniority keyword. Both
# conditions are combined here; assigning them one after the other would let
# the second overwrite the first.
df['entry_IT_job'] = is_tech_title & ~is_senior_title
df['Swiss_based'] = df["map.country"].map(lambda x: x in conf_dict['SWISS_LOCATION']).astype(bool)

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of df.
df_Swiss_IT = df.loc[df['Swiss_based'] & df['entry_IT_job']].copy()
df_Swiss_IT['job_description'] = df_Swiss_IT['job.description'].map(lambda x: clean_from_html(str(x)))
df_Swiss_IT = df_Swiss_IT.drop(columns=['job.description', 'entry_IT_job', 'Swiss_based'])

# Keep only the postings whose cleaned description is classified as English.
df_Swiss_IT['english'] = df_Swiss_IT["job_description"].map(lambda x: if_en(str(x))).astype(bool)
df_Swiss_IT_en = df_Swiss_IT.loc[df_Swiss_IT['english']].drop(columns=['english'])

# Skill extraction is not run on the full frame here; it is applied to a
# 200-row sample in the cells below.


In [None]:
# Work on the first 200 English postings. An explicit copy is taken because
# later cells add a column to this frame and rename its columns in place;
# doing that on a slice of df_Swiss_IT_en triggers SettingWithCopy issues.
df_Swiss_IT_en_200 = df_Swiss_IT_en.iloc[:200, :].copy()

In [None]:
# Display the column labels of the 200-row sample.
df_Swiss_IT_en_200.columns

In [None]:
# Run the skill extractor on every description of the sample and store the
# resulting list of skills in a new 'skills' column.
df_Swiss_IT_en_200['skills'] = df_Swiss_IT_en_200["job_description"].map(
    lambda description: extract_skills(str(description))
)

In [None]:
# Rename the columns in place using the COLS mapping from the configuration.
df_Swiss_IT_en_200.rename(columns=conf_dict['COLS'], inplace=True)

In [None]:
# Persist the annotated sample (index included) as a CSV file.
df_Swiss_IT_en_200.to_csv(path_or_buf='IT_entry_swiss_jobs_200.csv')