In [1]:
import re
import bs4
import time
import string
import numpy as np
import pandas as pd
from textblob import TextBlob
from collections import defaultdict
from bs4 import BeautifulSoup as BS
from deep_translator import GoogleTranslator

In [2]:
# Load the vacancy data; later cells read its 'id' and 'description' columns.
df = pd.read_csv('dataframe_data steward.csv')

In [4]:
def remove_btags(bulletpoint):
    """Strip bold markup from a bulletpoint while keeping its text.

    Every ``<b>`` descendant of ``bulletpoint`` is unwrapped in place, i.e. the
    tag is replaced by its own contents.

    Parameters
    ----------
    bulletpoint : bs4.element.Tag
        The ``<li>`` element to clean. Objects without a ``find_all`` method
        (e.g. ``None``) are returned untouched.

    Returns
    -------
    The same ``bulletpoint`` object, modified in place.
    """
    find_all = getattr(bulletpoint, 'find_all', None)
    if find_all is None:
        # nothing to search in; keep the best-effort contract of returning the input
        return bulletpoint
    # unwrap every bold tag, not only the first one
    for bold_tag in find_all('b'):
        bold_tag.unwrap()
    return bulletpoint

In [5]:
def p_bullets(description, bulletpoint, str_bullet):
    """Find the head of a bulletpoint whose content is wrapped in a ``<p>`` tag.

    The ``<p>`` in ``description`` whose text equals ``bulletpoint.text`` is
    located, then the ``<p>`` tags preceding it are walked. The first one
    that, wrapped as ``<li>...</li>``, is not one of the known bulletpoints is
    taken as the head.

    Parameters
    ----------
    description : bs4.BeautifulSoup
        Parsed vacancy description to search in.
    bulletpoint : bs4.element.Tag
        The ``<li>`` element whose head is wanted.
    str_bullet : list of str
        String form of every bulletpoint of the description.

    Returns
    -------
    str or None
        The head with all markup stripped, or ``None`` when no matching
        ``<p>`` exists or every preceding ``<p>`` is itself a bulletpoint.
    """
    # find <p> version of text in description
    matching_p = description.find('p', text=bulletpoint.text)
    if matching_p is None:
        return None

    # loop over all preceding p tags
    for previous_p in matching_p.find_all_previous('p'):
        wrapped = f'<li>{previous_p}</li>'
        # find the one that is not already in the bulletpoints
        if wrapped not in str_bullet:
            return re.sub('<[^<]+?>', '', wrapped)

    # no preceding <p> qualified: report "no head" instead of failing on an unbound name
    return None

In [6]:
def normal_bullets(bulletpoint):
    """Find the head of a bulletpoint from the markup that precedes it.

    The text of the previous ``<p>`` is the head when it is between 1 and 60
    characters long. Otherwise that ``<p>`` is treated as normal text and the
    nearest previous ``<h3>``, then ``<h2>``, then ``<b>`` is used instead.

    Parameters
    ----------
    bulletpoint : bs4.element.Tag
        The ``<li>`` element whose head is wanted.

    Returns
    -------
    str or None
        The head with newlines removed, or ``None`` when there is no previous
        ``<p>`` or no fallback tag exists.
    """
    # head of bulletpoints is previous <p> tag
    previous_p = bulletpoint.find_previous('p')
    if previous_p is None:
        return None

    head = previous_p.text.replace('\n', '')
    if 0 < len(head) <= 60:
        return head

    # head is empty or too long to be a head: fall back to h3, h2 or b, in that order
    for tag_name in ('h3', 'h2', 'b'):
        candidate = bulletpoint.find_previous(tag_name)
        if candidate is not None:
            return candidate.text.replace('\n', '')
    return None

In [7]:
def br_bullets(bulletpoint):
    """Find the head of a bulletpoint when the text is laid out with <br> tags.

    Looks up the <br> preceding ``bulletpoint``, takes the element that
    follows the first of that <br>'s previous siblings and, if it is a string,
    returns it with newlines removed. Returns None in every other case.
    """
    br_tag = bulletpoint.find_previous('br')
    if not br_tag:
        return None

    # element following each previous sibling of the <br>
    followers = [tag.nextSibling for tag in br_tag.find_previous_siblings()]
    candidate = followers[0] if followers else None

    # only plain strings can serve as a head
    return candidate.replace('\n', '') if isinstance(candidate, str) else None

In [8]:
def translated(t):
    """Lower-case ``t`` and translate it from Dutch ('nl') to English ('en')."""
    translator = GoogleTranslator(source='nl', target='en')
    lowered = t.lower()
    return translator.translate(lowered)

In [9]:
def bulletpoints_dict(dataframe):
    """Return a dictionary with the bulletpoints of vacancies.

    Every ``<li>`` of every description is assigned a head (the heading it
    appears under) and stored under the key ``(id, head)``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns 'id' and 'description'; descriptions are
        HTML strings. Non-string descriptions (e.g. NaN for a missing CSV
        cell) are skipped.

    Returns
    -------
    collections.defaultdict
        Maps ``(id, head)`` to the list of bulletpoint texts found under that
        head. Heads and texts are lower-cased; when the first bulletpoint of
        a description is detected as Dutch they are translated to English.
        Bulletpoints without a head, or with a head longer than 60
        characters, are left out.
    """
    all_bulletpoints = defaultdict(list)
    for vacancy_id, description in zip(dataframe['id'], dataframe['description']):
        # a non-string description cannot be parsed as HTML
        if not isinstance(description, str):
            continue

        bulletpoints = BS(description, 'html.parser').findAll('li')
        # if there are no bulletpoints:
        if len(bulletpoints) == 0:
            continue

        # separate parse: remove_btags edits the tree the bulletpoints belong to,
        # while p_bullets searches this untouched copy
        bs4_description = BS(description, 'html.parser')
        str_bullets = list(map(str, bulletpoints))

        # detect language from the start of the first bulletpoint
        lang = TextBlob(bulletpoints[0].text[:100]).detect_language()

        # for each bulletpoint
        for item in bulletpoints:
            # remove bold text
            item = remove_btags(item)
            # check for <p> case
            if '<p>' in str(item):
                head = p_bullets(bs4_description, item, str_bullets)
            # if only br text
            elif item.find_previous('p') is None:
                head = br_bullets(item)
            else:
                head = normal_bullets(item)

            # bulletpoints without a head, or with a head that is too long, are skipped
            # NOTE(review): confirm whether headless bulletpoints should instead be
            # stored under a 'no head' key
            if head is None or len(head) > 60:
                continue

            # if language is dutch, translate to english
            if lang == 'nl':
                head = translated(head)
                point = translated(item.text.replace('\n', ''))
            else:
                head = head.lower()
                point = item.text.replace('\n', '').lower()

            # add (id, head) as key and bulletpoint as value
            all_bulletpoints[(vacancy_id, head)].append(point)

    return all_bulletpoints

In [10]:
# Collect the bulletpoints of every vacancy description in df, keyed by (id, head).
all_bullet_dict = bulletpoints_dict(df)

In [11]:
# Keywords that mark a bulletpoint head as a requirements section;
# they are matched as substrings of the head.
green = [
    # phrased towards the candidate
    'you have',
    'you’ll have',
    'you need',
    'you’ll need',
    'you are',
    'you offer',
    'you bring',
    'about you',
    # phrased from the employer's side
    'we ask',
    'we expect',
    # generic requirement vocabulary
    'require',
    'skills',
    'knowledge',
    'experience',
    'education',
    'candidate',
    'profile',
    'looking for',
    'qualification',
    'recognize yourself',
]

In [12]:
def requirement(item, green):
    """Tell whether ``item`` contains at least one of the ``green`` keywords.

    Returns False for an empty or missing ``item``; otherwise True as soon as
    one keyword occurs in it, False when none does.
    """
    if not item:
        return False
    for word in green:
        if word in item:
            return True
    return False

In [13]:
def select_requirements(dictionary, green_list):
    """Select the requirement bulletpoints based on a green list.

    Only entries whose head contains a keyword from ``green_list`` are kept.
    All kept entries of the same vacancy are merged into one list, stored
    under the first matching ``(id, head)`` key of that vacancy.

    Parameters
    ----------
    dictionary : dict
        Maps ``(id, head)`` tuples to lists of bulletpoint texts.
    green_list : list of str
        Keywords that identify a requirements head.

    Returns
    -------
    dict
        Maps the first matching ``(id, head)`` of each vacancy to a new list
        holding the bulletpoints of all its matching heads, in the order they
        appear in ``dictionary``. The lists of ``dictionary`` are not modified.
    """
    merged = {}
    first_key_by_id = {}
    for key, points in dictionary.items():
        vacancy_id, head = key[0], key[1]
        if not requirement(head, green_list):
            continue
        # the first matching head of a vacancy collects all later ones
        target_key = first_key_by_id.setdefault(vacancy_id, key)
        # extend a fresh list so that re-running never duplicates the caller's data
        merged.setdefault(target_key, []).extend(points)
    return merged

In [14]:
# Keep only the bulletpoints whose head matches a keyword from the green list.
req = select_requirements(all_bullet_dict, green)

{('DSP_0',
  'candidate profile:'): ['at least a bachelor’s degree from an accredited university or college in computer science (or in a similar field);', 'relevant work experience as a master data analyst or in related field.', 'able to work with stakeholders to assess potential risks.', 'able to analyze existing tools and databases and provide necessary software solution recommendations.', 'familiar with master data, data transfer workbench, intercompany functionality in sap business one will be an advantage', 'you are experienced with sql and relational database;', 'you are da-100 powerbi certified;', 'ability to translate business requirements into non-technical, lay terms.', 'high-level experience in methodologies and processes for managing large scale databases.', 'demonstrated experience in handling large data sets and relational databases.', 'understanding of addressing master and metadata standards.', 'high-level written and verbal communication skills.', 'fluent in english an