In [None]:
# v1.1

from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Scraper settings
class Config:
    """Static settings for the vacancy scraper.

    All values are class attributes and are read directly from the class;
    no instance is created.
    """

    # Base address of the job board.
    site = 'https://www.work.ua'
    # Search query in the form it takes inside the listing URL.
    job_position = 'data+scientist'
    # Bound of the listing-page loop.
    n_pages = 50
    # ``class`` attribute values of the <div> cards that hold one vacancy each.
    html_elements = [
        'card card-hover card-visited wordwrap job-link',
        'card card-hover card-visited wordwrap job-link js-hot-block',
    ]

def get_html(url, timeout=30):
    """Download *url* and return its body parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend, or
        ``None`` when the request fails (the error is printed).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0'}
    try:
        # Without a timeout requests can wait on an unresponsive server
        # indefinitely, which would stall the whole scraping run.
        page_response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as req_er:
        print('Request error: ', req_er)
        return None
    return BeautifulSoup(page_response.content, "html.parser")

        

# Load the previously saved dataset, if there is one
try:
    latest_dataset = pd.read_csv('vacansies_dataset.csv')
except FileNotFoundError:
    print('There is no existing dataset')
    latest_dataset_pd = pd.DataFrame()
    # An explicit dtype avoids the pandas warning about the default dtype of
    # an empty Series; ids are compared as integers by the scraping loop.
    existing_ids = pd.Series(dtype='int64')
else:
    # Keep only the columns that are written back, indexed by vacancy id.
    latest_dataset_pd = latest_dataset[['id', 'title', 'time', 'requirements']].set_index('id')
    existing_ids = latest_dataset.id
    print('Number of existing vacancies:', len(existing_ids))
    
    
vacancies = []
# job id (str) -> [title, (day, month name, year), 'm/d/yyyy' string, requirements]
vac_dict = {}

# Ukrainian month names in the genitive form used by listing titles, mapped to
# month numbers. Built once instead of once per listing item.
month_ua = {'січня': 1, 'лютого': 2, 'березня': 3, 'квітня': 4, 'травня': 5, 'червня': 6, 'липня': 7,
            'серпня': 8, 'вересня': 9, 'жовтня': 10, 'листопада': 11, 'грудня': 12}

# Ids that are already stored, as a set for constant-time membership tests.
known_ids = set(existing_ids.values.tolist())

# Listing pages are numbered from 1; the upper bound is inclusive so that
# Config.n_pages pages are visited.
for p in range(1, Config.n_pages + 1):
    # Get html content
    page_content = get_html(Config.site+'/jobs-'+Config.job_position+'/?page='+str(p))
    if page_content is None:
        # get_html prints the request error and yields None; try the next page.
        continue

    # Collect the vacancy cards for every configured class value
    items = []
    for html_element in Config.html_elements:
        items.extend(page_content.find_all('div', {'class': html_element}))

    # Get position details
    for item in items:
        heading = item.find('h2')
        link = heading.find('a') if heading is not None else None
        if link is None:
            # Card without the expected <h2><a> structure
            continue
        job_link = link.get('href') or ''
        job_title_full = link.get('title') or ''

        # Parse job_link to get the id
        id_match = re.match(r'\/jobs\/(\d{7})', job_link)
        if not id_match:
            continue
        job_id = id_match.group(1)

        # Skip vacancies that are stored already or were collected earlier in
        # this run. This is `continue` rather than `break`: cards are gathered
        # per class value, not in page order, so one known vacancy does not
        # mean that every card after it is known as well.
        if int(job_id) in known_ids or job_id in vac_dict:
            continue

        # Parse the link title to get the position name and the posting date
        title_match = re.search(r'(.*)\,\sвакансія\sвід.*', job_title_full)
        date_match = re.match(r'.*вакансія\sвід\s(\d{1,2})\s(\w*)\s(\d{4})', job_title_full)
        if not title_match or not date_match:
            print('Unexpected vacancy title format: ', job_title_full)
            continue
        job_title = title_match.group(1)
        job_date = date_match.groups()
        month_number = month_ua.get(job_date[1])
        if month_number is None:
            print('Unknown month name in vacancy title: ', job_title_full)
            continue
        job_date_p = str(month_number)+'/'+job_date[0]+'/'+job_date[2]

        # Get html page from job position page
        page_content_job = get_html(Config.site+'/jobs/' + job_id + '/')
        if page_content_job is None:
            continue

        # Get job requirements: the second <ul> of the card when there is
        # one, otherwise the card paragraphs from the third one onwards.
        cont = page_content_job.select("div.card.wordwrap ul")
        if len(cont) > 1:
            requirements = cont[1].get_text()
        else:
            cont = page_content_job.select("div.card.wordwrap p")
            requirements = [t.get_text() for t in cont[2:]]
            if not requirements:
                # No paragraphs to read: nothing is stored for this vacancy.
                continue

        vac_dict[job_id] = [job_title, job_date, job_date_p, requirements]
    
# Turn the collected vacancies into a column-oriented dict, then a DataFrame
vac_dict_pr = {'id': [], 'title': [], 'time': [], 'requirements': []}
for vac_id, details in vac_dict.items():
    vac_dict_pr['id'].append(vac_id)
    vac_dict_pr['title'].append(details[0])
    # Position 2 holds the formatted date string
    vac_dict_pr['time'].append(details[2])
    # Requirements may be a string or a list; store the string form
    vac_dict_pr['requirements'].append(str(details[3]))

vac_df = pd.DataFrame(vac_dict_pr).set_index('id')

print('Number of new vacancies:', len(vac_df))
    
    
# Put the new vacancies in front of the previously stored ones.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Empty frames are left out so that the 'id' index name of the remaining
# frame(s) is carried over unchanged into the CSV header.
frames = [frame for frame in (vac_df, latest_dataset_pd) if not frame.empty]
new_dataset = pd.concat(frames) if frames else vac_df

# Save pandas df to csv in local folder
new_dataset.to_csv('vacansies_dataset.csv')

In [None]:
#Analyse requirements 
#Words frequency

import string
import pandas as pd
from bs4 import BeautifulSoup # For HTML parsing
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
from stop_words import get_stop_words # Filter out stopwords, such as 'the', 'or', 'and'

# Load the latest saved dataset
try:
    latest_dataset = pd.read_csv('vacansies_dataset.csv')
except FileNotFoundError:
    print('There is no existing dataset')
    # Same layout as a loaded dataset, just without rows, so that the
    # 'requirements' column lookup in the analysis does not raise KeyError.
    latest_dataset_pd = pd.DataFrame(columns=['id', 'title', 'time', 'requirements']).set_index('id')
    # An explicit dtype avoids the pandas warning about the default dtype of
    # an empty Series.
    existing_ids = pd.Series(dtype='int64')
else:
    latest_dataset_pd = latest_dataset[['id', 'title', 'time', 'requirements']].set_index('id')
    existing_ids = latest_dataset.id
    print('Number of existing vacancies:', len(existing_ids))

req_words = []

# Loop invariants, built once instead of once per vacancy
table = str.maketrans('', '', string.punctuation)  # mapping that deletes ASCII punctuation
stop_words = set(get_stop_words('ukrainian') + get_stop_words('russian') + get_stop_words('english'))

# Clean data and tokenize.
# .get() keeps the loop from raising KeyError when the frame has no
# 'requirements' column (no dataset was loaded).
for r in latest_dataset_pd.get('requirements', []):
    if not isinstance(r, str):
        # Empty CSV cells are read back as NaN (a float): nothing to tokenize.
        continue

    # Replace every character outside the allowed set with a space and split
    # on whitespace. є, ґ and ё lie outside the а-я range, so they are listed
    # explicitly, like і and ї; otherwise words containing them are cut apart.
    text = re.sub("[^a-zA-Zа-яА-ЯіІїЇєЄґҐёЁ.+3]", " ", r).split()

    stripped = [w.translate(table) for w in text]  # remove punctuation

    words = [word.lower() for word in stripped]  # go to lower case

    # A set, so every word is kept at most once per vacancy
    cleaned_words = set([w for w in words if not w in stop_words])

    req_words.extend(cleaned_words)
    
# Count how often each word occurs in req_words.
# Counter replaces the hand-written tally; dict() keeps the plain-dict type.
req_words_freq = dict(Counter(req_words))

# The explicit dtype keeps an empty result well-defined (no dtype warning).
req_words_freq_pd = pd.Series(req_words_freq, dtype='int64')

# Words that carry no information about the required skills
not_important_words = [
    'na', 'xa', 'понимание', 'работа', 'умение', '', '3', 'tools', 'processing', 'work',
    'good', 'language', 'plus', 'бажано', 'роботи', 'using', 'команде', 'высокие', 'др',
    'способность', 'работать', 'возможно', 'данных', 'практические', 'навыки', 'использованием',
    'различными', 'пр', 'подходов', 'принципов', 'построения', 'data', 'experience', 'skills',
    'опыт', 'strong', 'knowledge', 'etc', 'science', 'работы', 'understanding', 'ability',
    'знание', 'досвід', 'years', 'знання', 'related',
]

# Series -> DataFrame with the words in 'index' and their tallies in 'count'
req_words_freq_pd = req_words_freq_pd.reset_index(name='count')

# Show the 30 most frequent words that are not on the ignore list
is_ignored = req_words_freq_pd['index'].isin(not_important_words)
req_words_freq_pd[~is_ignored].sort_values(by='count', ascending=False).head(30)