In [1]:
import re
import time
import warnings
from collections import Counter
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup
from dateutil import parser
from numpy import nan

def normalise_time(time_str):
    today = parser.parse(time.asctime(time.localtime(time.time())))
    if time_str.find('30+ days ago') != -1:
        post_date = 'an unknown date more than 30 days ago'
    elif time_str.find('yesterday') + time_str.find('Yesterday') != -2:
        post_date = str(parser.parse(str(today - timedelta(days=1))))
    elif time_str.find('year') + time_str.find('Year') != -2:
        post_date = str(parser.parse(str(today - timedelta(days=int(time_str.split()[0]) * 365))))
    elif time_str.find('month') + time_str.find('Month') != -2:
        post_date = str(parser.parse(str(today - timedelta(days=int(time_str.split()[0]) * 30))))
    elif time_str.find('week') + time_str.find('Week') != -2:
        post_date = str(parser.parse(str(today - timedelta(weeks=int(time_str.split()[0])))))
    elif time_str.find('day') + time_str.find('Day') != -2:
        post_date = str(parser.parse(str(today - timedelta(days=int(time_str.split()[0])))))
    elif time_str.find('hour') + time_str.find('Hour') != -2:
        post_date = str(parser.parse(str(today - timedelta(hours=int(time_str.split()[0])))))
    elif time_str.find('minute') + time_str.find('Minute') != -2:
        post_date = str(parser.parse(str(today - timedelta(minutes=int(time_str.split()[0])))))
    elif time_str.find('second') + time_str.find('Second') != -2:
        post_date = str(parser.parse(str(today - timedelta(seconds=int(time_str.split()[0])))))
    else:
        try:
            post_date = str(parser.parse(time_str))
        except: 
            post_date = 'an unrecognisable date'
    return post_date

def real_check(str):
    check_list = ['journalist','reporter','editor','news']
    j = 0
    for i in check_list:
        j += str.find(i)
    if j == -len(check_list):
        return False

def scrape_careers_information(page_url):
    r = requests.get(page_url)
    data = BeautifulSoup(r.text,"html.parser")
    jobs= []
    all_div = data.find_all('div', attrs={'class':"bti-ui-job-detail-container"}) 
    for i in all_div:
        title = i.find('a').text
        if real_check(title.lower()) == False:
            continue
        job = {}
        job['Title'] = title
        job['Company'] = i.find('div', attrs={'class':"bti-ui-job-result-detail-employer"}).text.strip()
        job['Location'] = i.find('div', attrs={'class':"bti-ui-job-result-detail-location"}).text.strip()
        job['Post_Date'] = normalise_time(i.find('div', attrs={'class':"bti-ui-job-result-detail-age"}).text.strip())
        job['URL'] = 'https://careers.journalists.org{}'.format(i.find('a')['href'])
        job['Current Status'] = nan
        job['Source'] = 'careers'
        job['Description'] = nan
        jobs.append(job)
    return jobs
def scrape_careers_description(description_url):
    r = requests.get(description_url)
    data = BeautifulSoup(r.text,"html.parser")
    return data.find('div', attrs={'class':"bti-jd-description"}).text

def scrape_indeed_information(page_url):
    r = requests.get(page_url)
    data = BeautifulSoup(r.text,"html.parser")
    jobs= []
    all_h2 = data.find_all('h2', attrs={'class':"jobtitle"}) 
    #it's a weirdo page that the last item's 'class' is different from above 9, so that we use its sub label h2.
    for i in all_h2:
        title = i.a['title']
        if real_check(title.lower()) == False:
            continue
        job = {}
        job['Title'] = title
        job['Company'] = i.parent.find('span', attrs={'class':"company"}).text.strip()
        #use back to the higher label
        job['Location'] = i.parent.find('span', attrs={'class':"location"}).text.strip()
        job['Post_Date'] = normalise_time(i.parent.find('span', attrs={'class':"date"}).text.strip())
        job['URL'] = 'https://www.indeed.com/viewjob?jk={}'.format(i['id'][3:])
        job['Current Status'] = nan
        job['Source'] = 'indeed'
        job['Description'] = nan
        jobs.append(job)
        #when the index url exceeds the range of pages in indeed, the page will become circulation, so that we should do duplicate checking
    return jobs

def scrape_indeed_description(description_url):
    r = requests.get(description_url)
    data = BeautifulSoup(r.text,"html.parser")
    return data.find('div', attrs={'class':"jobsearch-JobComponent-description icl-u-xs-mt--md"}).text

def scrape_jobsdb_information(page_url):
    r = requests.get(page_url)
    data = BeautifulSoup(r.text,"html.parser")
    jobs= []
    all_div = data.find_all('div', attrs={'class':"_3ASfTyv _2EUSthc"})
    for i in all_div:
        title = i.find('div', attrs={'class':"_3gfm7U9 _3ho-Knb _2swcdgn"}).a.text
        if real_check(title.lower()) == False:
            continue
        job={}
        job['Title'] = title
        job['Company'] = i.find('div', attrs={'class':"_1NdWRqw _3ho-Knb _2swcdgn"}).find('span').text
        job['Location'] = i.find('div', attrs={'class':"_124cxoK _3ho-Knb _2swcdgn"}).find('span').text
        job['Post_Date'] = normalise_time(i.find('span', attrs={'class':"JG37Vx2 _3Re95QG _2XGgj_O"}).find('span').text)
        job['URL'] = i.find('div', attrs={'class':"_3gfm7U9 _3ho-Knb _2swcdgn"}).a['href']
        job['Current Status'] = nan
        job['Source'] = 'jobsdb'
        job['Description'] = nan
        jobs.append(job)
    return jobs
    
def scrape_jobsdb_description(description_url):
    r = requests.get(description_url)
    data = BeautifulSoup(r.text,"html.parser")
    return data.find('div', attrs={'class':"jobad-primary"}).text

def scrape_all_information(base_url,starting_index,step,function_scrape_information):
    global all_jobs
    all_jobs = []
    page_index = starting_index
    while True: 
        page_url = '{}{}'.format(base_url,page_index)
        try:
            jobs = function_scrape_information(page_url)
        except:
            jobs = []   
        if jobs == []: #if all jobs have been scraped, break the loop
            break
        all_jobs.extend(jobs)
        if len(all_jobs) > 999: 
            break
        page_index += step  #level is in url to indiccate different pages' index
#     all_jobs.sort(key=lambda item: item[3], reverse=False) 

def scrape_all_description(urls_and_sources):
    list_descriptions = []
    list_websites = [m["source"] for m in websites]
    list_functions = [i["function_scrape_description"] for i in websites]
    for u, s in urls_and_sources:
        list_descriptions.append(list_functions[list_websites.index(s)](u))
    return list_descriptions

websites = [
    {
        "source": 'careers',
        "base_url": 'https://careers.journalists.org/jobs/?keywords=journalist&page=',
        "starting_index": 1,        
        "step": 1,
        "function_scrape_information": scrape_careers_information,
        "function_scrape_description": scrape_careers_description,
    }, 
    {
        "source": 'indeed',
        "base_url": 'https://www.indeed.com/jobs?q=journalist&start=',
        "starting_index": 0,        
        "step": 10,
        "function_scrape_information": scrape_indeed_information,
        "function_scrape_description": scrape_indeed_description,
    },
    {
        "source": 'jobsdb',
        "base_url": 'https://hk.jobsdb.com/hk/search-jobs/journalist/',
        "starting_index": 1,        
        "step": 1,
        "function_scrape_information": scrape_jobsdb_information,
        "function_scrape_description": scrape_jobsdb_description,
    },
]  

In [2]:
%%time
import pandas as pd

header = ['Title','Company','Location','Post_Date','URL','Current Status','Source','Description']
df = pd.DataFrame(columns = header)
for i in websites:
    scrape_all_information(i["base_url"],i["starting_index"],i["step"],i["function_scrape_information"])
    try:
        df = df.append(all_jobs,ignore_index=True)
    except IndexError:
        pass
df = df.drop_duplicates(['URL']) #drop dupilictes according to URL
urls_and_sources = zip(df['URL'].tolist(), df['Source'].tolist())
df['Description'] = scrape_all_description(urls_and_sources)
df.to_csv('jobs.csv',na_rep='NaN')

CPU times: user 19.1 s, sys: 589 ms, total: 19.7 s
Wall time: 8min 32s


In [3]:
import csv
from string import punctuation
pd.set_option('max_rows',6000)
pd.set_option('max_colwidth',100)
punctuation += '\"“”‘’—-–'


def count_frequency(text):
    def kill_punctuations_capitals(text): 
        text = text.replace("’s","") #需要先去除‘s，否則去除標點會留下如chinas，trumps這樣的詞
        translator = str.maketrans("","",punctuation) 
        #等價於translator = str.maketrans(punctuation,len(punctuation)*' ')
        #note:str.maketrans(input,output,delete)，input,output長度必須相等
        list_lowercase_without_punctuation = text.lower().translate(translator).split()
        #將大寫字母轉化為小寫，去除標點，列出單詞
        return list_lowercase_without_punctuation


    def extract_meaningful(list): #去除無意義的單詞
        list_meaningful_words = []
        with open ('stopword.txt','r') as s:
            list_stop_words = s.read().split() #讀取stoplist
        for m in list:
            if m not in list_stop_words:
                list_meaningful_words.append(m)
        return list_meaningful_words

    def words_frequency(list): #統計一篇文章中的frequency
        dict_words_frequency={}
        for m in list:
            dict_words_frequency[m]=list.count(m)
        return dict_words_frequency
    return words_frequency(extract_meaningful(kill_punctuations_capitals(text)))


def update_dict(dict0,dict1): #兩個txt中的frequency相加
    for k,v in dict1.items():
        if dict0.__contains__(k):
            dict0[k] += v
        else:
            dict0.update({k : dict1[k]})

def rank_frequency(dict): #根據frequency排序，也可以最後用print(s.sort_values(ascending=False))，但是不方便寫cvs
    dict_frequency_rank={}
    rank = sorted(dict.items(), key=lambda item: item[1], reverse=True) 
    #將字典轉化爲二元數組，並根據字典中value排序
    for m in range(0,len(rank)):
        dict_frequency_rank.update({rank[m][0]:rank[m][1]})
    #再將字典重新整合起來
    return dict_frequency_rank

def write_csv(dict):
    with open('description-keywords-frequency.csv','w',newline='') as f:
        writer = csv.writer(f,delimiter=',')
        header = ['keyword','frequency']
        writer.writerow(header)
        writer.writerows(dict.items())

In [4]:
%%time
import csv
dict_accumulate_frequency = {} #所有文件中的frequency
dict_text_frequency = {} #單個文件中的frequency，方便日後分析每個文件
with open("jobs.csv",'r') as f:
    csv_reader = csv.reader(f)
    for i in csv_reader:
        frequency = count_frequency(i[8])
        dict_text_frequency[i[1]] = rank_frequency(frequency)
        update_dict(dict_accumulate_frequency,frequency) #累計到dict_accumulate_frequency
dict_frequency_rank = rank_frequency(dict_accumulate_frequency) #排序
write_csv(dict_frequency_rank)

CPU times: user 781 ms, sys: 14.8 ms, total: 796 ms
Wall time: 795 ms


In [5]:
import pandas as pd
df = pd.read_csv('description-keywords-frequency.csv').head(500)
# df.to_csv('description-keywords-frequency-rank500.csv')
df

Unnamed: 0,keyword,frequency
0,news,1366
1,work,741
2,media,716
3,experience,681
4,stories,663
5,content,655
6,skills,510
7,digital,503
8,ability,468
9,social,456
