## Import the packages

In [1]:
# import packages
import requests, json
import pandas as pd, numpy as np
import time, os
from bs4 import BeautifulSoup
import re
from datetime import datetime



## Parsing the data

The following steps parse the scraped html

### The first cell defines our parsing functions

In [2]:
def prepros(html) :
    d=json.loads(html)
    
    soup = BeautifulSoup(d['result_list_box_html'],'lxml')
    
    # this selects the main part of the html
    tabel = soup.find('div', attrs={'class':'results component--default'})
    
    # Jobindex contain both own postings and postings from other
    # the two are treated differently in the data
    # Jobindex' imported postings
    others = re.compile('r[\d]+')
    tabel1 = tabel.findAll('div', attrs={'data-beacon-tid':others})
#    print(tabel1)

    # Jobindex' own postings
    own = re.compile('h[\d]+')
    tabel2 = tabel.findAll('div', attrs={'data-beacon-tid':own})
    #print(tabel2)
    
    return tabel, tabel1, tabel2

def job_descr_own(tabel2) : # den er færdig (næsten, der kommer lidt snask med, men det er ok)
    desc_own = []
    lineshift = re.compile('\n')
    # tag ul skal med 
    for l in tabel2 :
        g1 = l.findAll('p') 
        g2 = l.findAll('li')
        qs = ''
#        print(g)
        for m in g1 :
            qs = qs + m.text + ". " 
        for m in g2 :
            if 'class=' not in m.text :
                qs = qs + m.text + ". " 
        desc_own.append(lineshift.sub("", qs))
    return desc_own

def job_descr_oth(tabel1) : # færdig og virker
    desc_oth = []
    p=re.compile('"')
    lineshift = re.compile('\n')
    besk = []
    for t_ in tabel1 :
        besk.append(lineshift.sub("", t_.text))
    for t in besk :
        s = t.split(sep='    ')
        if len(s) == 1 :
            desc_oth.append("")
        else :
            desc_oth.append(t.split(sep='    ')[1])
    return desc_oth
    
def job_title_oth (tabel) : # færdig og virker
    # udled jobs andre
    jobs_oth = []
    j = tabel.findAll('strong') 
    for l in j :
        jobs_oth.append(l.text)
    return jobs_oth

def job_title_own(tabel2) :
    a = []
    for l in tabel2 :
        try :
            a.append(l.findAll('b')[0].text)
        except :
            print("Der er ikke nogen stillingsbetegnelse", '\n')
            print(l)
            a.append('Ikke fundet')
    return a
    
def firm_place(tabel) : # ok
    firm=[]
    city=[]
    for tag in tabel :
        firm_city=tag.findAll('b')
        if len(firm_city) == 0 :
            firm.append("Ukendt")
            city.append("Uoplyst")
        elif len(firm_city) == 1 :
            firm.append(firm_city[0].text)
            city.append(" ")
        else :
            firm.append(firm_city[0].text)
            city.append(firm_city[1].text)
    return firm, city


def dates(tabel) : # ok
    indented_d=[]
    for tag in tabel :
        dato_site=tag.findAll('time')

        monthval={'januar': 1,'februar': 2, 'marts': 3, 'april': 4, 'maj': 5, 'juni': 6, \
                  'juli': 7,'august': 8,'september': 9,'oktober': 10,'november': 11,'december': 12}

        for i in dato_site:
            t = i.text.split()
#            try :
#            datotal=(t[0][:-1]+ monthval.get(t[1])+ t[2])
            indented_d.append(datetime(int(t[2]), int(monthval.get(t[1])), int(t[0][:-1])))
#                indented_d.append(time.mktime(datetime.strptime(str(datotal), '%d%m%Y').timetuple()))
#            except :
#                indented_d.append(time.time()) # vi bør videreføre sidst kendte værdi også på tværs af sider
    return indented_d

def firms_own_fct(tabel2):
    firms_own = []
    for tag in tabel2 :
        firms_egne=tag('img')
        regex = re.compile('alt="(.*?)" (?!border: 0px; margin: 0)')
        firms_own_=regex.findall(str(firms_egne))
        if len(firms_own_) == 0 :
            firms_own_ = 'Ukendt'
        firms_own.append(firms_own_[0])
    return firms_own


def cities_own(tabel2) :
    city_own = []
    for tag in tabel2 :
        cit = tag('p')
        regex = re.compile('</a>, (.+?)\s*?</p>')
        cities = regex.findall(str(cit))
        if len(cities) == 0 :
            cities = ['Uoplyst']
        city_own.append(cities[0])
    return city_own



### To test the parsing, we select a subsample

In [15]:
# select small subsample
import random, time

ca_samplesize = 200
#base_path = r"C:/Notebooks/jobindex.txt"
base_path = r"C:\Users\pot\Documents\GitHub/jobindex.txt"
#sample_path = r"C:/Notebooks/jobindex_sample.txt"
sample_path = r"C:\Users\pot\Documents\GitHub/jobindex_sample.txt"
f = open(base_path,'r')
s = open(sample_path, 'w')
linienr = 0
t0 = time.time()
for line in f :
    ran = random.uniform(0, 1) 
    if ran < ca_samplesize/17000 :
        linienr += 1
        s.write(line)
f.close()
t1 = time.time()
print("Start:", t0, ". Slut:", t1, ". Linier hentet:", linienr)
print("Kørseltid:", round(t1-t0,2), "sekunder")




Start: 1535401974.650813 . Slut: 1535401984.9324229 . Linier hentet: 182
Kørseltid: 10.28 sekunder


### The function that governs the parsing and concatanate the results

In [5]:

def process_data() :
    t0 = time.time()
    base_path = r"C:\notebooks\jobindex4.txt" # file with scraped jobindex data
#    base_path = r"C:\Users\pot\Documents\GitHub\jobindex2.txt" # file with scraped jobindex data
    

    f = open(base_path, mode='r', encoding='utf8') # open the file for reading # jobindex2 skal læses med , encoding='utf8'
    count = 0
    # loop through the file one line at a time

    output_oth = pd.DataFrame(columns=['date', 'job_title', 'job_describ', 'city', 'company'])
    output_own = pd.DataFrame(columns=['date', 'job_title', 'job_describ', 'city', 'company'])
    
   
    for html in f :
        count += 1
#        print(count, len(html))
        if len(html) == 1 :
            continue
        # here the various processing functions will be called
        tabel, tabel1, tabel2= prepros(html)
#        print(len(tabel))

        desc_own = job_descr_own(tabel2)
#        print("desc_own", '\n', len(desc_own), '\n', desc_own)

        desc_oth = job_descr_oth(tabel1)
#        print('desc_oth', len(desc_oth), desc_oth)
        
        jobs_oth = job_title_oth(tabel)
#        print('jobs_oth', len(jobs_oth), jobs_oth)
        
        firm_oth, city_oth = firm_place(tabel1)
#        print("Andre firms:", len(firm_oth))
        
        post_dates_oth = dates(tabel1)
#        print("dates", len(post_dates_oth))

        post_dates_own = dates(tabel2)
#        print("dates", len(post_dates_own), post_dates_own)

        firms_own = firms_own_fct(tabel2)
#        print("Egne firms:", len(firms_own), '\n', firms_own)

        city_own = cities_own(tabel2)
#        print("Egne city:", len(city_own), '\n', city_own)
#        print("\n")

        jobs_own = job_title_own(tabel2)
#        print("Egne jobtitler:", len(jobs_own), '\n', jobs_own)
#        print("\n")

    # der skal laves en zip-funktion, så de kan blive sat rigtig sammen
#    return desc_own, desc_oth, jobs_oth, firm, city, post_dates
        tmp = pd.DataFrame(list(zip(post_dates_oth, jobs_oth, desc_oth, city_oth, firm_oth)), 
                           columns=['date', 'job_title', 'job_describ', 'city', 'company'])
        output_oth = pd.concat([output_oth, tmp], axis=0, join='inner', ignore_index=True)
 
#        print("Dato:", '\n', post_dates_own, '\n')
#        print("Beskrivelse:", '\n', desc_own, '\n')
#        print("Byer:", '\n', city_own, '\n')
#        print("Firma:", '\n', firms_own, '\n')
    

        tmp2 = pd.DataFrame(list(zip(post_dates_own, jobs_own, desc_own, city_own, firms_own)),
                           columns=['date', 'job_title', 'job_describ', 'city', 'company'])
        output_own = pd.concat([output_own, tmp2], axis=0, join='inner', ignore_index=True)
    f.close()
    t1 = time.time()
    print("Procestid er", int((t1-t0)/60), "minutter og", round((t1-t0)%60, 2), "sekunder")
    #print(output_oth)
    return output_oth, output_own

output_oth, output_own = process_data()

#Procestid er 17 minutter og 59.43 sekunder for both1
#Procestid er 6 minutter og 31.98 sekunder for both2

Procestid er 14 minutter og 31.07 sekunder


### Saves the outcome with pandas pickle

In [6]:
# gem data

print(output_oth.shape)
print(output_own.shape)

df_both = pd.concat([output_oth, output_own], axis=0, join='outer', ignore_index=True)
print(df_both.shape)
df_both.to_pickle('both4.pkl')

#print(df_both.tail(15))

(136062, 5)
(36718, 5)
(172780, 5)


### Procedure to read the pandas pickle

In [60]:
# læs data 
ny1 = pd.read_pickle('both1.pkl')
ny2 = pd.read_pickle('both2.pkl')

df_both = pd.concat([ny1, ny2], axis=0, join='inner', ignore_index=True)

print("Shape", df_both.shape)
print("\n")
print(df_both.head(1), "\n", df_both.tail(2))



Shape (471820, 5)


        date                                          job_title  \
0 2009-12-31  ABB - Salgsingeniør / Lavspændingskomponenter ...   

                                         job_describ                city  \
0  Til vores krævende og kompetente kunder som be...  Region Midtjylland   

   company  
0  ABB A/S   
              date                   job_title  \
471818 2008-01-01  Løn- og Økonomimedarbejder   
471819 2008-01-01        Regnskabsmedarbejder   

                                              job_describ     city  \
471818  Strålfors Information Logistics A/S, Brøndby ....  Brøndby   
471819  Entreprenørfirmaet Per Jørgensen ApS, Årslev ....  Uoplyst   

                                     company  
471818   Strålfors Information Logistics A/S  
471819  Entreprenørfirmaet Per Jørgensen ApS  


In [47]:
#for dates in pd.unique(df_both.date) :
#    print(dates)

cities = {}
for obs in df_both.city :
    cities[obs] += 1

companies = {}
for obs in df_both.company :
    companies[obs] += 1


KeyError: 'Region Midtjylland'