# ISDS 2022 Notebook

This notebook contains all the code for group 4 in the ISDS class of 2022. It was written by Nicolai, Mads, Elena and Emilie.

In [None]:
# Setting import and downloads
# This section can be skipped if all relevant packages have been downloaded.

%pip install selenium
%pip install webdriver_manager
%python -m nltk.downloader popular
%pip install tensorflow
%pip install spacy
%pip install wordloud
%python -m spacy download da_core_news_md
%pip install keras
%pip install sentence_transformers


In [None]:
# Import magics
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from pathlib import Path
import re
import requests
from bs4 import BeautifulSoup
from io import StringIO
import time
from queue import Empty

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings("ignore")

import nltk
from nltk import FreqDist
nltk.download('omw-1.4')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer

import tensorflow as tf
import spacy
from spacy import displacy

from sentence_transformers import SentenceTransformer

import string
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import lemmy

from function import *

In [None]:
# Stop words: the project's own list (one word per line in stopord.txt)
# appended to NLTK's Danish stop-word list.
stopord = Path('stopord.txt').read_text(encoding='utf8').splitlines()
stop_word = nltk.corpus.stopwords.words('danish') + stopord

# Language tooling: spaCy's Danish pipeline and the Lemmy lemmatizer.
nlp = spacy.load('da_core_news_md')
lemmatizer = lemmy.load("da")

# Plot style, random seed and the tf.data autotune constant.
sns.set_style("whitegrid")
seed = 42
AUTOTUNE = tf.data.AUTOTUNE

# Functions

In [None]:
def extract_jobindex(tag, pages):
    """
    Scrape the Jobindex archive search and collect links to the matching job postings.

    Parameters
    ----------
    tag : str
        Search term inserted into the ``q`` query parameter.
    pages : int
        Number of result pages to request (``page=0 .. pages-1``).

    Returns
    -------
    list of str
        Absolute posting URLs in the order they were found, each listed once.
    """
    headers = {'User-Agent':'kjp538@alumni.ku.dk', 'Name':'Nicolai Bernsen'}
    flat_list = []
    seen = set()
    for page in range(pages):
        url = f"https://www.jobindex.dk/jobsoegning?maxdate=20220101&mindate=20210101&page={page}&jobage=archive&q={tag}"
        # The second positional argument of requests.get is `params`, so the
        # headers must be passed by keyword to actually be sent as headers.
        r = requests.get(url, headers=headers)
        time.sleep(0.5)  # pause between requests
        soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")
        for item in soup.find_all("div", class_="jobsearch-result"):
            anchor = item.select_one('[data-click*="u="]:has(> b)')
            if anchor is None:
                # Result card without a posting link: skip it.
                continue
            link = 'http://www.jobindex.dk' + anchor['data-click']
            # Links are collected once per posting; previously the links of
            # earlier pages were appended again for every later page.
            if link not in seen:
                seen.add(link)
                flat_list.append(link)
    return flat_list

def fetching_occupation(uri):
    """
    Fetch one occupation from the ESCO API and return its skills as a dataframe.

    Parameters
    ----------
    uri : str
        ESCO occupation URI, inserted into the ``uri`` query parameter.

    Returns
    -------
    pandas.DataFrame
        Columns ``job_title``, ``essential_skill`` and ``optional_skill``.
        Essential and optional skills are paired by position; when one list is
        longer than the other the missing side is filled with an empty string,
        so no skill is dropped. The frame is empty when the occupation lists
        no skills.
    """
    from itertools import zip_longest

    headers = {'User-Agent':'kjp538@alumni.ku.dk', 'Name':'Nicolai Bernsen'}
    url = f'http://ec.europa.eu/esco/api/resource/occupation?uri={uri}&language=da'
    # Pass headers by keyword; positionally they would be sent as query parameters.
    response = requests.get(url, headers=headers)
    time.sleep(0.5)
    result = response.json()
    job_title = result['title']

    links = result.get('_links', {})
    essential_skills = [skill['title'] for skill in links.get('hasEssentialSkill', [])]
    optional_skills = [skill['title'] for skill in links.get('hasOptionalSkill', [])]

    joblist = [
        {
            'job_title': job_title,
            'essential_skill': essential_skill,
            'optional_skill': optional_skill,
        }
        for essential_skill, optional_skill
        in zip_longest(essential_skills, optional_skills, fillvalue='')
    ]
    # Build the frame once, and also when there are no skills at all
    # (previously `jobs` was unbound in that case).
    jobs = pd.DataFrame(data=joblist, columns=['job_title', 'essential_skill', 'optional_skill'])
    return jobs

def UG(page, tag):
    """
    Find links to master's programmes in one page of search results on ug.dk.

    Parameters
    ----------
    page : int
        Result page number inserted into the ``page`` query parameter.
    tag : str
        Search term inserted into the URL path.

    Returns
    -------
    list of str
        The ``href`` of every education teaser whose link contains
        ``/kandidatuddannelser/``.
    """
    headers = {'User-Agent':'kjp538@alumni.ku.dk', 'Name':'Nicolai Bernsen'}
    url = f"https://www.ug.dk/search/{tag}?page={page}"
    # Pass headers by keyword; positionally they would be sent as query parameters.
    r = requests.get(url, headers=headers)
    time.sleep(0.5)
    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")
    divs = soup.find_all("div", class_="node node-uddannelse node-teaser clearfix")
    list_of_articles = [div.find('a')['href'] for div in divs]
    list_of_articles_final = [link for link in list_of_articles if '/kandidatuddannelser/' in link]

    return list_of_articles_final

def clean_text(text):
    """
    Lower-case a text and reduce it to space-separated Danish/Latin letters.

    HTML tags are removed as a whole, then every run of characters other than
    ``a-z``, ``æ``, ``ø`` and ``å`` (digits, punctuation, whitespace, ...) is
    replaced by a single space.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned text. Leading/trailing spaces are kept (not stripped).
    """
    text = text.lower()
    # Remove tags before the letter filter: otherwise "<li>" is reduced to the
    # stray word "li" (the old tag alternatives could never match because the
    # non-letter class consumed "<" first). A tag must start with a letter
    # right after "<" or "</", so "3 < 5" is not treated as markup.
    text = re.sub(r"</?[a-z][^>]*>", " ", text)
    # Collapse everything that is not a lower-case letter into one space.
    text = re.sub(r"[^a-zæøå]+", " ", text)
    return text
    
def extract_UG(link):
    """
    Fetch an education page on ug.dk and return selected parts of it as cleaned text.

    From the ``region region-content`` div the first six paragraphs, the first
    bullet list and paragraphs 19-21 are kept (if several such divs exist, the
    last one wins). The pieces are joined and passed through ``clean_text``.

    Parameters
    ----------
    link : str
        Site-relative link, appended to ``http://ug.dk``.

    Returns
    -------
    str
        The cleaned text; the cleaned empty string when the page has no content div.
    """
    headers = {'User-Agent':'kjp538@alumni.ku.dk', 'Name':'Nicolai Bernsen'}
    url = 'http://ug.dk' + link
    # Pass headers by keyword; positionally they would be sent as query parameters.
    r = requests.get(url, headers=headers)
    time.sleep(0.5)
    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")

    # Defaults so a page without the content div no longer raises a NameError.
    intro, indhold, outro = [], [], []
    for item in soup.find_all("div", class_="region region-content"):
        paragraphs = item.find_all("p")
        intro = paragraphs[0:6]
        indhold = item.find_all('ul')[0:1]
        outro = paragraphs[19:22]

    text = ' '.join(str(i) for i in intro) + ' '.join(str(i) for i in indhold) + ' '.join(str(i) for i in outro)
    text = clean_text(text)
    return text

def write_text(list_name, list_):
    """
    Save the items of an iterable to ``<list_name>.txt`` (UTF-8).

    Every item is written as its string form followed by a single space.

    Parameters
    ----------
    list_name : str
        Output path without the ``.txt`` extension.
    list_ : iterable
        Items to write; any object is formatted with ``str()``.
    """
    # f-string formatting (instead of `"%s " % item`) also works for tuple
    # items; the `with` block closes the file, so no explicit close is needed.
    with open(f"{list_name}.txt", "w", encoding='utf-8') as fp:
        fp.writelines(f"{item} " for item in list_)

def extract_ku(fag):
    """
    Download the "faglig profil og job" page of a master's programme at the
    University of Copenhagen.

    Parameters
    ----------
    fag : str
        Programme slug used in the studier.ku.dk URL.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed page.
    """
    headers = {'User-Agent':'kjp538@alumni.ku.dk', 'Name':'Nicolai Bernsen'}
    url = f"https://studier.ku.dk/kandidat/{fag}/faglig-profil-og-job/"
    # Pass headers by keyword; positionally they would be sent as query parameters.
    r = requests.get(url, headers=headers)
    time.sleep(0.5)
    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")

    return soup

def transform_ku(soup):
    """
    Pull the paragraphs and bullet lists out of a University of Copenhagen page.

    Returns a list with, for every main-content div, its ``<p>`` result set,
    followed by, for every main-content div, its ``<ul>`` result set.
    """
    main_divs = soup.find_all("div", class_="col-xs-12 col-sm-8 col-md-6 main-content")

    paragraphs = [div.find_all("p") for div in main_divs]
    bullet_lists = [div.find_all("ul") for div in main_divs]

    return paragraphs + bullet_lists

def extract_au(fag):
    """
    Download the page of a programme on bachelor.au.dk (Aarhus University).

    Parameters
    ----------
    fag : str
        Programme slug used in the bachelor.au.dk URL.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed page.
    """
    headers = {'User-Agent':'kjp538@alumni.ku.dk', 'Name':'Nicolai Bernsen'}
    url = f"https://bachelor.au.dk/{fag}"
    # Pass headers by keyword; positionally they would be sent as query parameters.
    r = requests.get(url, headers=headers)
    time.sleep(0.5)
    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")

    return soup

def extract_aau(fag):
    """
    Download the page of a master's programme on aau.dk (Aalborg University).

    Parameters
    ----------
    fag : str
        Programme slug used in the aau.dk URL.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed page.
    """
    headers = {'User-Agent':'kjp538@alumni.ku.dk', 'Name':'Nicolai Bernsen'}
    url = f"https://www.aau.dk/uddannelser/kandidat/{fag}"
    # Pass headers by keyword; positionally they would be sent as query parameters.
    r = requests.get(url, headers=headers)
    time.sleep(0.5)
    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")

    return soup

def make_a_list(series):
    """
    Clean every entry of a pandas Series with ``clean_text`` and return the
    cleaned strings as a one-column dataframe.
    """
    cleaned_entries = [clean_text(entry) for entry in series.to_list()]
    return pd.DataFrame(data=cleaned_entries)

def generate_better_wordcloud(data, size):
    """
    Draw a word cloud of ``data`` (a text string) in a figure of the given size.

    The module-level ``stop_word`` list is excluded from the cloud.
    """
    cloud_options = dict(
        scale=3,
        max_words=None,            # maximum number of words in the cloud
        colormap='ocean',          # colour map of the words
        background_color='white',
        max_font_size=95,
        mask=None,
        relative_scaling=0.5,
        stopwords=stop_word,       # the combined stop-word list
        collocations=False,
    )
    cloud = WordCloud(**cloud_options).generate(data)

    plt.figure(figsize=size)
    plt.imshow(cloud)
    plt.axis('off')  # hide the axes
    plt.show()

def frequency_plot(data):
    """
    Show a horizontal bar chart of the 20 most common items in ``data``.
    """
    top_counts = pd.Series(dict(FreqDist(data).most_common(20)))

    fig, ax = plt.subplots(figsize=(10,10))
    sns.barplot(x=top_counts.values, y=top_counts.index, ax=ax, color='navy', ci=None)
    ax.set_ylabel('Words', fontsize=14)
    ax.set_xlabel('Count', fontsize=14)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=16)
    plt.show()

def lemmatize_text(input):
    """
    Lemmatize every token in ``input`` with the module-level Lemmy lemmatizer.

    Returns one list per token holding at most the first lemma candidate.
    """
    return [lemmatizer.lemmatize('', token)[:1] for token in input]

def clean_jobindex(search_word):
    """
    Scrape, clean and lemmatize the text of archived Jobindex postings.

    The posting links of the first 4 result pages are taken from
    ``extract_jobindex``. For each posting the archive iframe is located, the
    archived page is downloaded and its body text is collected. The combined
    text is cleaned, tokenized and lemmatized, and stop words are removed.

    Parameters
    ----------
    search_word : str
        Search term passed on to ``extract_jobindex``.

    Returns
    -------
    list of str
        Lemmatized words in document order, without stop words.
    """
    headers = {'User-Agent':'kjp538@alumni.ku.dk', 'Name':'Nicolai Bernsen'}

    # Step 1: find the archive iframe source of every posting.
    archive_sources = []
    for posting_url in extract_jobindex(search_word, 4):
        # Pass headers by keyword; positionally they would be sent as query parameters.
        r = requests.get(posting_url, headers=headers)
        time.sleep(0.5)
        soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")
        for frame in soup.find_all('iframe', class_='archive-content'):
            archive_sources.append(frame['src'])

    # Step 2: download every archived posting and keep its body text.
    posting_texts = []
    for source in archive_sources:
        archive_url = 'http://www.jobindexarkiv.dk/cgi/showarchive.cgi' + source
        r = requests.get(archive_url, headers=headers)
        time.sleep(0.5)
        soup = BeautifulSoup(r.content, "html.parser")
        for body in soup.find_all('body'):
            posting_texts.append(body.get_text())

    # Step 3: clean, tokenize, lemmatize and drop stop words.
    tokens = nltk.word_tokenize(clean_text(' '.join(str(text) for text in posting_texts)))
    lemmas = [' '.join(candidates) for candidates in lemmatize_text(tokens)]
    stop_words = set(stop_word)  # set lookup instead of scanning the list per word
    final_job = [word for word in lemmas if word not in stop_words]

    return final_job

def find_search_words(df, type_):
    """
    Build the list of unique, lemmatized skill words for one job title.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``job_title``, ``optional_skill`` and
        ``essential_skill``.
    type_ : str
        Job title whose rows are selected.

    Returns
    -------
    list of str
        Cleaned, lemmatized skill words without stop words; each word appears
        once, in order of first appearance (optional skills first).
    """
    # Filter on the frame that was passed in (previously the global `jobs`
    # frame was used for the mask, ignoring the `df` argument).
    searchword = df.loc[df['job_title'] == f'{type_}']

    search_words = list(searchword['optional_skill']) + list(searchword['essential_skill'])
    tokens = nltk.word_tokenize(clean_text(' '.join(search_words)))
    lemmas = [' '.join(candidates) for candidates in lemmatize_text(tokens)]
    final_list = [word for word in lemmas if word not in stop_word]

    # Order-preserving de-duplication.
    result = list(dict.fromkeys(final_list))

    return result

## Scraping and cleaning Jobindex data

In [None]:
# Scrape and clean the Jobindex postings for each degree (slow: many requests).
jobindex_psych = clean_jobindex('cand.psych')
jobindex_oecon = clean_jobindex('cand.oecon')
jobindex_pol = clean_jobindex('cand.scient.pol')
jobindex_anth = clean_jobindex('cand.scient.anth')
jobindex_soc = clean_jobindex('cand.scient.soc')

# One space-separated string per degree, used for the word clouds.
jobindex_psych_full = ' '.join(jobindex_psych)
jobindex_oecon_full = ' '.join(jobindex_oecon)
jobindex_pol_full = ' '.join(jobindex_pol)
jobindex_anth_full = ' '.join(jobindex_anth)
jobindex_soc_full = ' '.join(jobindex_soc)

# Count words from the word lists; len() of the joined strings would count characters.
print(f'Number of word in \n psych: {len(jobindex_psych)}, \n oecon: {len(jobindex_oecon)},\n \
pol: {len(jobindex_pol)}, \n anth: {len(jobindex_anth)}, \n soc: {len(jobindex_soc)}')

### Plotting Jobindex data using Wordcloud

In [None]:
# Word cloud of the cleaned Jobindex text for cand.oecon.
generate_better_wordcloud(jobindex_oecon_full, (10,8))

In [None]:
# Word cloud of the cleaned Jobindex text for cand.psych.
generate_better_wordcloud(jobindex_psych_full, (10,8))

In [None]:
# Word cloud of the cleaned Jobindex text for cand.scient.pol.
generate_better_wordcloud(jobindex_pol_full, (10,8))

In [None]:
# Word cloud of the cleaned Jobindex text for cand.scient.anth.
generate_better_wordcloud(jobindex_anth_full, (10,8))

In [None]:
# Word cloud of the cleaned Jobindex text for cand.scient.soc.
generate_better_wordcloud(jobindex_soc_full, (10,8))

## Importing skills from ESCO

In [None]:
# ESCO occupation URIs to fetch skills for.
occupations = ['http://data.europa.eu/esco/occupation/99492920-e5a5-4dba-9e5a-93193147198c', 
'http://data.europa.eu/esco/occupation/11df8941-508c-4103-ad40-52cdf9430a59', 
'http://data.europa.eu/esco/occupation/acf69cab-8629-45c8-ae10-c8fb15f474b6', 
'http://data.europa.eu/esco/occupation/52ded7d7-11df-42e3-b90a-d7f4b70fb4b9',
'http://data.europa.eu/esco/occupation/4f89b0d2-b666-4890-af01-25d1d60da1f3']

# Fetch every occupation, then concatenate once. DataFrame.append was removed
# in pandas 2.0, and growing a frame inside a loop copies it on every step.
jobs = pd.concat([fetching_occupation(uri) for uri in occupations])

# Map the ESCO job titles to the degree names used in the rest of the notebook.
# Only the title column is rewritten: applying the replacement to the skill
# columns as well would corrupt skill text (e.g. "økonomi" -> "cand.oeconi").
jobs['job_title'] = jobs['job_title'].replace({'økonom':'cand.oecon', 'psykolog':'cand.psych', 'antropolog':'cand.scient.anth', 
'politolog':'cand.scient.pol', 'sociolog':'cand.scient.soc'}, regex=True)

In [None]:
# Unique, lemmatized ESCO skill words per degree.
esco_oecon, esco_psych, esco_pol, esco_anth, esco_soc = (
    find_search_words(jobs, degree)
    for degree in ('cand.oecon', 'cand.psych', 'cand.scient.pol', 'cand.scient.anth', 'cand.scient.soc')
)

skill_report = (
    f'Number of skills in: \n psych: {len(esco_psych)}, \n oecon: {len(esco_oecon)}, '
    f'    \n pol: {len(esco_pol)}, \n anth: {len(esco_anth)}, \n soc: {len(esco_soc)}'
)
print(skill_report)

### Finding matches between Jobindex data and Esco skills

In [None]:
def matches(esco, jobindex):
    """
    Return the ESCO words that occur as whole words in the Jobindex data.

    Parameters
    ----------
    esco : iterable of str
        Skill words to look for.
    jobindex : str or collection of str
        Either a whitespace-separated text or a collection of words.

    Returns
    -------
    list of str
        The words of ``esco`` found in ``jobindex``, in ``esco`` order.
    """
    # A plain `word in text` test on a string is a substring test, so "it"
    # would match inside "politik"; split strings into words first.
    words = set(jobindex.split()) if isinstance(jobindex, str) else jobindex
    return [x for x in esco if x in words]

# Report the overlap between ESCO skill words and the Jobindex text per degree.
print('Found the following matches between Esco Skills and skills in Jobindex Data:')
match_report = (
    f'cand.oecon = {matches(esco_oecon, jobindex_oecon_full)} \n '
    f'cand.psych = {matches(esco_psych, jobindex_psych_full)} \n '
    f'cand.scient.soc = {matches(esco_soc, jobindex_soc_full)} \n '
    f'cand.scient.ant = {matches(esco_anth, jobindex_anth_full)} \n '
    f'cand.scient.pol = {matches(esco_pol, jobindex_pol_full)}'
)
print(match_report)

## Scraping KU and other educational institutions

In [None]:
# Programme slugs on studier.ku.dk, in the order of the column names below.
search_list = ['psykologi', 'sociologi', 'statskundskab', 'antropologi', 'oekonomi']

ku_list = []
for k in search_list:
    soup = extract_ku(k)
    ku_list.append(transform_ku(soup))

# One row per programme; columns 0 and 1 hold the entries returned by transform_ku.
ku_df = pd.DataFrame(data=ku_list)

# Merge the two columns, then turn the result into a single-row frame
# with one column per programme.
ku_df = (ku_df[0] + ku_df[1]).to_frame().T

ku_df.columns = ['cand.psych_ku', 'cand.scient.soc_ku', 'cand.scient.pol_ku', 'cand.scient.anth_ku', 'cand.oecon_ku']
# --- Aarhus University: political science ---------------------------------
# The same steps are repeated for every programme below: download the page,
# take a fixed slice of all <p> tags, and join the slice into a single-row,
# single-column dataframe.
soup = extract_au('statskundskab')

# NOTE(review): `divs` is not used further down in this section.
divs = soup.find_all("div", class_="large-8 medium-8 medium-only-portrait-12 small-12 columns")

text = soup.find_all('p')

# NOTE(review): hard-coded paragraph positions; presumably chosen by inspecting
# the page at scraping time, so they break when the page layout changes - verify.
text_stats = text[75:77]

stats_df = pd.DataFrame(data=text_stats, columns=['cand.scient.pol_au'])
stats = pd.DataFrame([', '.join(stats_df['cand.scient.pol_au'].to_list())], columns=['cand.scient.pol_au'])
# --- Aarhus University: economics -------------------------------------------
soup = extract_au('oekonomi')

divs = soup.find_all("div", class_="large-8 medium-8 medium-only-portrait-12 small-12 columns")

text = soup.find_all('p')

# Hard-coded paragraph positions for this page (see note above).
text_oek = text[60:62]

oek_df = pd.DataFrame(data=text_oek, columns=['cand.oecon_au'])
oek = pd.DataFrame([', '.join(oek_df['cand.oecon_au'].to_list())], columns=['cand.oecon_au'])
# --- Aarhus University: anthropology ----------------------------------------
soup = extract_au('antropologi')

divs = soup.find_all("div", class_="large-8 medium-8 medium-only-portrait-12 small-12 columns")

text = soup.find_all('p')

# Hard-coded paragraph positions for this page (see note above).
text_ant = text[50:55]

ant_df = pd.DataFrame(data=text_ant, columns=['cand.scient.anth_au'])
ant = pd.DataFrame([', '.join(ant_df['cand.scient.anth_au'].to_list())], columns=['cand.scient.anth_au'])
# --- Aarhus University: psychology ------------------------------------------
soup = extract_au('psykologi')

divs = soup.find_all("div", class_="large-8 medium-8 medium-only-portrait-12 small-12 columns")

text = soup.find_all('p')

# Hard-coded paragraph positions for this page (see note above).
text_psyk = text[74:78]

psyk_df = pd.DataFrame(data=text_psyk, columns=['cand.psych_au'])
psyk = pd.DataFrame([', '.join(psyk_df['cand.psych_au'].to_list())], columns=['cand.psych_au'])
# Put the four single-column frames side by side (one row, four columns).
frames = [ant, stats, psyk, oek]

au_df = pd.concat(frames, axis=1)

# Cleaned copy of the anthropology text only; the other AU columns are not
# cleaned here.
au_df_list = au_df['cand.scient.anth_au'].to_list()
au_list = []

for i in au_df_list:
    au_list.append(clean_text(i))

au_list
def transform_aau(soup):
    """
    Extract the text of an Aalborg University programme page.

    For every ``<main class="Main_Main__2KIvG">`` element the stripped text of
    its first descendant is appended to the module-level ``aau_list`` as
    ``{"text_aau": <raw text>}``. The cleaned text of the last such element is
    returned.
    """
    for main_tag in soup.find_all("main", class_="Main_Main__2KIvG"):
        raw_text = main_tag.find_all()[0].text.strip()
        aau_list.append({"text_aau": raw_text})
        text_aau = clean_text(raw_text)

    return text_aau

# transform_aau appends one {"text_aau": ...} record per programme to this list.
aau_list = []

search_list = ['psykologi', 'sociologi', 'oekonomi']

for k in search_list:
    try:
        soup = extract_aau(k)
        transform_aau(soup)
    except Exception as exc:
        # Narrowed from a bare `except` so that KeyboardInterrupt still stops
        # the cell; the failure is reported instead of being swallowed silently.
        print(f'Stopped scraping AAU at {k!r}: {exc!r}')
        break

# One row per programme -> transpose to a single row with one column per programme.
aau_df = pd.DataFrame(data=aau_list)

aau_df = aau_df.T

aau_df.columns = ['cand.psych_aau', 'cand.scient.soc_aau', 'cand.oecon_aau']

aau_df = aau_df.reset_index(drop=True)

aau_df

## Combining Dataframes
# Place the KU, AU and AAU frames side by side.
merge_frames = [ku_df, au_df, aau_df]
combined_df = pd.concat(merge_frames, axis=1)

# The KU cells hold collections of tags; flatten each into one comma-joined string.
for ku_column in ['cand.psych_ku', 'cand.scient.pol_ku', 'cand.oecon_ku',
                  'cand.scient.anth_ku', 'cand.scient.soc_ku']:
    combined_df[f'{ku_column}_string'] = [
        ','.join(map(str, cell)) for cell in combined_df[ku_column]
    ]

# Concatenate the texts of all universities per degree.
combined_df['cand.psych'] = (
    combined_df['cand.psych_aau'] + combined_df['cand.psych_au'] + combined_df['cand.psych_ku_string']
)
combined_df['cand.scient.anth'] = (
    combined_df['cand.scient.anth_au'] + combined_df['cand.scient.anth_ku_string']
)
combined_df['cand.scient.pol'] = (
    combined_df['cand.scient.pol_au'] + combined_df['cand.scient.pol_ku_string']
)
combined_df['cand.scient.soc'] = (
    combined_df['cand.scient.soc_aau'] + combined_df['cand.scient.soc_ku_string']
)
combined_df['cand.oecon'] = (
    combined_df['cand.oecon_aau'] + combined_df['cand.oecon_au'] + combined_df['cand.oecon_ku_string']
)

# Only the per-degree columns are kept.
source_columns = [
    'cand.psych_aau', 'cand.scient.soc_aau', 'cand.oecon_aau',
    'cand.psych_au', 'cand.scient.anth_au', 'cand.scient.pol_au', 'cand.oecon_au',
    'cand.psych_ku', 'cand.scient.soc_ku', 'cand.scient.pol_ku', 'cand.scient.anth_ku', 'cand.oecon_ku',
    'cand.psych_ku_string', 'cand.scient.pol_ku_string', 'cand.oecon_ku_string',
    'cand.scient.anth_ku_string', 'cand.scient.soc_ku_string',
]
combined_df.drop(columns=source_columns, inplace=True)

university_df = combined_df
university_df

## Scraping UG
# Degree names used as search terms on ug.dk.
search_list = ['cand.psych', 'cand.oecon', 'cand.scient.pol', 'cand.scient.anth', 'cand.scient.soc']

# One list of education links per search term, in search_list order.
education_url = []

for k in search_list:
    # Only the first result page (page 0) is requested.
    for i in range(1):
        try: 
            education_url.append(UG(i,k))
        except:
            # NOTE(review): any error skips this search term silently, which
            # shifts the positions used by the index-based lines below.
            break
        
# Drop leading search hits so that element [0] of each list is the wanted
# education page.
# NOTE(review): the positions are hard-coded - presumably based on the search
# results at scraping time; verify. These two lines also mutate the lists in
# place, so they must run exactly once per scrape.
education_url[1].pop(0)
del education_url[4][0:3]
# Fetch and clean the page text of the first remaining link per degree
# (extract_UG already returns cleaned text).
psych = clean_text(extract_UG(education_url[0][0]))
oecon = extract_UG(education_url[1][0])
pol = extract_UG(education_url[2][0])
anth = extract_UG(education_url[3][0])
soc = extract_UG(education_url[4][0])

## Combining skills from UG and Universities
# Prepend the UG text of each degree to the university text column
# (string + Series concatenates element-wise and yields a Series).
strings_psych = psych + university_df['cand.psych']
strings_oecon = oecon + university_df['cand.oecon']
strings_pol = pol + university_df['cand.scient.pol']
strings_anth = anth + university_df['cand.scient.anth']
strings_soc = soc + university_df['cand.scient.soc']


# Join the entries of each Series into one string per degree.
psych_comb = " ".join(strings_psych)
oecon_comb = " ".join(strings_oecon)
pol_comb = " ".join(strings_pol)
anth_comb = " ".join(strings_anth)
soc_comb = " ".join(strings_soc)
# Wrap every combined string in a one-element Series ...
psych_series = pd.Series(psych_comb)
oecon_series = pd.Series(oecon_comb)
pol_series = pd.Series(pol_comb)
anth_series = pd.Series(anth_comb)
soc_series = pd.Series(soc_comb)

# ... and every Series in a one-column dataframe (the column is labelled 0).
psych_df = pd.DataFrame(psych_series)
oecon_df = pd.DataFrame(oecon_series)
pol_df = pd.DataFrame(pol_series)
anth_df = pd.DataFrame(anth_series)
soc_df = pd.DataFrame(soc_series)

# Copy column 0 to a column named after the degree.
psych_df['cand.psych'] = psych_df[0]
oecon_df['cand.oecon'] = oecon_df[0]
pol_df['cand.scient.pol'] = pol_df[0]
anth_df['cand.scient.anth'] = anth_df[0]
soc_df['cand.scient.soc'] = soc_df[0]
# Single-row frame with one text column per degree.
final_df = pd.concat([psych_df['cand.psych'], oecon_df['cand.oecon'], pol_df['cand.scient.pol'], \
                      anth_df['cand.scient.anth'], soc_df['cand.scient.soc']], axis=1)

degree_columns = ['cand.psych', 'cand.oecon', 'cand.scient.pol', 'cand.scient.anth', 'cand.scient.soc']

# Clean every degree column and stack the results (one row per degree), then
# transpose back to one column per degree. pd.concat replaces the chained
# DataFrame.append calls, which were removed in pandas 2.0.
final_df = pd.concat([make_a_list(final_df[column]) for column in degree_columns]).T
final_df.columns = degree_columns

# Tokenize each text and remove stop words, column by column.
for column in degree_columns:
    final_df[column] = final_df[column].apply(nltk.word_tokenize)
    final_df[column] = final_df[column].apply(
        lambda words: [word for word in words if word not in stop_word]
    )

In [None]:
def df_to_list(series):
    """
    Lemmatize the tokens stored in a dataframe column and drop stop words.

    Parameters
    ----------
    series : iterable of iterables of str
        Each entry is a collection of tokens (e.g. one cell per row).

    Returns
    -------
    list of str
        Lemmatized words from all entries, in order, without stop words.
        Empty when ``series`` is empty.
    """
    words = []
    for tokens in series:
        # Every row contributes (previously only the last row was kept, and an
        # empty series raised a NameError). Tokens are re-split on single
        # spaces, as before.
        words.extend(' '.join(list(tokens)).split(' '))

    lemmas = [' '.join(candidates) for candidates in lemmatize_text(words)]
    final_list_ = [word for word in lemmas if word not in stop_word]

    return final_list_

# Lemmatized word lists for the education texts, one per degree.
udd_oecon_list, udd_psych_list, udd_pol_list, udd_anth_list, udd_soc_list = (
    df_to_list(final_df[column])
    for column in ('cand.oecon', 'cand.psych', 'cand.scient.pol', 'cand.scient.anth', 'cand.scient.soc')
)

# The same words as one space-separated string per degree (word-cloud input).
udd_oecon_string, udd_psych_string, udd_anth_string, udd_pol_string, udd_soc_string = (
    ' '.join(words)
    for words in (udd_oecon_list, udd_psych_list, udd_anth_list, udd_pol_list, udd_soc_list)
)

### Plots of Wordcloud for Educations

In [None]:
# Word cloud of the lemmatized education text for cand.oecon.
generate_better_wordcloud(udd_oecon_string, (10,8))

In [None]:
# Word cloud of the lemmatized education text for cand.psych.
generate_better_wordcloud(udd_psych_string, (10,8))

In [None]:
# Word cloud of the lemmatized education text for cand.scient.anth.
generate_better_wordcloud(udd_anth_string, (10,8))

In [None]:
# Word cloud of the lemmatized education text for cand.scient.soc.
generate_better_wordcloud(udd_soc_string, (10,8))

In [None]:
# Word cloud of the lemmatized education text for cand.scient.pol.
generate_better_wordcloud(udd_pol_string, (10,8))

In [None]:
def find_words(words_to_search, list_, name):
    """
    Print one line for every word of ``words_to_search`` that occurs in ``list_``.

    ``name`` is only used as a label in the printed message.
    """
    present_words = [word for word in words_to_search if word in list_]
    for word in present_words:
        print(f'{word} is present in the list of {name}')

# Report which ESCO skill words occur in the education word lists, per degree.
for esco_words, education_words, label in [
    (esco_oecon, udd_oecon_list, 'oecon'),
    (esco_anth, udd_anth_list, 'anth'),
    (esco_pol, udd_pol_list, 'pol'),
    (esco_psych, udd_psych_list, 'psych'),
    (esco_soc, udd_soc_list, 'soc'),
]:
    find_words(esco_words, education_words, label)

In [None]:
# Since the UFM data source https://datavarehus.ufm.dk/rapporter/ledighed does not provide any API we download the data
# and store it in our github repo:
df_ufm_1 = pd.read_csv("ledighed_drivkraft.csv", sep=';', decimal=',', header=[1], skipinitialspace=True)

df_ufm_2 = pd.read_csv("ledighed.csv", skipinitialspace = True, sep=',', decimal='.')

# We make a sub dataframe for the educations within the fields of social sciences (samf):
df_ss = df_ufm_2.iloc[118:132, :].copy()
# reset_index returns a new frame; the result has to be assigned to take effect.
df_ss = df_ss.reset_index(drop=True)

# Replace "," notation with "." notation, remove "%" signs and convert to float:
df_ss['Ledighedsgrad'] = (
    df_ss['Ledighedsgrad']
    .str.replace(',', '.', regex=False)
    .str.replace('%', '', regex=False)
    .astype(float)
)

# Average unemployment rate per group of rows. Column position 4 is used as in
# the original analysis - NOTE(review): presumably the 'Ledighedsgrad' column; verify.
# .mean() divides by the number of rows actually present in each slice.
# NOTE(review): df_ss has 14 rows (118:132), so the last slice 12:15 holds only
# 2 rows; dividing its sum by 3 understated the average. Confirm whether row
# 132 of the source file should have been included in df_ss.
ledighed_jura = df_ss.iloc[0:3, 4].mean()
gen_ledighed_jura = "{:.2f}".format(ledighed_jura)
print(f'Den gennemsnitlige ledighed for jura er {gen_ledighed_jura} pct.')

ledighed_ervøko = df_ss.iloc[3:6, 4].mean()
gen_ledighed_ervøko = "{:.2f}".format(ledighed_ervøko)
print(f'Den gennemsnitlige ledighed for erhvervsøkonomi er {gen_ledighed_ervøko} pct.')

ledighed_forval = df_ss.iloc[6:9, 4].mean()
gen_ledighed_forval = "{:.2f}".format(ledighed_forval)
print(f'Den gennemsnitlige ledighed for forvaltning er {gen_ledighed_forval} pct.')

ledighed_psyko = df_ss.iloc[9:12, 4].mean()
gen_ledighed_psyko = "{:.2f}".format(ledighed_psyko)
print(f'Den gennemsnitlige ledighed for psykologi er {gen_ledighed_psyko} pct.')

ledighed_øvrig = df_ss.iloc[12:15, 4].mean()
gen_ledighed_øvrig = "{:.2f}".format(ledighed_øvrig)
print(f'Den gennemsnitlige ledighed for øvrige samfundsvidenskabelige uddannelser er {gen_ledighed_øvrig} pct.')

# Bar chart of the five averages.
colors = ['#069AF3','lightblue','#929591','darkseagreen','teal']
objects = ('Law', 'Business Economics', 'Administration', 'Psychology', 'Others')
y_pos = np.arange(len(objects))
values = [ledighed_jura, ledighed_ervøko, ledighed_forval, ledighed_psyko, ledighed_øvrig]
plt.rcParams['axes.facecolor'] = 'white' # change background color
plt.bar(y_pos, values, align='center', alpha=0.7, color=colors)
plt.xticks(y_pos, objects, rotation=0, fontsize=8)
plt.ylabel('Unemployment Rate, %', fontsize=10)
plt.yticks(fontsize=10)
# Apply the layout before saving so the saved file matches the displayed figure.
plt.tight_layout()
plt.savefig('Unemployment_ss2.png', facecolor="white", bbox_inches='tight',transparent=True, pad_inches=0)

In [None]:
# NOTE: this cell modifies df_ufm_1 in place (it drops the last three rows), so
# re-run the cell that reads the CSV before running this cell a second time.
df_ufm_1.drop(df_ufm_1.tail(3).index,inplace=True) # drop last three rows
df_ufm_1 = df_ufm_1.replace(' %', '', regex=True)
df_ufm_1 = df_ufm_1.replace(',', '.', regex=True)
# Normalise the three unemployment-rate columns: decimal point, no "%" sign.
for rate_column in ['Ledighedsgrad', 'Ledighedsgrad.1', 'Ledighedsgrad.2']:
    df_ufm_1[rate_column] = [x.replace(',', '.').replace('%', '') for x in df_ufm_1[rate_column]]


def _antal_ledige(row, rate_col):
    """
    Number of unemployed for one row of df_ufm_1 and one year.

    ``rate_col`` is the position of the unemployment rate (in percent); the
    column right after it is multiplied by 1000 - NOTE(review): presumably a
    population size stated in thousands; verify against the CSV.
    """
    rate = float(df_ufm_1.iloc[row, rate_col])
    size = float(df_ufm_1.iloc[row, rate_col + 1])
    return round((rate / 100) * (size * 1000))


# Rate columns 2, 4 and 6 correspond to the three years 2017, 2018 and 2019.
# Row 3: social sciences
antal_ledige_samf_2017 = _antal_ledige(3, 2)
antal_ledige_samf_2018 = _antal_ledige(3, 4)
antal_ledige_samf_2019 = _antal_ledige(3, 6)
antal_ledige_samf_samlet = antal_ledige_samf_2017 + antal_ledige_samf_2018 + antal_ledige_samf_2019

# Row 4: humanities
antal_ledige_human_2017 = _antal_ledige(4, 2)
antal_ledige_human_2018 = _antal_ledige(4, 4)
antal_ledige_human_2019 = _antal_ledige(4, 6)
antal_ledige_human_samlet = antal_ledige_human_2017 + antal_ledige_human_2018 + antal_ledige_human_2019

# Row 5: technology
antal_ledige_teknik_2017 = _antal_ledige(5, 2)
antal_ledige_teknik_2018 = _antal_ledige(5, 4)
antal_ledige_teknik_2019 = _antal_ledige(5, 6)
antal_ledige_teknik_samlet = antal_ledige_teknik_2017 + antal_ledige_teknik_2018 + antal_ledige_teknik_2019

# Row 6: natural sciences
antal_ledige_natur_2017 = _antal_ledige(6, 2)
antal_ledige_natur_2018 = _antal_ledige(6, 4)
antal_ledige_natur_2019 = _antal_ledige(6, 6)
antal_ledige_natur_samlet = antal_ledige_natur_2017 + antal_ledige_natur_2018 + antal_ledige_natur_2019
print(antal_ledige_natur_samlet)

# Row 7: health sciences
antal_ledige_sundh_2017 = _antal_ledige(7, 2)
antal_ledige_sundh_2018 = _antal_ledige(7, 4)
antal_ledige_sundh_2019 = _antal_ledige(7, 6)
antal_ledige_sundh_samlet = antal_ledige_sundh_2017 + antal_ledige_sundh_2018 + antal_ledige_sundh_2019
print(antal_ledige_sundh_samlet)

# Pie chart
labels = ["Social Sciences", "Humanities", "Technology", "Natural Sciences", "Health Sciences"]

#colors
colors = ['#069AF3','lightblue','#929591','darkseagreen','teal']
y = np.array([antal_ledige_samf_samlet, antal_ledige_human_samlet, antal_ledige_teknik_samlet, antal_ledige_natur_samlet, antal_ledige_sundh_samlet])

# Set the figure size before the figure is created; setting it afterwards only
# affects figures created later.
plt.rcParams["figure.figsize"] = (16,10)
fig1, ax1 = plt.subplots()
patches, texts, autotexts = ax1.pie(y, colors = colors, labels=labels, autopct='%1.1f%%', startangle=90, textprops={'fontsize': 18})
for text in texts:
    text.set_color('black')
for autotext in autotexts:
    autotext.set_color('black')
#draw circle (turns the pie into a donut)
centre_circle = plt.Circle((0,0),0.70,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')
plt.savefig('Unemployment_driving_force.png', facecolor="white", bbox_inches='tight',transparent=True, pad_inches=0)
plt.show()


## Linear regression and plots

In [None]:
# Initially we define the values of the variables (hard-coded per programme)
unem_psy, unem_soc, unem_ant, unem_pol, unem_eco = 10, 15, 20, 5, 2
cosine_psy, cosine_soc, cosine_ant = 0.280990, 0.163015, 0.081864
cosine_pol, cosine_eco = 0.140346, 0.424609

# Row and column labels of the DataFrame; the rows below follow this order
index_values = ['Anthropology', 'Sociology', 'Psychology', 'Political Science', 'Economics']
column_values = ['Cosine Similarity', 'Unemployment_Diff']

# We construct a numpy array with one (cosine similarity, unemployment) row per programme
array = np.array(list(zip(
    [cosine_ant, cosine_soc, cosine_psy, cosine_pol, cosine_eco],
    [unem_ant, unem_soc, unem_psy, unem_pol, unem_eco],
)))

# Creating the dataframe
df = pd.DataFrame(array, index=index_values, columns=column_values)

In [None]:
# Regress unemployment on cosine similarity with and without the Political
# Science observation (treated as an outlier) and plot both fits side by side.
outlier_label = "Political Science"

# Build a separate frame without the outlier instead of dropping the row from
# `df` in place: the in-place drop raised a KeyError when this cell was re-run
# and left `df` permanently altered.
df_no_outlier = df.drop(index=outlier_label)

sns.set(rc = {'figure.figsize':(10,5)})

# --- OLS on the full sample ---
# Define response and predictor; add_constant supplies the intercept column
y = df['Unemployment_Diff']
x = sm.add_constant(df['Cosine Similarity'])
model = sm.OLS(y, x).fit()

# View model summary
print(model.summary())

# --- Fitted lines for the two panels ---
# scikit-learn expects 2-D arrays, hence the reshape to a single column
y1 = df["Unemployment_Diff"].values.reshape(-1,1)
X1 = df["Cosine Similarity"].values.reshape(-1,1)
simple1 = LinearRegression()
simple1.fit(X1, y1)
pred1 = simple1.predict(X1)

# Same fit on the sample without the outlier
y2 = df_no_outlier["Unemployment_Diff"].values.reshape(-1,1)
X2 = df_no_outlier["Cosine Similarity"].values.reshape(-1,1)
simple2 = LinearRegression()
simple2.fit(X2, y2)
pred2 = simple2.predict(X2)

# --- Scatter plots with regression lines ---
figure, axes = plt.subplots(1, 2, sharex=True)
axes[0].set_title('(I) Regression plot excluded outlier', fontweight="bold")
axes[1].set_title('(II) Regression plot included outlier', fontweight="bold")
sns.scatterplot(ax=axes[0], x='Cosine Similarity', # Horizontal axis
                y='Unemployment_Diff', # Vertical axis
                data=df_no_outlier) # Data source
sns.scatterplot(ax=axes[1], x='Cosine Similarity', # Horizontal axis
                y='Unemployment_Diff', # Vertical axis
                data=df) # Data source

# Highlight the outlier on panel (II); it is addressed by label and drawn on an
# explicit axes rather than by row position on the implicit current axes.
axes[1].scatter(df.loc[outlier_label, "Cosine Similarity"],
                df.loc[outlier_label, "Unemployment_Diff"], color='orange')

axes[0].plot(X2, pred2, '-r')
axes[1].plot(X1, pred1, '-b')

# Set axis labels on both panels
for panel in axes:
    panel.set_xlabel("tf-idf Cosine Similarity")
    panel.set_ylabel("Core Unemployment, per cent")

# Label points on the plot (I).
# The point at (0.082, 20) is the Anthropology row and the point at (0.281, 10)
# is the Psychology row of `df`; these two labels were previously swapped.
axes[0].text(0.086, 19.9, "cand.scient.anth", horizontalalignment='left', size='small', color='black')
axes[0].text(0.159, 15, "cand.scient.soc", horizontalalignment='right', size='small', color='black')
axes[0].text(0.285 , 10, "cand.psych", horizontalalignment='left', size='small', color='black')
axes[0].text(0.415 , 2.1, "cand.econ", horizontalalignment='right', size='small', color='black')
# Label points on the plot (II)
axes[1].text(0.15, 5, "cand.scient.pol", horizontalalignment='left', size='small', color='black')
axes[1].text(0.086, 19.9, "cand.scient.anth", horizontalalignment='left', size='small', color='black')
axes[1].text(0.17, 15, "cand.scient.soc", horizontalalignment='left', size='small', color='black')
axes[1].text(0.287 , 10, "cand.psych", horizontalalignment='left', size='small', color='black')
axes[1].text(0.415 , 2.1, "cand.econ", horizontalalignment='right', size='small', color='black')
# plt.savefig('Regression_fig.png', bbox_inches='tight', pad_inches=0, dpi=300)
plt.tight_layout()

# --- OLS on the sample without the outlier ---
y = df_no_outlier['Unemployment_Diff']
x = sm.add_constant(df_no_outlier['Cosine Similarity'])
model = sm.OLS(y, x).fit()

# View model summary
print(model.summary())

### Vectorizing words with TfidfVectorizer

In [None]:
def make_vectorizer(input_udd, input_jobindex):
    """
    Return the tf-idf cosine similarity between an education text and a
    job-posting text.

    Both strings are vectorized together with a TfidfVectorizer (case is
    preserved, lowercase=False) and the similarity between the two resulting
    document vectors is read from the pairwise similarity matrix.

    Parameters
    ----------
    input_udd : str
        Text from the education websites.
    input_jobindex : str
        Text from jobindex.dk.

    Returns
    -------
    float
        The off-diagonal entry of the 2x2 pairwise similarity matrix.
    """
    vector = TfidfVectorizer(lowercase=False)
    documents = [input_udd, input_jobindex]  # The order does not make a difference here
    ret = vector.fit_transform(documents)

    # TfidfVectorizer L2-normalises each row by default, so the product of the
    # matrix with its transpose holds the pairwise cosine similarities.
    pairwise_similarity = (ret @ ret.T).toarray()

    # Read the cross-document entry directly. Filtering all cells on a value
    # window and taking the second hit raised IndexError whenever the actual
    # similarity fell outside that window.
    return float(pairwise_similarity[0, 1])

# tf-idf similarity per programme; the pairs are listed in the row order
# oecon, psych, pol, soc, anth that the index labels below assume
list_ = tuple(
    make_vectorizer(udd_text, job_text)
    for udd_text, job_text in (
        (udd_oecon_string, jobindex_oecon_full),
        (udd_psych_string, jobindex_psych_full),
        (udd_pol_string, jobindex_pol_full),
        (udd_soc_string, jobindex_soc_full),
        (udd_anth_string, jobindex_anth_full),
    )
)

# One unnamed column of similarities, then a labelled copy of it
df_sim = pd.DataFrame(list_)
programme_by_row = dict(enumerate(
    ['cand.oecon', 'cand.psych', 'cand.scient.pol', 'cand.scient.soc', 'cand.scient.anth']
))
df_tfidf = df_sim.rename(index=programme_by_row)
df_tfidf.columns = ['Tfidf: Cosine Similarity']

### Finding similarities with Spacy

In [None]:
# Run the spaCy pipeline on the education texts ...
oecon_udd_nlp = nlp(udd_oecon_string)
soc_udd_nlp = nlp(udd_soc_string)
anth_udd_nlp = nlp(udd_anth_string)
psych_udd_nlp = nlp(udd_psych_string)
pol_udd_nlp = nlp(udd_pol_string)

# ... and on the job-posting texts
oecon_job_nlp = nlp(jobindex_oecon_full)
soc_job_nlp = nlp(jobindex_soc_full)
anth_job_nlp = nlp(jobindex_anth_full)
psych_job_nlp = nlp(jobindex_psych_full)
pol_job_nlp = nlp(jobindex_pol_full)

# Compute every similarity once and key it by programme. The values used to be
# collected in the order (oecon, soc, anth, psych, pol) but labelled positionally
# as (oecon, psych, pol, soc, anth), so four of the five rows carried the wrong
# programme name. Keying by label removes the positional mapping altogether.
spacy_similarities = {
    'cand.oecon': oecon_job_nlp.similarity(oecon_udd_nlp),
    'cand.psych': psych_job_nlp.similarity(psych_udd_nlp),
    'cand.scient.pol': pol_job_nlp.similarity(pol_udd_nlp),
    'cand.scient.soc': soc_job_nlp.similarity(soc_udd_nlp),
    'cand.scient.anth': anth_job_nlp.similarity(anth_udd_nlp),
}

print(f'cand.oecon similarity: {spacy_similarities["cand.oecon"]} \n \
cand.soc similarity: {spacy_similarities["cand.scient.soc"]} \n \
cand.anth similarity: {spacy_similarities["cand.scient.anth"]} \n \
cand.psych similarity: {spacy_similarities["cand.psych"]} \n \
cand.pol similarity: {spacy_similarities["cand.scient.pol"]}')

# Rows follow the dict order above: oecon, psych, pol, soc, anth
list_spacy = tuple(spacy_similarities.values())
df_spacy = pd.DataFrame(data=list_spacy, columns=['SpaCy: Cosine Similarity'],
                        index=list(spacy_similarities))

### Vectorizing words with Bert

In [None]:
def jobindex_without_stopwords(search_word, pages=4):
    """
    Scrape the archived job postings found for `search_word` and return their
    text after clean_text, without lemmatizing or removing stopwords.
    This way string data is being prepared to be used in the BERT model.

    Parameters
    ----------
    search_word : str
        Search tag passed on to extract_jobindex.
    pages : int, optional
        Number of result pages passed on to extract_jobindex (default 4).

    Returns
    -------
    The result of clean_text applied to the space-joined <body> texts of all
    archived postings (presumably a string -- confirm against clean_text).
    """
    headers = {'User-Agent':'kjp538@alumni.ku.dk'}

    # Step 1: collect the archive iframe links from every posting page
    archive_urls = []
    for posting_url in extract_jobindex(search_word, pages):
        # `headers` must be passed by keyword: the second positional argument
        # of requests.get is `params`, which would put the dict in the query string.
        r = requests.get(f"{posting_url}", headers=headers)
        time.sleep(0.5)  # pause between requests
        soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")
        for frame in soup.find_all('iframe', class_='archive-content'):
            archive_urls.append('http://www.jobindexarkiv.dk/cgi/showarchive.cgi' + frame['src'])

    # Step 2: download every archived posting and keep the text of its <body>
    body_texts = []
    for archive_url in archive_urls:
        r = requests.get(archive_url, headers=headers)
        time.sleep(0.5)  # pause between requests
        soup = BeautifulSoup(r.content, "html.parser")
        for body in soup.find_all('body'):
            body_texts.append(body.get_text())

    # Step 3: join and clean. The cleaned text used to be computed and then
    # discarded, with an always-empty list returned instead.
    job_ = ' '.join(body_texts)
    return clean_text(job_)

def udd_without_stopwords(series):
    """
    Join the tokens of every entry in `series` into one space-separated string.

    Parameters
    ----------
    series : iterable
        Iterable (e.g. a pandas Series) whose entries are iterables of strings.
        NOTE(review): assumes each entry is a list of tokens -- a plain string
        entry would be split into single characters; confirm against final_df.

    Returns
    -------
    str
        All tokens joined by a single space; '' for an empty input.
    """
    tokens = []
    for entry in series:
        # Skip missing values (None / NaN) explicitly
        if entry is None or (isinstance(entry, float) and entry != entry):
            continue
        # Accumulate across entries. The loop used to overwrite its result on
        # each pass, so only the last entry of the series was returned.
        tokens.extend(entry)

    return ' '.join(tokens)

In [None]:
# Education texts for the BERT model, one per programme column of final_df
udd_oecon_bert, udd_psych_bert, udd_pol_bert, udd_anth_bert, udd_soc_bert = (
    udd_without_stopwords(final_df[programme])
    for programme in ('cand.oecon', 'cand.psych', 'cand.scient.pol',
                      'cand.scient.anth', 'cand.scient.soc')
)

# Job-posting texts for the BERT model; scraped in the order psych, oecon,
# pol, anth, soc
(jobindex_bert_psych, jobindex_bert_oecon, jobindex_bert_pol,
 jobindex_bert_anth, jobindex_bert_soc) = (
    jobindex_without_stopwords(tag)
    for tag in ('cand.psych', 'cand.oecon', 'cand.scient.pol',
                'cand.scient.anth', 'cand.scient.soc')
)

In [None]:
# Sentence-transformers model used by bert_model_simmilarities to embed the texts.
# NOTE(review): the checkpoint name suggests English NLI training data --
# confirm that it is appropriate for the Danish texts analysed here.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

def bert_model_simmilarities(udd_list, jobindex_list):
    """
    Return the cosine similarity between the sbert_model embeddings of an
    education text and a job-posting text.

    Parameters
    ----------
    udd_list, jobindex_list :
        The two documents handed to sbert_model.encode as a two-item batch.
        NOTE(review): presumably each should be a single string -- confirm
        what the callers pass in.

    Returns
    -------
    float
        The off-diagonal entry of the 2x2 cosine-similarity matrix.
    """
    document_embeddings = sbert_model.encode([udd_list, jobindex_list])
    similarities = cosine_similarity(document_embeddings)

    # Read the cross-document entry directly. Filtering all cells on the
    # window (0.1, 0.9) and taking the second hit raised IndexError whenever
    # the two documents were more than 0.9 or less than 0.1 similar.
    return float(similarities[0, 1])

# BERT similarity per programme on the udd_*_list / jobindex_* inputs
# (both defined outside this section)
arr_oecon, arr_psych, arr_pol, arr_soc, arr_anth = (
    bert_model_simmilarities(udd_input, job_input)
    for udd_input, job_input in (
        (udd_oecon_list, jobindex_oecon),
        (udd_psych_list, jobindex_psych),
        (udd_pol_list, jobindex_pol),
        (udd_soc_list, jobindex_soc),
        (udd_anth_list, jobindex_anth),
    )
)

# Single-column table with one labelled row per programme
list_ = (arr_oecon, arr_psych, arr_pol, arr_soc, arr_anth)
df_bert = pd.DataFrame(
    list_,
    index=['cand.oecon', 'cand.psych', 'cand.scient.pol', 'cand.scient.soc', 'cand.scient.anth'],
    columns=['Bert: Cosine Similarity'],
)

In [None]:
# Second BERT run on the *_bert inputs produced by udd_without_stopwords and
# jobindex_without_stopwords
arr_oecon_2 = bert_model_simmilarities(udd_oecon_bert, jobindex_bert_oecon)
arr_psych_2 = bert_model_simmilarities(udd_psych_bert, jobindex_bert_psych)
arr_pol_2 = bert_model_simmilarities(udd_pol_bert, jobindex_bert_pol)
arr_soc_2 = bert_model_simmilarities(udd_soc_bert, jobindex_bert_soc)
arr_anth_2 = bert_model_simmilarities(udd_anth_bert, jobindex_bert_anth)

_bert = (arr_oecon_2, arr_psych_2, arr_pol_2, arr_soc_2, arr_anth_2)
df_bert_2 = pd.DataFrame(_bert)

df_bert_2 = df_bert_2.rename(index={0:'cand.oecon', 1:'cand.psych', 2:'cand.scient.pol', 3:'cand.scient.soc', 4:'cand.scient.anth'})
df_bert_2.columns=['Bert_2: Cosine Similarity']

# Select the first-run column explicitly so that re-running this cell does not
# append another 'Bert_2' column to an already combined df_bert.
df_bert = pd.concat([df_bert[['Bert: Cosine Similarity']], df_bert_2], axis=1)

### Combining all data

In [None]:
# Put the BERT, tf-idf and spaCy columns side by side (aligned on the row
# labels) and print the table as LaTeX
df = pd.concat([df_bert, df_tfidf, df_spacy], axis=1)
latex_table = df.to_latex(index=True, multirow=True)
print(latex_table)

### Plotting words

In [None]:
# frequency_plot (defined elsewhere) applied to jobindex_oecon -- presumably the cand.oecon job-posting words; confirm.
frequency_plot(jobindex_oecon)

In [None]:
# frequency_plot (defined elsewhere) applied to jobindex_psych -- presumably the cand.psych job-posting words; confirm.
frequency_plot(jobindex_psych)

In [None]:
# frequency_plot (defined elsewhere) applied to jobindex_pol -- presumably the cand.scient.pol job-posting words; confirm.
frequency_plot(jobindex_pol)

In [None]:
# frequency_plot (defined elsewhere) applied to jobindex_soc -- presumably the cand.scient.soc job-posting words; confirm.
frequency_plot(jobindex_soc)

In [None]:
# frequency_plot (defined elsewhere) applied to jobindex_anth -- presumably the cand.scient.anth job-posting words; confirm.
frequency_plot(jobindex_anth)

In [None]:
# frequency_plot (defined elsewhere) applied to udd_oecon_list -- presumably the cand.oecon education words; confirm.
frequency_plot(udd_oecon_list)

In [None]:
# frequency_plot (defined elsewhere) applied to udd_psych_list -- presumably the cand.psych education words; confirm.
frequency_plot(udd_psych_list)

In [None]:
# frequency_plot (defined elsewhere) applied to udd_pol_list -- presumably the cand.scient.pol education words; confirm.
frequency_plot(udd_pol_list)

In [None]:
# frequency_plot (defined elsewhere) applied to udd_soc_list -- presumably the cand.scient.soc education words; confirm.
frequency_plot(udd_soc_list)

In [None]:
# frequency_plot (defined elsewhere) applied to udd_anth_list -- presumably the cand.scient.anth education words; confirm.
frequency_plot(udd_anth_list)