## Testing for data scraping on ESCO and Jobnet

In [None]:
# Import magics

# !pip install selenium
# !pip install webdriver_manager
# !python -m nltk.downloader popular
# !pip install tensorflow
# !pip install spacy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import re
import requests
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from io import StringIO
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from tensorflow.keras import layers
stop_word = nltk.corpus.stopwords.words('danish')
from spacy.lang.da.stop_words import STOP_WORDS
import string
from function import *

# Machine-specific absolute path to the project's `output` directory.
# NOTE(review): not referenced in the visible cells; presumably used when
# saving results further down — confirm, and consider a relative path.
output = Path(r'/Users/nicolaibernsen/Desktop/KU/9.Semester/Introduction_to_Social_Datascience/ISDS_edit/Exam/output')

## Selecting data on Jobindex

In [None]:
def extract_jobindex(page, tag):
    """Scrape one Jobindex search-result page and record its job ads.

    Requests result page ``page`` for the search term ``tag`` from
    jobindex.dk and appends one dict per ad to the module-level
    ``job_list``. Each dict has the keys ``job_title`` (the search term),
    ``title`` and ``summary``.

    Args:
        page: Value placed in the ``page`` query parameter of the URL.
        tag: Value placed in the ``q`` query parameter of the URL; it is
            also stored as ``job_title`` on every ad found.

    Returns:
        None. Results are collected in the global ``job_list``.

    Raises:
        IndexError: If a result block has no ``<b>`` element or fewer than
            two ``<p>`` elements.

    Any exception raised by ``requests.get`` propagates to the caller.
    """
    headers = {'User-Agent': 'kjp538@alumni.ku.dk'}
    url = f"https://www.jobindex.dk/jobsoegning?page={page}&q={tag}"

    # The headers must be passed by keyword: the second positional argument
    # of requests.get() is `params`, so a positional dict would be appended
    # to the query string instead of being sent as HTTP headers.
    r = requests.get(url, headers=headers)

    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")
    divs = soup.find_all("div", class_="jobsearch-result")

    for item in divs:
        # Only title and summary are collected; extraction of company,
        # publication date, location and job URL was disabled in the
        # original code and stays disabled here.
        title = item.find_all("b")[0].text.strip()
        summary = item.find_all("p")[1].text.strip()

        job = {
            "job_title": tag,
            "title": title,
            "summary": summary,
        }

        job_list.append(job)

    return

# Search terms (degree titles) to query on Jobindex.
search_list = ['cand.psych', 'cand.polit', 'cand.scient.pol', 'cand.scient.anth', 'cand.scient.soc']

# Filled in place by extract_jobindex(), which appends one dict per job ad.
job_list = []

for search_term in search_list:
    # Walk result pages 0..99 and stop at the first page that fails.
    for page_number in range(100):
        try:
            # extract_jobindex() returns None, so its result is not kept.
            extract_jobindex(page_number, search_term)
        except Exception as err:
            # `except Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt/SystemExit through so the scrape can still
            # be aborted. The reason for stopping is reported, not hidden.
            print(f"Stopped scraping '{search_term}' at page {page_number}: {err!r}")
            break

# One row per scraped ad.
job_df = pd.DataFrame(data=job_list, columns=['job_title', "title", "summary"])

## Importing skills from ESCO

In [None]:
# ESCO occupation URIs to fetch, one per occupation of interest.
occupations = ['http://data.europa.eu/esco/occupation/99492920-e5a5-4dba-9e5a-93193147198c', 
'http://data.europa.eu/esco/occupation/11df8941-508c-4103-ad40-52cdf9430a59', 
'http://data.europa.eu/esco/occupation/acf69cab-8629-45c8-ae10-c8fb15f474b6', 
'http://data.europa.eu/esco/occupation/52ded7d7-11df-42e3-b90a-d7f4b70fb4b9',
'http://data.europa.eu/esco/occupation/4f89b0d2-b666-4890-af01-25d1d60da1f3']

jobs = pd.DataFrame(columns=['job_title', 'essential_skill', 'optional_skill'])

# DataFrame.append() was removed in pandas 2.0, so the fetched frames are
# stacked with a single pd.concat() call. The empty `jobs` frame is kept as
# the first element so the column order matches the original.
# NOTE(review): assumes fetching_occupation() returns a DataFrame — confirm
# against its definition in `function.py`.
fetched_occupations = [fetching_occupation(uri) for uri in occupations]
jobs = pd.concat([jobs, *fetched_occupations])

# Map the Danish occupation names onto the degree titles used as search
# terms; keys are treated as regular expressions in every column.
jobs = jobs.apply(lambda x: x.replace({'økonom':'cand.polit', 'psykolog':'cand.psych', 'antropolog':'cand.scient.anth', 
'politolog':'cand.scient.pol', 'sociolog':'cand.scient.soc'}, regex=True))

## Scraping UG

In [None]:
# Degree titles to look up on UG.
search_list = ['cand.psych', 'cand.oecon', 'cand.scient.pol', 'cand.scient.anth', 'cand.scient.soc']

# One entry per search term, in search_list order: whatever UG() returns
# for the first result page of that term.
education_url = []

for k in search_list:
    for i in range(1):
        try:
            education_url.append(UG(i, k))
        except Exception as err:
            # `except Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt/SystemExit through, and the failure is
            # reported instead of being swallowed silently.
            # NOTE(review): a failed term adds no entry, which shifts the
            # positional indices used below and in later cells.
            print(f"UG lookup failed for '{k}' (page {i}): {err!r}")
            break

# Manual clean-up of the scraped results: drop the first hit for
# 'cand.oecon' and the first three hits for 'cand.scient.soc'.
# NOTE(review): presumably these are hits that are not the intended
# programme — verify against the current UG() output.
education_url[1].pop(0)
del education_url[4][0:3]

In [None]:
def _tokens_for(position):
    """Tokenize the text extracted from the first URL stored at `position`."""
    first_url = education_url[position][0]
    return nltk.word_tokenize(extract_UG(first_url))


# One token list per programme, in the same order as the UG search terms.
psych = _tokens_for(0)
oecon = _tokens_for(1)
pol = _tokens_for(2)
anth = _tokens_for(3)
soc = _tokens_for(4)

# Keep only the psychology tokens that are not Danish stop words.
psych_final = [token for token in psych if token not in stop_word]

psych_final

In [None]:
# Hand the stop-word-filtered psychology tokens to the project helper
# `write_text`. NOTE(review): presumably this writes the `psych.txt` file
# that is loaded with TextLineDataset in the following cell — confirm
# against the helper's definition.
write_text('psych', psych_final)

In [None]:
# Read psych.txt line by line and keep only lines whose length is non-zero.
psych_path = '/Users/nicolaibernsen/Desktop/KU/9.Semester/Introduction_to_Social_Datascience/ISDS_edit/Exam/psych.txt'
psych_lines = tf.data.TextLineDataset(psych_path)
psych_text = psych_lines.filter(lambda line: tf.cast(tf.strings.length(line), bool))

def custom_standardization(input_data):
  """Lowercase the input strings and strip ASCII punctuation.

  Args:
    input_data: String tensor passed in by the `TextVectorization` layer.

  Returns:
    A string tensor with the text lowercased and every character from
    `string.punctuation` removed.
  """
  # tf.strings.lower() treats its input as ASCII by default, which leaves
  # the Danish capitals Æ, Ø and Å untouched; decode as UTF-8 so they are
  # lowercased as well.
  lowercase = tf.strings.lower(input_data, encoding='utf-8')
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')


# Upper bound on the vocabulary and the fixed number of tokens per sample.
vocab_size = 4096
sequence_length = 10

# `TextVectorization` normalizes each string with `custom_standardization`,
# splits it, and maps the pieces to integers; `output_sequence_length`
# pads all samples to the same length.
vectorize_layer = layers.TextVectorization(
    output_mode='int',
    output_sequence_length=sequence_length,
    max_tokens=vocab_size,
    standardize=custom_standardization,
)

# Build the vocabulary from the text lines, then show its first 20 entries.
adapt_batches = psych_text.batch(4096)
vectorize_layer.adapt(adapt_batches)
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

In [None]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

# Vectorize the lines in batches of 1024, then flatten back to one
# integer sequence per line.
batched_lines = psych_text.batch(1024).prefetch(AUTOTUNE)
text_vector_ds = batched_lines.map(vectorize_layer).unbatch()

# Materialize every sequence as a NumPy array and report how many there are.
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

In [None]:
# Show the first ten integer sequences next to the vocabulary entries
# their ids map to.
for seq in sequences[:10]:
  decoded_tokens = [inverse_vocab[token_id] for token_id in seq]
  print(f"{seq} => {decoded_tokens}")