In [1]:
import requests
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine
from bs4 import BeautifulSoup

In [2]:
def database_connection(path):
    """
    Pull the job, country and age-group columns out of a SQLite database.

    The three tables ``career_info``, ``country_info`` and ``personal_info``
    are inner-joined on ``uuid``; the result is written to
    ``../data/raw/data_from_db.csv`` (index column included) and returned.

    Parameters
    ----------
    path : str
        Location of the SQLite database file. It is interpolated into a
        ``sqlite:///`` SQLAlchemy connection URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``uuid``, ``normalized_job_code``, ``country_code`` and
        ``age_group``.
    """

    conn_str = f'sqlite:///{path}'
    engine = create_engine(conn_str)

    query = """
        SELECT career_info.uuid,
        career_info.normalized_job_code,
        country_info.country_code,
        personal_info.age_group
        FROM career_info
        JOIN country_info
        ON country_info.uuid = career_info.uuid
        JOIN personal_info
        ON personal_info.uuid = career_info.uuid;
        """

    try:
        data = pd.read_sql_query(query, engine)
    finally:
        # Release the pooled connection(s) even when the query fails.
        engine.dispose()

    # to_csv() returns None when it is given a path, so write first and
    # return the frame itself rather than the result of to_csv().
    # NOTE(review): this path is relative to '..' while the other acquisition
    # steps write under 'data/raw/' -- confirm the intended working directory.
    data.to_csv('../data/raw/data_from_db.csv')
    return data

In [3]:
def creating_url_list(rawdb):
    """
    Build one job-lookup API URL per distinct job code found in a CSV file.

    Parameters
    ----------
    rawdb : str or path-like
        CSV file (anything ``pandas.read_csv`` accepts) that contains a
        ``normalized_job_code`` column.

    Returns
    -------
    list of str
        ``http://api.dataatwork.org/v1/jobs/<code>`` for every unique,
        non-missing job code, in order of first appearance.
    """

    raw_db_data = pd.read_csv(rawdb)

    # Rows without a job code are read back as NaN; without dropna() they
    # would produce a meaningless '.../jobs/nan' request.
    job_code_list = raw_db_data['normalized_job_code'].dropna().unique().tolist()

    return [f'http://api.dataatwork.org/v1/jobs/{code}' for code in job_code_list]

In [4]:
def connecting_api(url_list, timeout=30):
    """
    Fetch every URL, decode each response as JSON and collect the results.

    The decoded payloads are assembled into a DataFrame (one row per URL),
    written to ``data/raw/job_titles_df.csv`` (index column included) and
    returned.

    Parameters
    ----------
    url_list : iterable of str
        URLs to request, e.g. the output of ``creating_url_list``.
    timeout : float, optional
        Seconds to wait for each request before ``requests`` raises a
        timeout error. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        One row per requested URL, built from the decoded JSON bodies.

    Notes
    -----
    The HTTP status code is not inspected: whatever JSON body the server
    sends back is stored as a row. A response that is not valid JSON makes
    ``response.json()`` raise.
    """

    json_data_list = []

    # A single Session reuses the underlying connection across requests.
    with requests.Session() as session:
        for url in url_list:
            response = session.get(url, timeout=timeout)
            json_data_list.append(response.json())

    job_titles_df = pd.DataFrame(json_data_list)

    # to_csv() returns None when it is given a path, so write first and
    # return the frame itself.
    job_titles_df.to_csv('data/raw/job_titles_df.csv')

    return job_titles_df

In [6]:
def web_scraping(url, timeout=30):
    """
    Scrape a country-name / country-code table from a web page.

    The first ``<table>`` on the page is read cell by cell. A cell whose text
    starts with ``(`` is treated as a country code (first and last character
    removed); every other cell is treated as a country name. Names and codes
    are paired up in order, written to ``data/raw/country_names.csv`` and
    returned.

    Parameters
    ----------
    url : str
        Page to download. The original notes describe it as a Eurostat page
        -- presumably the country-codes glossary; verify against the caller.
    timeout : float, optional
        Seconds to wait for the request before ``requests`` raises a timeout
        error. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        Indexed by country name, with a single column holding the code.

    Raises
    ------
    ValueError
        If the page contains no ``<table>`` element.
    """

    html = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError(f'no <table> element found at {url}')

    clean_items = [cell.text.strip('\n') for cell in table.find_all('td')]

    country_names = []
    country_codes = []

    for item in clean_items:
        if item.startswith('('):
            # Codes appear wrapped in one leading and one trailing character,
            # e.g. '(BE)'; keep only what is in between.
            country_codes.append(item[1:-1])
        else:
            country_names.append(item)

    # zip() pairs by position and stops at the shorter list; the dict keeps
    # one code per country name.
    country_dict = dict(zip(country_names, country_codes))
    country_df = pd.DataFrame.from_dict(country_dict, orient='index')

    # to_csv() returns None when it is given a path, so write first and
    # return the frame itself.
    country_df.to_csv('data/raw/country_names.csv')

    return country_df

In [7]:
def acquire(path, rawdb, url):
    """
    Run the acquisition steps in order: database dump, API lookup, web scrape.

    Parameters
    ----------
    path : str
        SQLite database file, passed to ``database_connection``.
    rawdb : str or path-like
        CSV file holding a ``normalized_job_code`` column, passed to
        ``creating_url_list``. Presumably the CSV that
        ``database_connection`` has just written -- verify against the caller.
    url : str
        Page holding the country table, passed to ``web_scraping``.

    Returns
    -------
    None
        Each step writes its own CSV file; progress is reported with
        ``print``.
    """

    print('connecting sqlite database and extracting a dataframe...')
    database_connection(path)

    print('establishing connection with the API...')
    # Build the URL list once; it is consumed by the next step.
    url_list = creating_url_list(rawdb)

    print('creating dataframe with job titles...')
    connecting_api(url_list)

    print('creating dataframe with country names...')
    web_scraping(url)