In [3]:
import requests
import pandas as pd
import pymysql
import sqlalchemy as db
from sqlalchemy import create_engine

In [3]:
# creating the connection with database

sqlitedb_path = 'data/raw_data_project_m1.db'

conn_str = f'sqlite:///{sqlitedb_path}'

engine = create_engine(conn_str)
connection = engine.connect()
metadata = db.MetaData()

In [4]:
# creating dataframe from sql query to keep the data we'll need

data = pd.read_sql_query("""
SELECT career_info.uuid,
career_info.normalized_job_code,
country_info.country_code,
personal_info.age_group
FROM career_info
JOIN country_info
ON country_info.uuid = career_info.uuid
JOIN personal_info
ON personal_info.uuid = career_info.uuid;
""", engine)

In [5]:
data

Unnamed: 0,uuid,normalized_job_code,country_code,age
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,,AT,61 years old
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,861a9b9151e11362eb3c77ca914172d0,AT,57 years old
2,83127080-da3d-0133-c74f-0a81e8b09a82,,AT,32 years old
3,15626d40-db13-0133-ea5c-0a81e8b09a82,049a3f3a2b5f85cb2971ba77ad66e10c,AT,45 years old
4,24954a70-db98-0133-4a64-0a81e8b09a82,f4b2fb1aa40f661488e2782b6d57ad2f,AT,41 years old
...,...,...,...,...
9644,7d1ac020-dcb4-0133-817a-0a81e8b09a82,847165cfda6b1dc82ae22b967da8af2f,SK,37 years old
9645,39f989f0-db52-0133-8482-0a81e8b09a82,a4d5b8b38f9513825d0d94a981ebe962,SK,53 years old
9646,70ce4a90-d965-0133-f5e4-0a81e8b09a82,,SK,1992
9647,2896e440-db3c-0133-5b67-0a81e8b09a82,775190277a849cba701b306a7b374c0a,SK,47 years old


In [6]:
# cleaning age column to keep it simple, readable

age_column = [i.strip(' years old') for i in data['age']]

data['age'] = age_column

In [7]:
# some of the values are the birth year so i'm going to calculate the age these people had in 2016 according to the rest of the ages in age column

def calculating_age(year):
    if len(year) == 4:
        return 2016 - round(int(year))
    else:
        return year

data['age'] = data['age'].apply(calculating_age)

In [8]:
# now that we have all values as ages i'm calculating the age these people have now (only 4 years more because 2021 just started)

data['age'] = data['age'].apply(lambda x : int(x) + 4)

In [9]:
# now we're going to create a list with unique job codes

job_code_list = data['normalized_job_code'].unique().tolist()

In [10]:
# from these codes, we're creating urls to create the connection with API 

url_list = []

for i in job_code_list:
    url_list.append(f'http://api.dataatwork.org/v1/jobs/{i}')

In [11]:
# here we'll have a list with the json data we're getting from the connection with the API

lst = []

for url in url_list:
    response = requests.get(url)
    json_data = response.json()
    lst.append(json_data)

In [13]:
job_titles_df = pd.DataFrame(lst)
job_titles_df

Unnamed: 0,error,uuid,title,normalized_job_title,parent_uuid
0,"{'code': 404, 'message': 'Cannot find job with...",,,,
1,,861a9b9151e11362eb3c77ca914172d0,Automatic Data Processing Planner,automatic data processing planner,0148f61d4227497728ce33490843d056
2,,049a3f3a2b5f85cb2971ba77ad66e10c,Data Coordinator,data coordinator,0b9dd32a367f4562ec77b993053d1910
3,,f4b2fb1aa40f661488e2782b6d57ad2f,Database Developer,database developer,b90ca4df5690002377a7b0f1f3d40781
4,,27af8700f5577cec835acee2cb90a2ff,Data Entry Specialist,data entry specialist,bf88358c56bb6cbe7eabec38ae333d19
...,...,...,...,...,...
152,,b0fa6ede410f50b82ab74f5a705fe699,Analytical Data Miner,analytical data miner,9656fa2dc39e9643c00a45858e0117d0
153,,559a21f836c93876f31b60e6d10656a7,Data Analysis Assistant,data analysis assistant,325e2251e20170928426473156bd3c48
154,,c1fb1a01b78373ac2153c66fa08d16dc,Data Examination Clerk,data examination clerk,da412504dd7b130414b7bbfa2acd563b
155,,05bb9a333a66d6eb151e253623efe1c0,Data Entry Clerk,data entry clerk,bf88358c56bb6cbe7eabec38ae333d19


In [14]:
# now we are merging our main df with this one to have the job titles instead of job codes

merged_db = data.merge(job_titles_df, left_on='normalized_job_code', right_on='uuid')

In [15]:
# dropping columns we won't need

cleaning_df = merged_db.drop(columns = ['normalized_job_code', 'error', 'uuid_y', 'normalized_job_title', 'parent_uuid'])

In [16]:
cleaning_df

Unnamed: 0,uuid_x,country_code,age,title
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,AT,65,
1,83127080-da3d-0133-c74f-0a81e8b09a82,AT,36,
2,b50dbb80-da53-0133-8956-0a81e8b09a82,AT,30,
3,9949c4c0-da5f-0133-c832-0a81e8b09a82,AT,26,
4,69f1f400-dc5f-0133-ad9b-0a81e8b09a82,AT,62,
...,...,...,...,...
9644,1cb1aac0-d94c-0133-8baa-0a81e8b09a82,IT,62,Data Capture Clerk
9645,c8c33390-da69-0133-063a-0a81e8b09a82,PL,48,Data Capture Clerk
9646,d27d24d0-d9b1-0133-03d4-0a81e8b09a82,PL,28,Data Capture Clerk
9647,529f3080-d99a-0133-1b7b-0a81e8b09a82,PT,44,Data Capture Clerk


In [17]:
# changing null values to string Unemployed

changing_nulls = cleaning_df.fillna('Unemployed')

In [18]:
# changing some column names to be nearest from the final result we're looking for

changing_column_names = changing_nulls.rename(columns = {'uuid_x': 'Id', 'age' : 'Age', 'title' : 'Job Title'})

In [19]:
# sorting values by age 

df_in_process = changing_column_names.sort_values(by='Age')

In [20]:
df_in_process

Unnamed: 0,Id,country_code,Age,Job Title
241,45a5a440-d974-0133-15c7-0a81e8b09a82,CZ,18,Unemployed
340,05b4f5e0-de48-0133-670d-0a81e8b09a82,DE,18,Unemployed
3880,83c61df0-d94c-0133-03ed-0a81e8b09a82,SE,18,Unemployed
8106,ccf1b0b0-d97d-0133-0d84-0a81e8b09a82,FR,18,Data Miner
2743,23bbbbb0-d8d1-0133-58d2-0a81e8b09a82,IT,18,Unemployed
...,...,...,...,...
154,e3efe070-da41-0133-5bfc-0a81e8b09a82,BG,69,Unemployed
3722,161b07b0-db10-0133-0eba-0a81e8b09a82,PT,69,Unemployed
2624,5a9f9a30-da6b-0133-74d1-0a81e8b09a82,HU,69,Unemployed
3506,8e4b56b0-dc23-0133-4b18-0a81e8b09a82,PL,69,Unemployed


In [25]:
df_in_process.to_csv('/home/emily/Desktop/PROYECTO 1/data/processed_df.csv', index=False)

In [5]:
df = pd.read_csv('data/processed_df.csv')

In [6]:
df

Unnamed: 0,Id,country_code,Age,Job Title
0,45a5a440-d974-0133-15c7-0a81e8b09a82,CZ,18,Unemployed
1,05b4f5e0-de48-0133-670d-0a81e8b09a82,DE,18,Unemployed
2,83c61df0-d94c-0133-03ed-0a81e8b09a82,SE,18,Unemployed
3,ccf1b0b0-d97d-0133-0d84-0a81e8b09a82,FR,18,Data Miner
4,23bbbbb0-d8d1-0133-58d2-0a81e8b09a82,IT,18,Unemployed
...,...,...,...,...
9644,e3efe070-da41-0133-5bfc-0a81e8b09a82,BG,69,Unemployed
9645,161b07b0-db10-0133-0eba-0a81e8b09a82,PT,69,Unemployed
9646,5a9f9a30-da6b-0133-74d1-0a81e8b09a82,HU,69,Unemployed
9647,8e4b56b0-dc23-0133-4b18-0a81e8b09a82,PL,69,Unemployed


In [8]:
df['Age'].unique()

array([18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69])