In [83]:
import pandas as pd
import json
import requests

In [84]:
# Read the raw survey export into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/raw/full_data.csv')

In [85]:
# Keep only the three columns used downstream and give them display names.
headers = ['Country', 'Job Title', 'Rural']
data = [df[source_column] for source_column in ('country_code', 'normalized_job_code', 'rural')]
df_processed = pd.concat(data, keys=headers, axis=1)

## Processing Data

In [86]:
# Normalise letter case so differently-cased spellings of a label collapse together.
df_processed['Rural'] = df_processed.Rural.str.capitalize()

In [87]:
# Collapse the free-text area labels into two categories. The replacement is
# applied to the 'Rural' column only, so an identical string that happens to
# appear in 'Country' or 'Job Title' is never rewritten by accident.
df_processed['Rural'] = df_processed['Rural'].replace({
    'Countryside': 'Rural',
    'Country': 'Rural',
    'City': 'Non-rural',
    'Urban': 'Non-rural',
})

In [88]:
# Sanity check: list the distinct area labels left after normalisation.
df_processed['Rural'].unique()

array(['Rural', 'Non-rural'], dtype=object)

## Obtaining API data

In [89]:
def obtain_API(job):
    """
    Look up a job code in the Data at Work API and return the decoded JSON reply.

    Parameters
    ----------
    job : the job identifier interpolated into the request URL; may be
        missing (NaN / None).

    Returns
    -------
    The string 'Not job found' when `job` is missing, otherwise whatever
    `response.json()` decodes from the reply body.
    """

    # Check for a missing code first so that no request is sent (and no reply
    # has to be decoded) for a value that can never match a job.
    if pd.isna(job):
        return 'Not job found'

    # The timeout keeps a single stalled connection from blocking the whole run.
    response = requests.get(f'http://api.dataatwork.org/v1/jobs/{job}', timeout=30)

    return response.json()

In [90]:
# Query the API for the first 13 job codes only. The assignment aligns on the
# index, so 'job_extracted' is NaN for every row outside those 13.
df_processed['job_extracted'] = df_processed['Job Title'].head(13).apply(obtain_API)

In [91]:
def acces_job(job):
    """
    Extract the job title from a value produced by the API lookup.

    Parameters
    ----------
    job : either a dict decoded from the API reply or a placeholder value
        (for example the string 'Not job found').

    Returns
    -------
    `job['title']` when `job` is a dict containing a 'title' key; otherwise
    `job` itself, unchanged.
    """

    # Only a dict can carry a 'title' key. On a string, `in` is a substring
    # test and indexing with 'title' raises TypeError; on a float NaN the `in`
    # test itself raises. Guarding on the type passes both through untouched.
    if isinstance(job, dict) and 'title' in job:
        return job['title']
    return job

In [92]:
# Replace the job code with the title taken from the API reply for the first
# 13 rows. Index alignment on assignment leaves 'Job Title' as NaN elsewhere.
df_processed['Job Title'] = df_processed['job_extracted'].head(13).apply(acces_job)

In [93]:
# The raw API payload is no longer needed once the title has been extracted.
df_API_collected = df_processed.drop(columns=['job_extracted'])

## Data Analysis

In [94]:
# Count rows per (country, area type, job title) and expose the count as a
# regular 'Quantity' column.
df = (
    df_API_collected
    .groupby(['Country', 'Rural', 'Job Title'])['Job Title']
    .count()
    .reset_index(name="Quantity")
)

In [95]:
# Display the per-country / area / job-title counts.
df

Unnamed: 0,Country,Rural,Job Title,Quantity
0,AT,Non-rural,Automatic Data Processing Planner,1
1,AT,Non-rural,Crime Data Specialist,1
2,AT,Non-rural,Data Communications Software Consultant,1
3,AT,Non-rural,Database Architect,1
4,AT,Non-rural,Database Developer,1
5,AT,Non-rural,Geographic Information Systems Database Admini...,1
6,AT,Non-rural,Not job found,4
7,AT,Rural,Data Coordinator,1
8,AT,Rural,Data Entry Specialist,1
9,AT,Rural,Not job found,1


In [102]:
def calc_percentage(quantity, dataframe_column):
    """
    Express `quantity` as a percentage of the sum of `dataframe_column`,
    rounded to one decimal place.
    """
    total = dataframe_column.sum()
    share = quantity / total
    return round(share * 100, 1)

In [103]:
def percentage(df):
    """
    Add a 'Percentage' column giving each row's share of the total 'Quantity'.

    Every row's 'Quantity' is divided by the sum of the whole 'Quantity'
    column, multiplied by 100, rounded to one decimal place and formatted as
    a string with a trailing '%'.

    Parameters
    ----------
    df : DataFrame with a numeric 'Quantity' column. It is modified in place.

    Returns
    -------
    The same DataFrame object, now carrying the 'Percentage' column.
    """

    # The total is the same for every row, so compute it once rather than
    # re-summing the column per row.
    total = df['Quantity'].sum()

    # Building the column from a plain list also works for an empty frame.
    df['Percentage'] = [f'{round(quantity / total * 100, 1)}%' for quantity in df['Quantity']]

    return df

In [104]:
# Adds the 'Percentage' column to `df` itself (the function assigns the column
# on the frame it receives) and displays the result.
percentage(df)

Unnamed: 0,Country,Rural,Job Title,Quantity,Percentage
0,AT,Non-rural,Automatic Data Processing Planner,1,7.7%
1,AT,Non-rural,Crime Data Specialist,1,7.7%
2,AT,Non-rural,Data Communications Software Consultant,1,7.7%
3,AT,Non-rural,Database Architect,1,7.7%
4,AT,Non-rural,Database Developer,1,7.7%
5,AT,Non-rural,Geographic Information Systems Database Admini...,1,7.7%
6,AT,Non-rural,Not job found,4,30.8%
7,AT,Rural,Data Coordinator,1,7.7%
8,AT,Rural,Data Entry Specialist,1,7.7%
9,AT,Rural,Not job found,1,7.7%


In [115]:
# NOTE(review): the execution counts show this cell (115) was run after the
# later cell that reassigns `data` from df_analysed.csv (109). On a
# top-to-bottom run, `data` here is still the list of Series built in the
# column-selection cell near the top of the notebook.
data

Unnamed: 0,Country,Rural,Job Title,Quantity,Percentage
0,AT,Non-rural,Automatic Data Processing Planner,1,20.0%
1,AT,Non-rural,Database Developer,1,20.0%
2,AT,Non-rural,Not job found,1,20.0%
3,AT,Rural,Data Coordinator,1,20.0%
4,AT,Rural,Not job found,1,20.0%


In [109]:
# Load the analysed results from disk (the step that writes this CSV is not
# shown in this notebook).
data = pd.read_csv(filepath_or_buffer='../data/results/df_analysed.csv')

# Ask which country to show.
country = input("Please introduce a country ('all' for complete data): ")

Please introduce a country ('all' for complete data):  AT


In [113]:
# Echo the value captured from the prompt.
country

'AT'

In [117]:
# The prompt offers 'all' as a way to see the complete data, so honour it here;
# an equality filter alone returns an empty frame for that answer.
if country.strip().lower() == 'all':
    country_filter = data.copy()
else:
    country_filter = data.loc[data['Country'] == country]

In [120]:
# Display the rows selected for the requested country.
country_filter

Unnamed: 0,Country,Rural,Job Title,Quantity,Percentage
0,AT,Non-rural,Automatic Data Processing Planner,1,20.0%
1,AT,Non-rural,Database Developer,1,20.0%
2,AT,Non-rural,Not job found,1,20.0%
3,AT,Rural,Data Coordinator,1,20.0%
4,AT,Rural,Not job found,1,20.0%
