In [2]:
from helper_functions import *
import pandas as pd
import numpy as np

### Make a list of all countries in the dataset

In [3]:
occ_list = import_occupations("data/occupations_extracted.csv")
nids = []
country_pairs = []
for occupation in occ_list:
        name = occupation[0]
        df = pd.read_csv(f'data/csv_clean/{name}.csv')
        for index, row in df.iterrows():
            nationality = row['nationality']
            nid = row['nationalityID']
            if nid not in nids:
                pair = [nid, nationality]
                nids.append(nid)
                country_pairs.append(pair)

df = pd.DataFrame(country_pairs)
df.to_csv('data/country_list.csv', index=False, header=False)


### Annotate all countries with their corresponding continent

First, generate the list of continents associated with a country and add them all into one dataframe. Then associate those, that are not associated with a continent, with the unknown symbol '?'. Then find those, that have been added with multiple continents due to the hierarchical structure of data in wikidata and reasses by getting only their original countries continent (by adjusting their query).

In [4]:
country_list = pd.read_csv('data/country_list.csv', header=None)
for index, row in country_list.iterrows():
    cid = row[0]
    if cid != '?':
        results = get_continent(cid)
        continents = [result['continent']['value'] for result in results['results']['bindings']]
        country_list.at[index, 'continents'] = ','.join(continents) if continents else '?'

        #for i, result in enumerate(results['results']['bindings']):
        #    country_list.at[index,'continent'+str(i)] = result['continent']['value']
country_list.to_csv('data/country_list_continents.csv', index=False, header=False)

country_list = pd.read_csv('data/country_list_continents.csv', header=None)
print("Multiple continents associated: ")
for index, row in country_list.iterrows():
    if not isinstance(row[2], str):
        country_list.at[index, 2] = '?'
country_list.to_csv('data/country_list_continents.csv', index=False, header=False)
country_list = pd.read_csv('data/country_list_continents.csv', header=None)
for index, row in country_list.iterrows():
    if isinstance(row[3], str):
        results = get_continent(row[0], original=True)
        if len(results['results']['bindings']) == 1 :
            country_list.at[index, 2] = results['results']['bindings'][0]['continent']['value']
            for i in range(3,10):
                country_list.at[index, i] = np.nan
country_list.to_csv('data/country_list_continents.csv', index=False, header=False)

Multiple continents associated: 


KeyError: 3

Find all countries that are still associated with more than one continent and manually fix them.

In [4]:
country_list = pd.read_csv('data/country_list_continents.csv', header=None)
for index, row in country_list.iterrows():
    if isinstance(row[3], str):
        print(row[0], row[1])

Q15180 Soviet Union
Q804 Panama
Q34266 Russian Empire
Q664 New Zealand
Q730 Suriname
Q2002279 Portuguese Guinea
Q702 Federated States of Micronesia
Q139319 Russian Republic
Q12544 Byzantine Empire
Q12560 Ottoman Empire
Q685 Solomon Islands
Q23681 Northern Cyprus
Q217196 Crown of Castile
Q710 Kiribati
Q695 Palau
Q83891 Sasanian Empire
Q160307 Fatimid Caliphate
Q672 Tuvalu
Q63135869 Ayyubid Sultanate
Q282428 Mamluk Sultanate
Q8575586 Umayyad Caliphate
Q1275158 Elymais
Q389688 Achaemenid Empire
Q12490507 Rashidun Caliphate
Q41137 Assyrian Empire
Q180114 Ayyubid dynasty
Q170468 United Arab Republic
Q19083 Kingdom of Iberia
Q80702 Spanish Empire
Q8680 British Empire
Q200464 Portuguese Empire
Q210718 Asia
Q186513 Hispania
Q4304392 Russian State
Q1051600 Duchy of Naples
Q210551 Viceroyalty of the Río de la Plata
Q10416611 Vandal Kingdom
Q199688 Almohad Caliphate
Q766543 Hispanic Monarchy


The automatic preprocessing is followed by manual manipulation of the country file to make it as clean as possible and associate all countries with their continent. This yields the final ``country_list_continents.csv`` file.

### Make containers for age range
1. Get all the birthyears from the dataset

In [6]:
occ_list = import_occupations("data/occupations_updated.csv")
amount_containers = 5

2. clean the birthyears into being numbers (swapping BC with a negative value) and make a list of all years.

In [7]:
birthyears = []
for occupation in occ_list:
    id = occupation[1]
    name = occupation[0]
    occ_file = 'data/dataframes_cleaned/' + name + '.csv'
    df = pd.read_csv(occ_file)
    for index,row in df.iterrows():
        birthyear = row['birth']
        if isinstance(birthyear, str) and 'BC' in birthyear:
            birthyears.append(-int(birthyear.split(' ')[0]))
        elif birthyear == '?':
            pass
        else:
            birthyears.append(int(birthyear))

3. Sort the birthyear list.

In [8]:
birthyears.sort()

4. Partition the list into the given amount of containers, so that years that are represented more ofte, gain a finer grained representation.

In [9]:
amount_in_container = int((len(birthyears) / amount_containers)+1)
containers = []
for i in range(amount_containers - 1):
    c_value = birthyears[(i+1)*amount_in_container]
    print("{i} : {value}".format(i=i, value=c_value))
    containers.append(c_value)

0 : 1875
1 : 1925
2 : 1951
3 : 1970


In [17]:
pd.DataFrame(containers).to_csv('data/age_containers' + str(amount_containers) + '.csv', header=None, index=None)