In [13]:
import pandas as pd
import os

from ddf_utils.index import create_index_file
from ddf_utils.str import to_concept_id

In [2]:
# Location of the pivoted source table (one row per geo, one column per year).
source = (
    '../source/gdp_per_capita_cppp--by--geo--year--pivoted_datapoints.csv'
)

In [4]:
# Load the pivoted source table; the file is decoded as latin1.
data = pd.read_csv(filepath_or_buffer=source, encoding='latin1')

In [5]:
# Peek at the first five rows of the wide (geo x year) table.
data.head(n=5)

Unnamed: 0,geo,1800,1801,1802,1803,1804,1805,1806,1807,1808,...,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040
0,Abkhazia,,,,,,,,,,...,,,,,,,,,,
1,Afghanistan,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0,...,3200.0,3294.0,3390.0,3487.0,3586.0,3686.0,3788.0,3891.0,3995.0,4101.0
2,Akrotiri and Dhekelia,,,,,,,,,,...,,,,,,,,,,
3,Albania,667.0,667.0,668.0,668.0,668.0,668.0,668.0,668.0,668.0,...,21947.0,22796.0,23654.0,24521.0,25394.0,26273.0,27155.0,28039.0,28923.0,29806.0
4,Algeria,716.0,716.0,717.0,718.0,719.0,720.0,721.0,722.0,723.0,...,18671.0,19080.0,19501.0,19935.0,20382.0,20843.0,21318.0,21808.0,22313.0,22833.0


In [9]:
# Country entities from the geo entity-domain dataset; used further down to
# translate source names into country ids.
country_entities_csv = '../../../ddf--gapminder--geo_entity_domain/ddf--entities--geo--country.csv'
country = pd.read_csv(country_entities_csv)

In [10]:
# Move `geo` into the index so that only the year columns remain as columns.
data = data.set_index(keys='geo')

In [11]:
# Wide -> long: stack the year columns into rows, then turn the resulting
# (geo, year) index back into ordinary columns.
stacked = data.stack()
data = stacked.reset_index()

In [27]:
# Display name and concept id of the indicator. The id is written out by hand
# rather than derived with to_concept_id(cname).
cname, cid = 'GDP per capita, constant PPP', 'gdp_per_capita_cppp'

In [28]:
# Name the long-format columns: entity, time, then the indicator's concept id.
datapoint_columns = ['geo', 'time', cid]
data.columns = datapoint_columns

In [29]:
# Peek at the first five rows of the long-format datapoints.
data.head(n=5)

Unnamed: 0,geo,time,gdp_per_capita_cppp
0,afg,1800,603.0
1,afg,1801,603.0
2,afg,1802,603.0
3,afg,1803,603.0
4,afg,1804,603.0


In [17]:
# Distinct geo values that actually occur in the datapoints.
geo_upper = data['geo'].unique()

In [18]:
# Build the lookup from source geo name to country id.
mapping = {}

for geo_name in geo_upper:
    # A country row matches when the source name equals either its `name`
    # or its `gapminder_list` value.
    matched_ids = country.loc[
        (country['name'] == geo_name) | (country['gapminder_list'] == geo_name),
        'country',
    ]
    if matched_ids.empty:
        # Report the miss and leave this name out of the lookup.
        print('not found: ', geo_name)
        continue
    # When several rows match, the first one is used.
    mapping[geo_name] = matched_ids.values[0]

In [19]:
# Display the resulting source-name -> country-id lookup for inspection.
mapping

{'Afghanistan': 'afg',
 'Albania': 'alb',
 'Algeria': 'dza',
 'Andorra': 'and',
 'Angola': 'ago',
 'Antigua and Barbuda': 'atg',
 'Argentina': 'arg',
 'Armenia': 'arm',
 'Aruba': 'abw',
 'Australia': 'aus',
 'Austria': 'aut',
 'Azerbaijan': 'aze',
 'Bahamas': 'bhs',
 'Bahrain': 'bhr',
 'Bangladesh': 'bgd',
 'Barbados': 'brb',
 'Belarus': 'blr',
 'Belgium': 'bel',
 'Belize': 'blz',
 'Benin': 'ben',
 'Bermuda': 'bmu',
 'Bhutan': 'btn',
 'Bolivia': 'bol',
 'Bosnia and Herzegovina': 'bih',
 'Botswana': 'bwa',
 'Brazil': 'bra',
 'Brunei': 'brn',
 'Bulgaria': 'bgr',
 'Burkina Faso': 'bfa',
 'Burundi': 'bdi',
 'Cambodia': 'khm',
 'Cameroon': 'cmr',
 'Canada': 'can',
 'Cape Verde': 'cpv',
 'Cayman Islands': 'cym',
 'Central African Republic': 'caf',
 'Chad': 'tcd',
 'Chile': 'chl',
 'China': 'chn',
 'Colombia': 'col',
 'Comoros': 'com',
 'Congo, Dem. Rep.': 'cod',
 'Congo, Rep.': 'cog',
 'Costa Rica': 'cri',
 "Cote d'Ivoire": 'civ',
 'Croatia': 'hrv',
 'Cuba': 'cub',
 'Cyprus': 'cyp',
 'Czech 

In [20]:
# Replace each source name in `geo` with its country id from `mapping`.
#
# Names that the lookup loop reported as "not found" have no entry in
# `mapping`. Check for them up front so the failure names every offender at
# once, instead of surfacing as a bare KeyError on the first one met while
# mapping row by row. The same check also catches an accidental re-run of this
# cell, when `geo` already holds ids rather than names.
unmapped = [g for g in data['geo'].unique() if g not in mapping]
if unmapped:
    raise KeyError('no country id in mapping for: {}'.format(unmapped))

# Series.map accepts the dict directly; no per-row lambda is needed.
data['geo'] = data['geo'].map(mapping)

In [21]:
# Inspect the last five rows after the id substitution.
data.tail(n=5)

Unnamed: 0,geo,time,gdp_per_capita_constant_ppp
48709,ssd,2037,4419.0
48710,ssd,2038,4512.0
48711,ssd,2039,4610.0
48712,ssd,2040,4711.0
48713,sxm,2011,36327.0


In [23]:
# Sanity check: True would mean some geo value is missing.
data['geo'].hasnans

False

In [30]:
# Write the datapoints file; the file name embeds the indicator's concept id.
# Floats are formatted with up to 15 significant digits and the row index is
# not written.
datapoints_path = '../../ddf--datapoints--{}--by--geo--time.csv'.format(cid)
data.to_csv(datapoints_path, index=False, float_format='%.15g')

In [40]:
# Concept table, one (id, display name) pair per concept. The row order matters:
# later cells address rows by their positional label.
concept_rows = [
    ('name', 'Name'),
    ('time', 'Time'),
    (cid, cname),
    ('domain', 'Domain'),
    ('geo', 'Country'),
    ('indicator_url', 'Indicator URL'),
]
ids, concepts = (list(column) for column in zip(*concept_rows))

cdf = pd.DataFrame({'concept': ids, 'name': concepts})

In [41]:
# Every concept is a plain string unless overridden below.
cdf['concept_type'] = 'string'

# Overrides by row label, following the order in which the concepts were
# listed: row 1 is 'time', row 2 is the indicator, row 4 is 'geo'.
cdf.loc[[1, 2, 4], 'concept_type'] = ['time', 'measure', 'entity_domain']

In [42]:
# Attach the source repository URL to the indicator row (label 2); this adds
# the `indicator_url` column, which is left empty for every other concept.
indicator_repo_url = 'https://github.com/open-numbers/ddf--gapminder--gdp_per_capita_cppp'
cdf.loc[2, 'indicator_url'] = indicator_repo_url

In [43]:
# Write the concept table to the dataset root, without the row index.
concepts_path = '../../ddf--concepts.csv'
cdf.to_csv(concepts_path, index=False)

In [33]:
# Start the geo entity table from the name -> id lookup. With a dict of scalars
# and a single index label this presumably yields a one-row frame with one
# column per source name (row label 0 holding the ids) -- the following cells
# transpose it and name the two resulting columns.
# NOTE(review): column order here is decided by from_records, not by this
# cell; confirm before swapping in a different constructor.
geo_df = pd.DataFrame.from_records(mapping, index=[0])

In [34]:
# Flip rows and columns so that each former column label becomes a row label.
geo_df = geo_df.transpose()

In [35]:
# Turn the index into a regular column (kept, not dropped).
geo_df = geo_df.reset_index(drop=False)

In [36]:
# Label the two columns of the entity table.
entity_columns = ['name', 'geo']
geo_df.columns = entity_columns

In [37]:
# Write the geo entity table to the dataset root, without the row index.
entities_path = '../../ddf--entities--geo.csv'
geo_df.to_csv(entities_path, index=False)

In [44]:
# Call the ddf_utils index helper on the dataset root; the call stays the last
# expression so its return value is displayed as the cell output.
ddf_root = '../../'
create_index_file(ddf_root)

Unnamed: 0,key,value,file
0,concept,name,ddf--concepts.csv
1,concept,concept_type,ddf--concepts.csv
2,concept,indicator_url,ddf--concepts.csv
3,"geo,time",gdp_per_capita_cppp,ddf--datapoints--gdp_per_capita_cppp--by--geo-...
4,geo,name,ddf--entities--geo.csv


In [45]:
# Shell escape: run the external `validate-ddf` command on the dataset root.
!validate-ddf ../../

[
{}]

