In [1]:
# The link below has a CSV with all World Bank (WB) indicators
# http://databank.worldbank.org/data/download/WDI_csv.zip

In [2]:
import pandas as pd
import numpy as np
import string
import os

In [3]:
# Load the list of actual countries, i.e. no aggregates (such as EU or High Income Countries).
# Later cells use its 'Code' column to filter the data and its 'Region' column to build ids.
countries = pd.read_csv('wb-countries.csv')

In [4]:
# Later cells rely on the 'Indicator Code', 'Country Code' and 'Country Name' columns
# of this table; year columns are presumably named by their digits -- TODO confirm.
df_full = pd.read_csv('wb-data.csv')  # Read the large data file

In [5]:
# Separate the large table into smaller tables, one per indicator, keyed by
# the 'Indicator Code'. Each table keeps only the rows whose 'Country Code'
# appears in the country list; the index is left as it is (it is NOT replaced
# by the country code).
country_codes = countries['Code']  # Loop-invariant, so look it up only once
dfs = {
    indicator_code: group[group['Country Code'].isin(country_codes)]
    for indicator_code, group in df_full.groupby('Indicator Code')
}
    

In [6]:
# Number of per-indicator tables; also reused below as a progress countdown.
n_indicators = len(dfs)
n_indicators

1580

In [7]:
# Export every sufficiently populated indicator as one CSV per year:
#   wb-datasets/wb-<Indicator Code>/<year>.csv   with columns `id`, `weight`
# `id` is '<Region>/<Country Name>' with spaces and punctuation removed and
# `weight` is the absolute value of the measurement (zero weights are dropped).

# A table is exported only if it has more non-empty cells than this.
# NOTE(review): presumably 217 countries x 4 always-filled metadata columns
# plus at least 1000 actual measurements -- confirm against the input schema.
MIN_NON_NULL_CELLS = 217*4 + 1000

# Translation table deleting every ASCII punctuation character. This replaces
# the former regex-based `str.replace`, which silently becomes a literal
# (never matching) replacement on pandas >= 2.0, where `regex` defaults to False.
strip_punctuation = str.maketrans('', '', string.punctuation)

# Restart the progress countdown so that re-running this cell stays correct.
n_indicators = len(dfs)

# Iterate over all indicators (tables)
for dataset_id, df in dfs.items():
    n_indicators -= 1
    # Count the non-empty cells and continue only if there is a minimum number of measurements
    if df.notnull().sum(axis=0).sum() <= MIN_NON_NULL_CELLS:
        continue
    print(str(n_indicators) + '    ', end='\r', flush=True)

    # Remove columns (years) with no data. Deliberately not in place: `df` is a
    # filtered slice stored in `dfs`, and mutating it would alter that entry
    # and trigger SettingWithCopyWarning.
    df = df.dropna(axis=1, how='all')

    # Add the Region column to make a complete id of the form Region/Country
    df = df.merge(countries[['Code', 'Region']], left_on='Country Code', right_on='Code')
    df['Country Name'] = df['Country Name'].str.translate(strip_punctuation)  # Remove punctuation from name
    df['id'] = df['Region'].str.replace(' ', '') + '/' + df['Country Name'].str.replace(' ', '')

    # Not all columns represent years; keep those whose name is all digits
    year_columns = [col for col in df.columns if col.isdigit()]

    out_dir = 'wb-datasets/wb-' + dataset_id
    if year_columns:
        # Create the directory once per indicator, and only if something is written
        os.makedirs(out_dir, exist_ok=True)

    # Iterate over years
    for col in year_columns:
        weights = (
            df[['id', col]]
            .rename(columns={col: 'weight'})
            # We can't have negative weights; zero weights become NaN so they are dropped too
            .assign(weight=lambda table: table['weight'].abs().replace({0: np.nan}))
            .dropna(axis=0, how='any')
        )
        weights.to_csv(out_dir + '/' + col + '.csv', index=False)
        

0       