# Merging the Data

### Reading in Data and Preparing to Merge

In [1]:
import pandas as pd
# Load the three source tables. In the WEO extract the string '--' is
# treated as a missing value.
OECDdf = pd.read_csv('OECD.csv')
WBdf = (
    pd.read_csv('worldbank_data_final.csv')
    # The leading column is a leftover index from an earlier save; discard it.
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
    # Capitalise the key columns so their names are an exact match with
    # the 'Country' / 'Year' columns used in the merge below.
    .rename(columns={'country': 'Country', 'year': 'Year'})
)
WEOdf = pd.read_csv('weo_merged.csv', na_values='--')

In [2]:
# Map WEO country spellings to the replacement names below
# (presumably the spellings used by the other tables, so that the
# Country merge key lines up -- TODO confirm against WBdf / OECDdf).
country_name_fixes = {
    'Hong Kong SAR': 'Hong Kong',
    'Taiwan Province of China': 'Taiwan, Province of China',
    'Czech Republic': 'Czechia',
    'Democratic Republic of the Congo': 'Congo, The Democratic Republic of the',
    'Islamic Republic of Iran': 'Iran, Islamic Republic of',
    'Korea': 'Korea, Republic of',
    'Micronesia': 'Micronesia, Federated States of',
    'Moldova': 'Moldova, Republic of',
    'Russia': 'Russian Federation',
    'Slovak Republic': 'Slovakia',
    'Tanzania': 'Tanzania, United Republic of',
    'Venezuela': 'Venezuela, Bolivarian Republic of',
    'Vietnam': 'Viet Nam',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'Lao P.D.R.': "Lao People's Democratic Republic",
    'Macao SAR': 'Macao',
    'Republic of Congo': 'Congo',
    'São Tomé and Príncipe': 'Sao Tome and Principe',
    'St. Kitts and Nevis': 'Saint Kitts and Nevis',
    'St. Lucia': 'Saint Lucia',
    'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
    'Syria': 'Syrian Arab Republic',
    'The Bahamas': 'Bahamas',
    'The Gambia': 'Gambia',
}

# Apply the renames to the Country column only, then order the rows by
# country and year and renumber the index from zero.
WEOdf = (
    WEOdf
    .replace({"Country": country_name_fixes})
    .sort_values(['Country', 'Year'])
    .reset_index(drop=True)
)

In [3]:
# WEO GDP and population series are removed because the World Bank table
# already supplies those measures.
redundant_weo_columns = ['NGDP_R', 'NGDP_RPCH', 'NGDP_FY', 'LP']

# More readable labels for the remaining WEO series codes.
# NOTE(review): GGXONLB_NGDP is labelled 'Govt_Revenue', but the values shown
# in the merged output are negative for some rows, which revenue would not be.
# The code looks like the WEO primary net lending/borrowing series -- confirm
# against the WEO subject-code list before relying on this label.
readable_weo_labels = {
    'PPPEX': 'PPP_Conv_Rate',
    'PPPSH': 'PPP_Share_GDP',
    'TM_RPCH': 'Imports_PC',
    'TX_RPCH': 'Exports_PC',
    'LUR': 'Unemp_Rate',
    'LE': 'Employment',
    'GGXONLB_NGDP': 'Govt_Revenue',
}

WEOdf = (
    WEOdf
    .drop(columns=redundant_weo_columns)
    .rename(columns=readable_weo_labels)
)

### Merging the Data

In [4]:
# Both joins share the same country/year key. Outer joins keep every
# country-year that appears in any of the three tables; cells with no
# matching source row are left as NaN.
join_keys = ['Country', 'Year']

merge1 = WBdf.merge(OECDdf, how='outer', on=join_keys)
merged = merge1.merge(WEOdf, how='outer', on=join_keys)

merged.head()

Unnamed: 0,Country,Year,gdp_percap,agripercent_gdp,agg_empl_agri_perc,rural_pop_tot,totalpop,mobilesub_per100peeps,intl_tourist_arrival,total_life_exp,...,life_exp_male,GDP_per_unit_CO2,PPP_Conv_Rate,PPP_Share_GDP,FLIBOR6,Imports_PC,Exports_PC,Unemp_Rate,Employment,Govt_Revenue
0,Albania,2018,5079.40112,18.423884,37.997002,39.681,2866376.0,94.176998,5340000.0,71.444569,...,69.361534,7.575459,,,,,,,,
1,Albania,2017,4865.209546,19.014329,38.203999,40.617,2873457.0,125.710352,4643000.0,78.333,...,76.601,7.575459,,,,,,,,
2,Albania,2016,4681.840039,19.849976,39.786999,41.579,2876101.0,116.744444,4070000.0,78.194,...,76.353,8.622021,43.29,0.028,,8.489,10.389,15.2,,0.7
3,Albania,2015,4524.386108,19.780225,41.362999,42.566,2880703.0,117.659218,3784000.0,78.025,...,76.066,8.013206,43.936,0.028,,0.154,5.284,17.1,,-1.363
4,Albania,2014,4413.309627,19.990154,42.889,43.577,2889104.0,115.997935,3341000.0,77.813,...,75.734,7.27187,44.157,0.028,,5.374,3.295,17.5,,-2.591


### Inspecting the Data

In [5]:
# Column-by-column summary of the merged frame: non-null counts and dtypes.
#
# Plan based on this summary:
# - Drop FLIBOR6 and Employment, since they are missing so many values.
# - Leave the remaining columns for now, but we should probably limit
#   the years/countries or only work with the WB data.
# - Fix the columns that were read as object instead of float
#   because numbers >= 1000 contain a comma as thousands separator.
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10959 entries, 0 to 10958
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Country                10959 non-null  object 
 1   Year                   10959 non-null  int64  
 2   gdp_percap             10030 non-null  float64
 3   agripercent_gdp        10030 non-null  float64
 4   agg_empl_agri_perc     10030 non-null  float64
 5   rural_pop_tot          10030 non-null  float64
 6   totalpop               10030 non-null  float64
 7   mobilesub_per100peeps  10030 non-null  float64
 8   intl_tourist_arrival   10030 non-null  float64
 9   total_life_exp         10030 non-null  float64
 10  life_exp_fe            10030 non-null  float64
 11  life_exp_male          10030 non-null  float64
 12  GDP_per_unit_CO2       3706 non-null   float64
 13  PPP_Conv_Rate          6301 non-null   object 
 14  PPP_Share_GDP          6300 non-null   float64
 15  FL

In [6]:
# FLIBOR6 and Employment (WEO code LE) are removed because they are mostly
# missing. PPP_Conv_Rate was read as text because of comma thousands
# separators, so the commas are stripped before converting it to float.
merged2 = (
    merged
    .drop(columns=['FLIBOR6', 'Employment'])
    .assign(
        PPP_Conv_Rate=lambda frame: (
            frame['PPP_Conv_Rate'].str.replace(',', '').astype(float)
        )
    )
)

merged2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10959 entries, 0 to 10958
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Country                10959 non-null  object 
 1   Year                   10959 non-null  int64  
 2   gdp_percap             10030 non-null  float64
 3   agripercent_gdp        10030 non-null  float64
 4   agg_empl_agri_perc     10030 non-null  float64
 5   rural_pop_tot          10030 non-null  float64
 6   totalpop               10030 non-null  float64
 7   mobilesub_per100peeps  10030 non-null  float64
 8   intl_tourist_arrival   10030 non-null  float64
 9   total_life_exp         10030 non-null  float64
 10  life_exp_fe            10030 non-null  float64
 11  life_exp_male          10030 non-null  float64
 12  GDP_per_unit_CO2       3706 non-null   float64
 13  PPP_Conv_Rate          6301 non-null   float64
 14  PPP_Share_GDP          6300 non-null   float64
 15  Im

In [7]:
# Preview the first rows after dropping the sparse columns and converting
# PPP_Conv_Rate to float.
merged2.head()

Unnamed: 0,Country,Year,gdp_percap,agripercent_gdp,agg_empl_agri_perc,rural_pop_tot,totalpop,mobilesub_per100peeps,intl_tourist_arrival,total_life_exp,life_exp_fe,life_exp_male,GDP_per_unit_CO2,PPP_Conv_Rate,PPP_Share_GDP,Imports_PC,Exports_PC,Unemp_Rate,Govt_Revenue
0,Albania,2018,5079.40112,18.423884,37.997002,39.681,2866376.0,94.176998,5340000.0,71.444569,73.735069,69.361534,7.575459,,,,,,
1,Albania,2017,4865.209546,19.014329,38.203999,40.617,2873457.0,125.710352,4643000.0,78.333,80.148,76.601,7.575459,,,,,,
2,Albania,2016,4681.840039,19.849976,39.786999,41.579,2876101.0,116.744444,4070000.0,78.194,80.134,76.353,8.622021,43.29,0.028,8.489,10.389,15.2,0.7
3,Albania,2015,4524.386108,19.780225,41.362999,42.566,2880703.0,117.659218,3784000.0,78.025,80.107,76.066,8.013206,43.936,0.028,0.154,5.284,17.1,-1.363
4,Albania,2014,4413.309627,19.990154,42.889,43.577,2889104.0,115.997935,3341000.0,77.813,80.045,75.734,7.27187,44.157,0.028,5.374,3.295,17.5,-2.591


In [8]:
# Restrict the data to 1990-2016 (both ends inclusive) so that the years
# match what we have in the three datasets.
years = merged2['Year']
outside_window = (years > 2016) | (years < 1990)

# Keep only the rows inside the window, then order by country and year and
# renumber the index from zero.
merged3 = (
    merged2
    .loc[~outside_window]
    .sort_values(['Country', 'Year'])
    .reset_index(drop=True)
)
merged3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5265 entries, 0 to 5264
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Country                5265 non-null   object 
 1   Year                   5265 non-null   int64  
 2   gdp_percap             4590 non-null   float64
 3   agripercent_gdp        4590 non-null   float64
 4   agg_empl_agri_perc     4590 non-null   float64
 5   rural_pop_tot          4590 non-null   float64
 6   totalpop               4590 non-null   float64
 7   mobilesub_per100peeps  4590 non-null   float64
 8   intl_tourist_arrival   4590 non-null   float64
 9   total_life_exp         4590 non-null   float64
 10  life_exp_fe            4590 non-null   float64
 11  life_exp_male          4590 non-null   float64
 12  GDP_per_unit_CO2       3456 non-null   float64
 13  PPP_Conv_Rate          4928 non-null   float64
 14  PPP_Share_GDP          4842 non-null   float64
 15  Impo

### Writing to CSV

In [9]:
# Save the year-limited merged table to 'merged.csv'; index=False leaves the
# row index out of the file.
merged3.to_csv('merged.csv', index=False)