# Merging the Data

### Reading in Data and Preparing to Merge

In [1]:
import pandas as pd
# Load the three source tables. The WEO read treats '--' as a missing value.
WBdf = pd.read_csv('worldbank_data_final.csv')
OECDdf = pd.read_csv('OECD.csv')
WEOdf = pd.read_csv('weo_merged.csv', na_values='--')

# The leading World Bank column is a leftover index from an earlier export,
# so it carries no information and is removed.
WBdf = WBdf.drop(columns=WBdf.columns[0])

# Capitalise the key columns so their names match exactly across tables.
WBdf = WBdf.rename(columns={'country': 'Country', 'year': 'Year'})

### Merging the Data

In [2]:
# Both joins share the same (Country, Year) key. An outer join keeps every
# country-year present in either table, leaving NaN where a source has no data.
join_keys = ['Country', 'Year']
merge1 = WBdf.merge(OECDdf, how='outer', on=join_keys)
merged = merge1.merge(WEOdf, how='outer', on=join_keys)

merged.head()

Unnamed: 0,Country,Year,gdp_percap,agripercent_gdp,agg_empl_agri_perc,rural_pop_tot,totalpop,mobilesub_per100peeps,intl_tourist_arrival,total_life_exp,...,PPPEX,PPPSH,FLIBOR6,TM_RPCH,TX_RPCH,LUR,LE,LP,GGXONLB_NGDP,NGDP_FY
0,Albania,2018,5079.40112,18.423884,37.997002,39.681,2866376.0,94.176998,5340000.0,71.444569,...,,,,,,,,,,
1,Albania,2017,4865.209546,19.014329,38.203999,40.617,2873457.0,125.710352,4643000.0,78.333,...,,,,,,,,,,
2,Albania,2016,4681.840039,19.849976,39.786999,41.579,2876101.0,116.744444,4070000.0,78.194,...,43.29,0.028,,8.489,10.389,15.2,,2.876,0.7,1473.23
3,Albania,2015,4524.386108,19.780225,41.362999,42.566,2880703.0,117.659218,3784000.0,78.025,...,43.936,0.028,,0.154,5.284,17.1,,2.881,-1.363,1431.14
4,Albania,2014,4413.309627,19.990154,42.889,43.577,2889104.0,115.997935,3341000.0,77.813,...,44.157,0.028,,5.374,3.295,17.5,,2.889,-2.591,1392.41


### Inspecting the Data

In [3]:
# Inspection notes:
# - FLIBOR6 and LE are missing too many values to be useful, so they are
#   dropped in the next cell.
# - The remaining columns are kept for now. TODO: consider restricting the
#   years/countries, or working with the World Bank data only.
# - Some numeric columns were read as object instead of float because values
#   >= 1000 contain a thousands-separator comma; those need converting too.
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11615 entries, 0 to 11614
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Country                11615 non-null  object 
 1   Year                   11615 non-null  int64  
 2   gdp_percap             10030 non-null  float64
 3   agripercent_gdp        10030 non-null  float64
 4   agg_empl_agri_perc     10030 non-null  float64
 5   rural_pop_tot          10030 non-null  float64
 6   totalpop               10030 non-null  float64
 7   mobilesub_per100peeps  10030 non-null  float64
 8   intl_tourist_arrival   10030 non-null  float64
 9   total_life_exp         10030 non-null  float64
 10  life_exp_fe            10030 non-null  float64
 11  life_exp_male          10030 non-null  float64
 12  GDP_per_unit_CO2       3706 non-null   float64
 13  NGDP_R                 6399 non-null   object 
 14  NGDP_RPCH              6370 non-null   float64
 15  PP

In [4]:
# Drop FLIBOR6 and LE (flagged during inspection as mostly missing).
merged2 = merged.drop(columns=['FLIBOR6', 'LE'])

# Columns flagged during inspection as numbers stored as text with ','
# thousands separators. Strip the commas and convert to float.
# regex=False makes ',' a literal match regardless of the pandas default.
comma_formatted_cols = ['NGDP_R', 'PPPEX', 'LP', 'NGDP_FY']
for col in comma_formatted_cols:
    merged2[col] = merged2[col].str.replace(',', '', regex=False).astype(float)

merged2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11615 entries, 0 to 11614
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Country                11615 non-null  object 
 1   Year                   11615 non-null  int64  
 2   gdp_percap             10030 non-null  float64
 3   agripercent_gdp        10030 non-null  float64
 4   agg_empl_agri_perc     10030 non-null  float64
 5   rural_pop_tot          10030 non-null  float64
 6   totalpop               10030 non-null  float64
 7   mobilesub_per100peeps  10030 non-null  float64
 8   intl_tourist_arrival   10030 non-null  float64
 9   total_life_exp         10030 non-null  float64
 10  life_exp_fe            10030 non-null  float64
 11  life_exp_male          10030 non-null  float64
 12  GDP_per_unit_CO2       3706 non-null   float64
 13  NGDP_R                 6399 non-null   float64
 14  NGDP_RPCH              6370 non-null   float64
 15  PP

In [5]:
# Preview the first rows after dropping columns and converting text to floats.
merged2.head()

Unnamed: 0,Country,Year,gdp_percap,agripercent_gdp,agg_empl_agri_perc,rural_pop_tot,totalpop,mobilesub_per100peeps,intl_tourist_arrival,total_life_exp,...,NGDP_R,NGDP_RPCH,PPPEX,PPPSH,TM_RPCH,TX_RPCH,LUR,LP,GGXONLB_NGDP,NGDP_FY
0,Albania,2018,5079.40112,18.423884,37.997002,39.681,2866376.0,94.176998,5340000.0,71.444569,...,,,,,,,,,,
1,Albania,2017,4865.209546,19.014329,38.203999,40.617,2873457.0,125.710352,4643000.0,78.333,...,,,,,,,,,,
2,Albania,2016,4681.840039,19.849976,39.786999,41.579,2876101.0,116.744444,4070000.0,78.194,...,760.426,3.347,43.29,0.028,8.489,10.389,15.2,2.876,0.7,1473.23
3,Albania,2015,4524.386108,19.780225,41.362999,42.566,2880703.0,117.659218,3784000.0,78.025,...,735.802,2.207,43.936,0.028,0.154,5.284,17.1,2.881,-1.363,1431.14
4,Albania,2014,4413.309627,19.990154,42.889,43.577,2889104.0,115.997935,3341000.0,77.813,...,719.913,1.784,44.157,0.028,5.374,3.295,17.5,2.889,-2.591,1392.41


### Writing to CSV

In [6]:
# Save the cleaned merge. index=False keeps the row index out of the file,
# so reading it back does not produce a stray leading index column.
merged2.to_csv('merged.csv', index=False)