In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the WDI extract and discard the bookkeeping columns that are not used below.
data = pd.read_csv(
    "/Users/leoss/Downloads/P_Data_Extract_From_WDI_Database_Archives-1/wdi_data_capstone_2_11.csv"
).drop(columns=['Series Code', 'Version Name', 'Version Code'])

# Reshape from one column per year (wide) to one row per observation (long).
# The year columns are labelled like "1991 [YR1991]".
id_cols = ['Country Name', 'Country Code', 'Series Name']
Values = [f"{year} [YR{year}]" for year in range(1991, 2025)]
data_long = pd.melt(
    data,
    id_vars=id_cols,
    value_vars=Values,
    var_name='Year',
    value_name='Value',
)

# Anything that cannot be parsed as a number becomes NaN, and the year label
# is reduced to its four-digit integer.
data_long = data_long.assign(
    Value=pd.to_numeric(data_long['Value'], errors='coerce'),
    Year=data_long['Year'].str.extract(r'(\d{4})', expand=False).astype(int),
)
final_cols = id_cols + ['Year', 'Value']
df_long = data_long[final_cols]

# Series listed here are excluded from the analysis; edit the list to taste.
series_to_drop = ['Tax revenue (% of GDP)', 'Central government debt, total (% of GDP)']
is_dropped_series = df_long['Series Name'].isin(series_to_drop)
df_cleaned = df_long[~is_dropped_series]

In [None]:
# Economic-complexity rankings: keep only the HS92 index and reshape it to the
# shared (Country Code, Series Name, Year, Value) layout.
columns1 = ["country_id",'growth_proj', 'in_rankings', 'eci_sitc', 'eci_rank_sitc','eci_rank_hs92', 'eci_hs12', 'eci_rank_hs12']
new_order = ['Country Code','Series Name', 'Year', 'Value' ]

trade = (
    pd.read_csv("/Users/leoss/Downloads/growth_proj_eci_rankings.csv")
    .drop(columns=columns1)
    # Every row of this source describes the same indicator.
    .assign(**{"Series Name": "Economic Complexity"})
    .rename(columns={
        "country_iso3_code": "Country Code",
        "Country": "Country Name",
        "year": "Year",
        "eci_hs92": "Value",
    })
)
# Only the four shared columns are carried forward, in this order.
trade = trade[new_order]

In [None]:
# Import the IMF World Economic Outlook export, reading only the columns used below.
ins = ["COUNTRY.ID", "COUNTRY", "INDICATOR", "TIME_PERIOD",
       "OBS_VALUE", "SCALE.ID", "PRIMARY_DOMESTIC_CURRENCY"]
imf = pd.read_csv(
    "/Users/leoss/Downloads/WorldEconomicOutlook-1.csv",
    usecols=ins
)
# read_csv ignores the element order of `usecols`; re-select to get the order of `ins`.
imf = imf[ins]

# Drop every observation without a time period, then rename to the shared schema.
# The previous `imf["TIME_PERIOD"].dropna(inplace=True)` only touched a temporary
# Series and left the frame unchanged; dropping on the frame actually removes the rows.
imf = imf.dropna(subset=["TIME_PERIOD"]).rename(columns={
    "COUNTRY.ID": "Country Code",
    "COUNTRY": "Country Name",
    "TIME_PERIOD": "Year",
    "INDICATOR": "Series Name",
    "OBS_VALUE": "Value"
})

# Apply the scale: SCALE.ID is used as a base-10 exponent on the observed value.
# NOTE(review): if SCALE.ID is parsed as an integer column containing negative
# exponents, NumPy rejects integer-to-negative-power -- confirm exponents are >= 0.
imf = imf.assign(Value=imf["Value"] * 10 ** imf["SCALE.ID"])

# These helper columns are not needed once the value has been scaled.
imf = imf.drop(columns=["SCALE.ID", "PRIMARY_DOMESTIC_CURRENCY"])

# Indicators of interest (names are stripped so stray whitespace cannot break the match).
raw_list = [
    "Gross domestic product (GDP), Per capita, purchasing power parity (PPP) international dollar, ICP benchmarks 2017-2021",
    "Current account balance (credit less debit), US dollar",
    "Revenue, General government, Percent of GDP"
]
variables_to_keep = [s.strip() for s in raw_list]

# Keep the selected indicators within the 1991-2024 window.
imf = imf[imf['Series Name'].isin(variables_to_keep)].query("Year>1990 & Year<2025")

In [None]:
# Country codes present in each of the three sources, and those shared by all of them.
codes_cleaned = set(df_cleaned['Country Code'].unique())
codes_trade = set(trade['Country Code'].unique())
codes_imf = set(imf['Country Code'].unique())
common_codes = codes_cleaned & codes_trade & codes_imf
print(f"Found {len(common_codes)} countries common to all datasets.")


def _only_common_countries(frame):
    """Return an independent copy of `frame` restricted to the shared country codes."""
    in_all_sources = frame['Country Code'].isin(common_codes)
    return frame[in_all_sources].copy()


# Restrict every source to the shared countries.
df_cleaned_filtered = _only_common_countries(df_cleaned)
trade_filtered = _only_common_countries(trade)
imf_filtered = _only_common_countries(imf)

# Stack the three sources, order by country and year, and discard rows without a code.
final_dataset = (
    pd.concat([df_cleaned_filtered, trade_filtered, imf_filtered], ignore_index=True)
    .sort_values(by=['Country Code', 'Year'])
    .dropna(subset=['Country Code'])
)
# Quick check of the result: final_dataset["Series Name"].value_counts()

# After concatenation one Country Code may carry several spellings of the
# country name, or no name at all. The helper below reports both cases. It is
# only defined here, not executed: call
# report_country_name_conflicts(final_dataset) when the check is needed.
def report_country_name_conflicts(dataset):
    """Report country codes whose names are conflicting or entirely missing.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with 'Country Code' and 'Country Name' columns. Rows whose code
        is missing are ignored by the grouping.

    Returns
    -------
    tuple
        ``(conflicts, unnamed)`` where ``conflicts`` maps each code with more
        than one distinct non-missing name to the list of those names (in
        order of first appearance) and ``unnamed`` lists the codes that only
        have missing names.

    A line is printed for every problematic code, followed by summary counts.
    """
    names_by_code = dataset.groupby('Country Code')['Country Name'].unique()

    conflicts = {}
    unnamed = []
    single_name_count = 0

    for code, raw_names in names_by_code.items():
        # unique() keeps NaN as a value of its own; only real names count.
        names = [str(name) for name in raw_names if pd.notna(name)]
        if len(names) > 1:
            print(f"Code:'{code}' maps to multiple names: {names}")
            conflicts[code] = names
        elif not names:
            print(f"Code:'{code}' only has missing (NaN) Country Names associated with it.")
            unnamed.append(code)
        else:
            single_name_count += 1

    print(f"\nTotal codes with one unique name: {single_name_count}")
    print(f"Total codes with no valid name: {len(unnamed)}")
    print(f"Total unique codes checked: {len(names_by_code)}")
    return conflicts, unnamed
# Manual overrides: one canonical country name per code, for the codes whose
# name should not be taken from the data. Entries are sorted by code and each
# code appears exactly once (the literal previously listed 'KAZ' twice).
mapping = {
    'AFG': 'Afghanistan',
    'ARM': 'Armenia',
    'AZE': 'Azerbaijan',
    'BHR': 'Bahrain',
    'BLR': 'Belarus',
    'CHN': 'China',
    'CIV': "Cote d'Ivoire",
    'COD': 'Congo Dem. Rep.',
    'COG': 'Congo',
    'EGY': 'Egypt',
    'EST': 'Estonia',
    'ETH': 'Ethiopia',
    'GNQ': 'Equatorial Guinea',
    'HKG': 'Hong Kong',
    'HRV': 'Croatia',
    'IRN': 'Iran',
    'KAZ': 'Kazakhstan',
    'KOR': 'South Korea',
    'LAO': 'Laos',
    'LTU': 'Lithuania',
    'LVA': 'Latvia',
    'MDA': 'Moldova',
    'MDG': 'Madagascar',
    'MKD': 'North Macedonia',
    'MOZ': 'Mozambique',
    'MRT': 'Mauritania',
    'NLD': 'Netherlands',
    'POL': 'Poland',
    'RUS': 'Russia',
    'SRB': 'Serbia',
    'SVN': 'Slovenia',
    'SWZ': 'Eswatini',
    'TJK': 'Tajikistan',
    'TUR': 'Turkey',
    'TWN': 'Taiwan',
    'UZB': 'Uzbekistan',
    'VEN': 'Venezuela',
    'YEM': 'Yemen',
}

# Overridden codes take the manual name; every other row keeps its current name.
new_names = final_dataset['Country Code'].map(mapping)
final_dataset['Country Name'] = new_names.fillna(final_dataset['Country Name'])

# Build a code -> name lookup from the first row of each code that has a name,
# then use it to fill the rows whose name is still missing.
valid_names = final_dataset.dropna(subset=['Country Name'])
code_to_name_map_df = valid_names.drop_duplicates(subset=['Country Code'])
clean_map_dict = code_to_name_map_df.set_index('Country Code')['Country Name'].to_dict()
filled_names = final_dataset['Country Code'].map(clean_map_dict)
final_dataset['Country Name'] = final_dataset['Country Name'].fillna(filled_names)