<a href="https://colab.research.google.com/github/AnhQuocVo/ML-for-Research-Paper/blob/main/Get_Data_API_WDI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


World Bank Data360 API Data Fetcher
Fetches multiple indicators from the World Bank Data360 API

Required libraries:
pip install requests pandas

Usage:
1. Run the script to fetch all specified indicators
2. Data will be saved to CSV files
3. Combined data will be saved to 'combined_wdi_data.csv'


In [10]:
import requests
import pandas as pd
import time
from typing import List, Dict, Optional

In [11]:
class Data360APIClient:
    """Client for interacting with World Bank Data360 API"""

    BASE_URL = "https://data360api.worldbank.org"

    def __init__(self):
        self.session = requests.Session()

    def get_data(self,
                 database_id: str,
                 indicator: str,
                 ref_area: Optional[str] = None,
                 time_period_from: Optional[str] = None,
                 time_period_to: Optional[str] = None,
                 skip: int = 0) -> Dict:
        """
        Fetch data from the Data360 API

        Args:
            database_id: Database identifier (e.g., 'WB_WDI')
            indicator: Indicator ID (e.g., 'WB_WDI_SP_URB_TOTL_IN_ZS')
            ref_area: Country/region code (optional)
            time_period_from: Start year (optional)
            time_period_to: End year (optional)
            skip: Number of records to skip for pagination

        Returns:
            Dictionary containing the API response
        """
        endpoint = f"{self.BASE_URL}/data360/data"

        params = {
            'DATABASE_ID': database_id,
            'INDICATOR': indicator,
            'skip': skip
        }

        if ref_area:
            params['REF_AREA'] = ref_area
        if time_period_from:
            params['timePeriodFrom'] = str(time_period_from) # Ensure timePeriodFrom is a string
        if time_period_to:
            params['timePeriodTo'] = str(time_period_to) # Ensure timePeriodTo is a string


        try:
            response = self.session.get(endpoint, params=params)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for {indicator}: {e}")
            return None

    def get_all_data(self,
                     database_id: str,
                     indicator: str,
                     ref_area: Optional[str] = None,
                     time_period_from: Optional[str] = None,
                     time_period_to: Optional[str] = None) -> pd.DataFrame:
        """
        Fetch all data for an indicator (handles pagination)

        Returns:
            DataFrame containing all records
        """
        all_data = []
        skip = 0
        max_records_per_call = 1000

        print(f"Fetching data for indicator: {indicator}")

        while True:
            result = self.get_data(
                database_id=database_id,
                indicator=indicator,
                ref_area=ref_area,
                time_period_from=time_period_from,
                time_period_to=time_period_to,
                skip=skip
            )

            if not result or 'value' not in result:
                break

            records = result['value']
            if not records:
                break

            all_data.extend(records)
            print(f"  Fetched {len(records)} records (total: {len(all_data)})")

            # Check if we've fetched all records
            if len(records) < max_records_per_call:
                break

            skip += max_records_per_call
            time.sleep(0.5)  # Be polite to the API

        if all_data:
            df = pd.DataFrame(all_data)
            print(f"  Total records fetched: {len(df)}")
            return df
        else:
            print(f"  No data found")
            return pd.DataFrame()


def main():
    """Main function to fetch all specified indicators"""

    # Initialize API client
    client = Data360APIClient()

    # Define indicators to fetch
    indicators = [
        # DEPENDENT VARIABLES
        {
            'name': 'GDP (annual % growth)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_NY_GDP_MKTP_KD_ZG'
        },
        {
            'name': 'GDP (constant LCU)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_NY_GDP_MKTP_KN'
        },
        {
            'name': 'GDP per capita (current US$)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_NY_GDP_PCAP_CD'
        },
        {
            'name': 'GDP per capita (annual % growth)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_NY_GDP_PCAP_KD_ZG'
        },
        # INSTITUTIONS INDEX
        {
            "name": "Control of Corruption: Estimate",
            "database_id": "WB_WDI",
            "indicator": "WB_WDI_CC_EST"
        },
        {
            "name": "Government Effectiveness: Estimate",
            "database_id": "WB_WDI",
            "indicator": "WB_WDI_GE_EST"
        },
        {
            "name": "Political Stability and Absence of Violence/Terrorism: Estimate",
            "database_id": "WB_WDI",
            "indicator": "WB_WDI_PV_EST"
        },
        {
            "name": "Regulatory Quality: Estimate",
            "database_id": "WB_WDI",
            "indicator": "WB_WDI_RQ_EST"
        },
        {
            "name": "Rule of Law: Estimate",
            "database_id": "WB_WDI",
            "indicator": "WB_WDI_RL_EST"
        },
        {
            "name": "Voice and Accountability: Estimate",
            "database_id": "WB_WDI",
            "indicator": "WB_WDI_VA_EST"
        },
        # DIGITAL TRANSFORMATION - INVESTMENT
        {
            'name': 'Research and development expenditure (% of GDP)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_GB_XPD_RSDV_GD_ZS'
        },
        {
            'name': 'Government expenditure on education, total (% of GDP)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_SE_XPD_TOTL_GD_ZS'
        },

        # INTELLECTUAL CAPITAL - HUMAN CAPITAL
        {
            'name': 'School enrollment, tertiary (% gross)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_SE_TER_ENRR'
        },
        {
            'name': 'Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_SE_PRM_CUAT_ZS'
        },
        {
            'name': 'Labor force with advanced education (% of total working-age population with advanced education)',
            'database_id': 'WEF_TTDI',
            'indicator': 'WEF_TTDI_LABOUREDUADV'
        },

        {
            'name': 'Individuals with ICT skills in problem solving',
            'database_id': 'ITU_DH',
            'indicator': 'ITU_DH_SKLS_PRB_SOLV'
        },
        {
            'name': 'Individuals with ICT skills in information and data literacy',
            'database_id': 'ITU_DH',
            'indicator': ' ITU_DH_SKLS_INF_DATA'
        },
        # INTELLECTUAL CAPITAL - STRUCTURAL CAPITAL
        {
            'name': 'Patent applications, residents',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_IP_PAT_RESD'
        },
        {
            'name': 'Patent applications, nonresidents',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_IP_PAT_NRES'
        },
        {
            'name': 'Scientific and technical journal articles',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_IP_JRN_ARTC_SC'
        },
        {
            'name': 'High-technology exports (% of manufactured exports)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_TX_VAL_TECH_MF_ZS'
        },
        {
            'name': 'Manufacturing, value added (% of GDP)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_NV_IND_MANF_ZS'
        },

        # INTELLECTUAL CAPITAL - RELATIONAL CAPITAL
        {
            'name': 'Foreign direct investment, net inflows (% of GDP)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_BX_KLT_DINV_WD_GD_ZS'
        },
        {
            'name': 'Foreign direct investment, net inflows (BoP, current US$)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_BX_KLT_DINV_CD_WD'
        },
        {
            'name': 'Trade (% of GDP)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_NE_TRD_GNFS_ZS'
        },
        {
            'name': 'Charges for the use of intellectual property, receipts (BoP, current US$)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_BX_GSR_ROYL_CD'
        },
        {
            'name': 'International tourism, receipts (% of total exports)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_ST_INT_RCPT_XP_ZS'
        },

        # AI ABSORPTIVE CAPACITY - INFRASTRUCTURE
        {
            'name': 'Fixed broadband subscriptions (per 100 people)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_IT_NET_BBND_P2'
        },
        {
            'name': 'Secure Internet servers (per 1 million people)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_IT_NET_SECR_P6'
        },
        {
            'name': 'Individuals using the Internet (% of population)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_IT_NET_USER_ZS'
        },
        {
            'name': 'Mobile cellular subscriptions (per 100 people)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_IT_CEL_SETS_P2'
        },

        # AI ABSORPTIVE CAPACITY - MARKET ADVANTAGE
        {
            'name': 'ICT service exports (% of service exports, BoP)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_BX_GSR_CCIS_ZS'
        },
        {
            'name': 'ICT goods exports (% of total goods exports)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_TX_VAL_ICTG_ZS_UN'
        },
        {
            'name': 'ICT goods imports (% total goods imports)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_TM_VAL_ICTG_ZS_UN'
        },

        # CONTROL VARIABLES
        {
            'name': 'Industry (including construction), value added (% of GDP)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_NV_IND_TOTL_ZS'
        },
        {
            'name': 'Urban population (% of total population)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_SP_URB_TOTL_IN_ZS'
        },
        {
            'name': 'Urban population (% of total population)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_SP_URB_TOTL_IN_ZS'
        },
        {
            'name': 'Population, total',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_SP_POP_TOTL'
        },
        {
            'name': 'Inflation, consumer prices (annual %)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_FP_CPI_TOTL_ZG'
        },
        {
            'name': 'Gross capital formation (% of GDP)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_NE_GDI_TOTL_ZS'
        },
        {
            'name': 'Electric power consumption (kWh per capita)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_EG_USE_ELEC_KH_PC'
        },
        {
            'name': 'Access to electricity (% of population)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_EG_ELC_ACCS_ZS'
        }
      ]

    # Set time period filter to start from 2010
    TIME_PERIOD_FROM = 2009
    TIME_PERIOD_TO = None
    REF_AREA = None

    # Fetch data for each indicator
    all_dataframes = {}

    for ind in indicators:
        print(f"\n{'='*60}")
        print(f"Processing: {ind['name']}")
        print(f"{'='*60}")

        df = client.get_all_data(
            database_id=ind['database_id'],
            indicator=ind['indicator'],
            ref_area=REF_AREA,
            time_period_from=TIME_PERIOD_FROM,
            time_period_to=TIME_PERIOD_TO
        )

        if not df.empty:
            # Define the required columns to keep for merging and the observation value
            required_columns = ['REF_AREA', 'TIME_PERIOD', 'OBS_VALUE']
            # Select only the desired columns if they exist in the DataFrame
            cols_to_select = [col for col in required_columns if col in df.columns]
            df_selected = df[cols_to_select].copy()

            # Rename the 'OBS_VALUE' column to the indicator name if it exists
            if 'OBS_VALUE' in df_selected.columns:
                 df_selected = df_selected.rename(columns={'OBS_VALUE': ind['name']})

            # Save individual indicator data
            filename = f"{ind['indicator']}.csv"
            df_selected.to_csv(filename, index=False)
            print(f"  Saved to: {filename}")

            # Store for combining later
            all_dataframes[ind['name']] = df_selected

        time.sleep(1)  # Be polite to the API

    # Combine all data
    if all_dataframes:
        print(f"\n{'='*60}")
        print("Combining all data...")
        print(f"{'='*60}")

        # Get the first dataframe to use as the base for merging
        first_indicator_name = list(all_dataframes.keys())[0]
        combined_df = all_dataframes[first_indicator_name]

        # Define the merge keys
        merge_keys = ['REF_AREA', 'TIME_PERIOD']

        # Merge other dataframes, only including merge keys and the indicator column
        for name, df in list(all_dataframes.items())[1:]:
            cols_to_merge = merge_keys + [name]

            # Check if all columns to merge exist in the current dataframe
            cols_exist = all(col in df.columns for col in cols_to_merge)

            if cols_exist:
                combined_df = pd.merge(combined_df, df[cols_to_merge], on=merge_keys, how='outer')
            else:
                print(f"  Skipping merge for {name}: Required columns not found in the data.")


        combined_df.to_csv('combined_wdi_data.csv', index=False)
        print(f"Combined data saved to: combined_wdi_data.csv")
        print(f"Total records: {len(combined_df)}")

        # Display summary
        print("\n" + "="*60)
        print("Summary by Indicator:")
        print("="*60)
        for name, df in all_dataframes.items():
            print(f"{name}: {len(df)} records")

    print("\n✓ Data fetching completed!")


if __name__ == "__main__":
    main()


Processing: GDP (annual % growth)
Fetching data for indicator: WB_WDI_NY_GDP_MKTP_KD_ZG
  Fetched 1000 records (total: 1000)
  Fetched 1000 records (total: 2000)
  Fetched 1000 records (total: 3000)
  Fetched 1000 records (total: 4000)
  Fetched 1000 records (total: 5000)
  Fetched 1000 records (total: 6000)
  Fetched 1000 records (total: 7000)
  Fetched 1000 records (total: 8000)
  Fetched 1000 records (total: 9000)
  Fetched 1000 records (total: 10000)
  Fetched 1000 records (total: 11000)
  Fetched 1000 records (total: 12000)
  Fetched 1000 records (total: 13000)
  Fetched 1000 records (total: 14000)
  Fetched 114 records (total: 14114)
  Total records fetched: 14114
  Saved to: WB_WDI_NY_GDP_MKTP_KD_ZG.csv

Processing: GDP (constant LCU)
Fetching data for indicator: WB_WDI_NY_GDP_MKTP_KN
  Fetched 1000 records (total: 1000)
  Fetched 1000 records (total: 2000)
  Fetched 1000 records (total: 3000)
  Fetched 1000 records (total: 4000)
  Fetched 1000 records (total: 5000)
  Fetched 1

In [37]:
df = pd.read_csv("combined_wdi_data.csv")

# Rename columns for clarity and analysis
rename_dict = {
    'REF_AREA': 'country_code',
    'TIME_PERIOD': 'year',
    'GDP (annual % growth)': 'gdp_growth',
    'GDP (constant LCU)': 'gdp',
    'GDP per capita (current US$)': 'gdp_pc',
    'GDP per capita (annual % growth)': 'gdp_pc_growth',
    'Research and development expenditure (% of GDP)': 'rd_exp',
    'Government expenditure on education, total (% of GDP)': 'edu_exp',
    'School enrollment, tertiary (% gross)': 'ter_enr',
    'Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)': 'edu_attain',
    'Labor force with advanced education (% of total working-age population with advanced education)': 'labor_adv',
    'Patent applications, residents': 'patent_res',
    'Patent applications, nonresidents': 'patent_nonres',
    'Scientific and technical journal articles': 'sci_art',
    'High-technology exports (% of manufactured exports)': 'hitech_exp',
    'Manufacturing, value added (% of GDP)': 'manuf_value',
    'Foreign direct investment, net inflows (% of GDP)': 'fdi',
    'Foreign direct investment, net inflows (BoP, current US$)': 'fdi_usd',
    'Trade (% of GDP)': 'trade',
    'Charges for the use of intellectual property, receipts (BoP, current US$)': 'ip_receipts',
    'International tourism, receipts (% of total exports)': 'tourism_rcpt',
    'Fixed broadband subscriptions (per 100 people)': 'broadband',
    'Secure Internet servers (per 1 million people)': 'sec_srv',
    'Individuals using the Internet (% of population)': 'inet_usr',
    'Mobile cellular subscriptions (per 100 people)': 'mob_sub',
    'ICT service exports (% of service exports, BoP)': 'ictexp_serv',
    'ICT goods exports (% of total goods exports)': 'ictexp_goods',
    'ICT goods imports (% total goods imports)': 'ictimp_goods',
    'Industry (including construction), value added (% of GDP)': 'industry_value',
    'Urban population (% of total population)': 'urban_pop',
    'Population, total': 'pop',
    'Inflation, consumer prices (annual %)': 'infl',
    'Gross capital formation (% of GDP)': 'gr_cap',
    'Electric power consumption (kWh per capita)': 'elec_cons',
    'Access to electricity (% of population)': 'elec_access'
}

# Apply renaming to your dataframe
df.rename(columns=rename_dict, inplace=True)

print("\nColumns:")
display(df.columns)


Columns:


Index(['country_code', 'year', 'gdp_growth', 'gdp', 'gdp_pc', 'gdp_pc_growth',
       'Control of Corruption: Estimate', 'Government Effectiveness: Estimate',
       'Political Stability and Absence of Violence/Terrorism: Estimate',
       'Regulatory Quality: Estimate', 'Rule of Law: Estimate',
       'Voice and Accountability: Estimate', 'rd_exp', 'edu_exp', 'ter_enr',
       'edu_attain', 'labor_adv',
       'Individuals with ICT skills in problem solving', 'patent_res',
       'patent_nonres', 'sci_art', 'hitech_exp', 'manuf_value', 'fdi',
       'fdi_usd', 'trade', 'ip_receipts', 'tourism_rcpt', 'broadband',
       'sec_srv', 'inet_usr', 'mob_sub', 'ictexp_serv', 'ictexp_goods',
       'ictimp_goods', 'industry_value', 'urban_pop', 'pop', 'infl', 'gr_cap',
       'elec_cons', 'elec_access'],
      dtype='object')

In [34]:
# Filter years greater than 2010
df = df[df['year'] > 2009].copy()

print("\nDataFrame after filtering by year:")
display(df.head())


DataFrame after filtering by year:


Unnamed: 0,country_code,year,gdp_growth,gdp,gdp_pc,gdp_pc_growth,Control of Corruption: Estimate,Government Effectiveness: Estimate,Political Stability and Absence of Violence/Terrorism: Estimate,Regulatory Quality: Estimate,...,ictexp_serv,ictexp_goods,ictimp_goods,industry_value,urban_pop,pop,infl,gr_cap,elec_cons,elec_access
50,ABW,2010,-2.73346,4484930000.0,24093.1,-2.95695,1.1213,1.24645,1.10085,1.34484,...,1.4252,0.4,4.76,11.5461,43.059,101838.0,2.07814,28.2832,,93.4
51,ABW,2011,3.36924,4636040000.0,25712.4,2.61053,1.10295,1.23643,1.28574,1.31728,...,1.4723,0.42,4.63,12.3664,42.94,102591.0,4.3163,27.4166,,100.0
52,ABW,2012,-1.0408,4587790000.0,25119.7,-2.48465,1.09524,1.24759,1.26388,1.40213,...,1.26181,0.71,4.42,10.0761,42.957,104110.0,0.627472,23.5225,,100.0
53,ABW,2013,6.43148,4882850000.0,25813.6,4.85528,1.1196,1.22108,1.31223,1.42088,...,1.48731,0.41,5.27,10.7497,42.99,105675.0,-2.37207,23.8259,,100.0
54,ABW,2014,-1.58658,4805380000.0,26129.8,-2.62962,1.014,0.887494,1.16457,1.24536,...,1.22531,0.93,5.24,10.7995,43.041,106807.0,0.421441,23.1115,,100.0


## HDI data

In [35]:
# Fetch the data.
hdi1 = pd.read_csv("https://ourworldindata.org/grapher/average-years-of-schooling.csv?v=1&csvType=full&useColumnShortNames=true", storage_options = {'User-Agent': 'Our World In Data data fetch/1.0'})
hdi2 = pd.read_csv("https://ourworldindata.org/grapher/expected-years-of-schooling.csv?v=1&csvType=full&useColumnShortNames=true", storage_options = {'User-Agent': 'Our World In Data data fetch/1.0'})
income_gr = df = pd.read_csv("https://ourworldindata.org/grapher/world-bank-income-groups.csv?v=1&csvType=full&useColumnShortNames=true", storage_options = {'User-Agent': 'Our World In Data data fetch/1.0'})

# Merge hdi1 and hdi2 first
df_hdi_combined = pd.merge(hdi1, hdi2, on=['Code', 'Year'], how='outer')

# Then merge the result with income_gr
df_hdi_combined = pd.merge(df_hdi_combined, income_gr, on=['Code', 'Year'], how='outer')
# Rename columns for consistency before merging with the main dataframe
df_hdi_combined_renamed = df_hdi_combined.rename(columns={
    'Code': 'country_code',
    'Year': 'year',
    'mys__sex_total': 'avg_years_schooling',
    'eys__sex_total': 'expected_years_schooling'
})
df_hdi_combined_renamed
# # # Select only the relevant columns from the combined HDI dataframe
df_hdi_selected = df_hdi_combined_renamed[['country_code', 'year', 'avg_years_schooling', 'expected_years_schooling','Entity','classification']]
df_hdi_selected



Unnamed: 0,country_code,year,avg_years_schooling,expected_years_schooling,Entity,classification
0,ABW,1987,,,Aruba,High-income countries
1,ABW,1988,,,Aruba,High-income countries
2,ABW,1989,,,Aruba,High-income countries
3,ABW,1990,,,Aruba,High-income countries
4,ABW,1991,,,Aruba,Upper-middle-income countries
...,...,...,...,...,...,...
22314,,2023,12.52677,12.148221,,
22315,,2023,12.52677,10.345610,,
22316,,2023,12.52677,,,
22317,,2023,12.52677,15.055472,,


In [38]:
# Merge the combined HDI dataframe with the main combined_df
# Assuming 'combined_df' is available from the previous steps (data fetching and initial combination)

# cols_to_drop = ['avg_years_schooling_x', 'expected_years_schooling_x',
#                 'avg_years_schooling_y', 'expected_years_schooling_y']

# combined_df = combined_df.drop(columns=[col for col in cols_to_drop if col in combined_df.columns])


combined_df = pd.merge(df, df_hdi_selected, on=['country_code', 'year'], how='left')

display(combined_df.head())

Unnamed: 0,country_code,year,gdp_growth,gdp,gdp_pc,gdp_pc_growth,Control of Corruption: Estimate,Government Effectiveness: Estimate,Political Stability and Absence of Violence/Terrorism: Estimate,Regulatory Quality: Estimate,...,urban_pop,pop,infl,gr_cap,elec_cons,elec_access,avg_years_schooling,expected_years_schooling,Entity,classification
0,ABW,1960,,,,,,,,,...,50.776,54922.0,,,,,,,,
1,ABW,1961,,,,,,,,,...,50.761,55578.0,,,,,,,,
2,ABW,1962,,,,,,,,,...,50.746,56320.0,,,,,,,,
3,ABW,1963,,,,,,,,,...,50.73,57002.0,,,,,,,,
4,ABW,1964,,,,,,,,,...,50.715,57619.0,,,,,,,,


In [39]:
# EDA
# Describe the data
print("Descriptive Statistics:")
display(combined_df.describe())

# Check for null values
print("\nNull Values per Column:")
display(combined_df.isnull().sum().sort_values(ascending=False))

# Check data types
print("\nData Types:")
display(combined_df.dtypes)

print("\nColumns:")
display(combined_df.columns)


Descriptive Statistics:


Unnamed: 0,year,gdp_growth,gdp,gdp_pc,gdp_pc_growth,Control of Corruption: Estimate,Government Effectiveness: Estimate,Political Stability and Absence of Violence/Terrorism: Estimate,Regulatory Quality: Estimate,Rule of Law: Estimate,...,ictimp_goods,industry_value,urban_pop,pop,infl,gr_cap,elec_cons,elec_access,avg_years_schooling,expected_years_schooling
count,18109.0,14985.0,12264.0,15412.0,14985.0,5587.0,5563.0,5624.0,5565.0,5677.0,...,5185.0,11931.0,17979.0,18079.0,7146.0,11615.0,6999.0,8471.0,6680.0,6909.0
mean,1993.439837,3.702528,110602700000000.0,9397.798636,2.018712,-0.001615,0.009581,-0.014088,0.014901,0.004093,...,7.400689,27.5041,51.290572,211637000.0,10.891793,23.608135,3573.150164,82.132306,8.107352,12.341322
std,19.375747,6.064293,942096400000000.0,18096.537182,5.876545,0.998391,0.987862,0.979663,0.991667,0.991797,...,6.054451,11.916661,24.811731,700391800.0,71.660505,7.99302,4936.287471,27.877325,3.318938,3.499136
min,1960.0,-64.0471,14539800.0,11.8013,-64.4236,-1.96956,-2.44023,-3.31295,-2.54773,-2.59088,...,0.0,0.0,2.077,2715.0,-16.8597,-15.6784,0.0,0.533899,0.323121,1.94209
25%,1977.0,1.47416,22364620000.0,630.999,-0.153622,-0.767534,-0.728376,-0.637513,-0.697195,-0.765444,...,3.75,19.9226,30.9915,1215785.0,2.05113,19.0397,568.289,72.88275,5.579844,10.3737
50%,1994.0,3.80365,340327500000.0,2145.505,2.16452,-0.22704,-0.120211,0.078406,-0.090558,-0.129784,...,5.47,26.1259,50.08,7140270.0,3.78792,23.1153,1952.02,99.0,8.44,12.377304
75%,2011.0,6.06953,2901378000000.0,9194.505,4.38232,0.695355,0.7291,0.816197,0.780194,0.809141,...,8.98,33.1872,71.09,46967600.0,7.438905,27.3918,4940.65,100.0,10.836622,14.89571
max,2024.0,149.973,1.71232e+16,256581.0,140.491,2.45912,2.46966,1.96421,2.30859,2.12476,...,57.53,97.5223,100.0,8142060000.0,2947.73,76.7823,55085.2,100.0,14.296371,23.24768



Null Values per Column:


Unnamed: 0,0
Individuals with ICT skills in problem solving,17781
labor_adv,16921
edu_attain,15166
rd_exp,14645
hitech_exp,14302
patent_res,13391
sec_srv,13312
tourism_rcpt,13187
ictexp_goods,13120
patent_nonres,13066



Data Types:


Unnamed: 0,0
country_code,object
year,int64
gdp_growth,float64
gdp,float64
gdp_pc,float64
gdp_pc_growth,float64
Control of Corruption: Estimate,float64
Government Effectiveness: Estimate,float64
Political Stability and Absence of Violence/Terrorism: Estimate,float64
Regulatory Quality: Estimate,float64



Columns:


Index(['country_code', 'year', 'gdp_growth', 'gdp', 'gdp_pc', 'gdp_pc_growth',
       'Control of Corruption: Estimate', 'Government Effectiveness: Estimate',
       'Political Stability and Absence of Violence/Terrorism: Estimate',
       'Regulatory Quality: Estimate', 'Rule of Law: Estimate',
       'Voice and Accountability: Estimate', 'rd_exp', 'edu_exp', 'ter_enr',
       'edu_attain', 'labor_adv',
       'Individuals with ICT skills in problem solving', 'patent_res',
       'patent_nonres', 'sci_art', 'hitech_exp', 'manuf_value', 'fdi',
       'fdi_usd', 'trade', 'ip_receipts', 'tourism_rcpt', 'broadband',
       'sec_srv', 'inet_usr', 'mob_sub', 'ictexp_serv', 'ictexp_goods',
       'ictimp_goods', 'industry_value', 'urban_pop', 'pop', 'infl', 'gr_cap',
       'elec_cons', 'elec_access', 'avg_years_schooling',
       'expected_years_schooling', 'Entity', 'classification'],
      dtype='object')

In [40]:
combined_df.to_csv('filtered_wdi_data.csv', index=False)

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt
