<a href="https://colab.research.google.com/github/AnhQuocVo/Algorithmic_Trading_Machine_Learning/blob/main/API_WDI_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
"""
World Bank Data360 API Data Fetcher
Fetches multiple indicators from the World Bank Data360 API

Required libraries:
pip install requests pandas

Usage:
1. Run the script to fetch all specified indicators
2. Data will be saved to CSV files
3. Combined data will be saved to 'combined_wdi_data.csv'
"""

import requests
import pandas as pd
import time
from typing import List, Dict, Optional

class Data360APIClient:
    """Client for interacting with World Bank Data360 API.

    One ``requests.Session`` is created per client and reused for every
    request issued through that client.
    """

    BASE_URL = "https://data360api.worldbank.org"

    # Seconds to wait on the server before giving up. Without an explicit
    # timeout, requests waits indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Page size assumed while paginating with ``skip``: a page shorter than
    # this is treated as the last one.
    MAX_RECORDS_PER_CALL = 1000

    def __init__(self):
        self.session = requests.Session()

    def get_data(self,
                 database_id: str,
                 indicator: str,
                 ref_area: Optional[str] = None,
                 time_period_from: Optional[str] = None,
                 time_period_to: Optional[str] = None,
                 skip: int = 0) -> Optional[Dict]:
        """
        Fetch a single page of data from the Data360 API

        Args:
            database_id: Database identifier (e.g., 'WB_WDI')
            indicator: Indicator ID (e.g., 'WB_WDI_SP_URB_TOTL_IN_ZS')
            ref_area: Country/region code (optional)
            time_period_from: Start year (optional); converted with ``str()``
                before being sent, so an int is accepted as well
            time_period_to: End year (optional); converted with ``str()``
                before being sent, so an int is accepted as well
            skip: Number of records to skip for pagination

        Returns:
            The decoded JSON response, or None when the request fails, the
            server answers with an error status, or the body is not valid
            JSON (the error is printed in each case)
        """
        endpoint = f"{self.BASE_URL}/data360/data"

        params = {
            'DATABASE_ID': database_id,
            'INDICATOR': indicator,
            'skip': skip
        }

        # Optional filters are only sent when a truthy value was supplied.
        if ref_area:
            params['REF_AREA'] = ref_area
        if time_period_from:
            params['timePeriodFrom'] = str(time_period_from)
        if time_period_to:
            params['timePeriodTo'] = str(time_period_to)

        try:
            response = self.session.get(
                endpoint, params=params, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            return response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decoding failures on requests versions
            # whose decode error is not a RequestException subclass.
            print(f"Error fetching data for {indicator}: {e}")
            return None

    def get_all_data(self,
                     database_id: str,
                     indicator: str,
                     ref_area: Optional[str] = None,
                     time_period_from: Optional[str] = None,
                     time_period_to: Optional[str] = None) -> pd.DataFrame:
        """
        Fetch all data for an indicator (handles pagination)

        Pages are requested with an increasing ``skip`` offset until a page
        is missing, empty, or shorter than ``MAX_RECORDS_PER_CALL``. Records
        are read from the ``'value'`` key of each response.

        Returns:
            DataFrame containing all records fetched; an empty DataFrame
            when nothing was fetched. If a request fails after some pages
            were already fetched, the partial data is returned and a warning
            is printed.
        """
        all_data = []
        skip = 0
        page_size = self.MAX_RECORDS_PER_CALL

        print(f"Fetching data for indicator: {indicator}")

        while True:
            result = self.get_data(
                database_id=database_id,
                indicator=indicator,
                ref_area=ref_area,
                time_period_from=time_period_from,
                time_period_to=time_period_to,
                skip=skip
            )

            if result is None and all_data:
                # A failed page after earlier successes means the returned
                # data is truncated; say so instead of stopping silently.
                print(f"  Warning: request failed at offset {skip}; "
                      f"returning {len(all_data)} records fetched so far "
                      f"(data may be incomplete)")

            # Guard against non-dict payloads before looking up 'value'.
            if not isinstance(result, dict) or 'value' not in result:
                break

            records = result['value']
            if not records:
                break

            all_data.extend(records)
            print(f"  Fetched {len(records)} records (total: {len(all_data)})")

            # A short page means there is nothing left to fetch.
            if len(records) < page_size:
                break

            # Advance by what was actually received so the offset stays
            # correct even if the server returns more than page_size rows.
            skip += len(records)
            time.sleep(0.5)  # Be polite to the API

        if all_data:
            df = pd.DataFrame(all_data)
            print(f"  Total records fetched: {len(df)}")
            return df

        print("  No data found")
        return pd.DataFrame()


def _select_indicator_columns(df: pd.DataFrame, value_name: str) -> pd.DataFrame:
    """Keep the merge keys and observation value, renaming the value column to value_name."""
    required_columns = ['REF_AREA', 'TIME_PERIOD', 'OBS_VALUE']
    # Only columns that actually exist are selected, so a response with an
    # unexpected schema does not raise here.
    present = [col for col in required_columns if col in df.columns]
    return df[present].rename(columns={'OBS_VALUE': value_name})


def _combine_indicator_frames(frames: Dict[str, pd.DataFrame]) -> Optional[pd.DataFrame]:
    """Outer-merge the per-indicator frames on REF_AREA/TIME_PERIOD; None if none is usable."""
    merge_keys = ['REF_AREA', 'TIME_PERIOD']
    combined = None

    for name, df in frames.items():
        cols_to_merge = merge_keys + [name]

        # Every frame, including the one that becomes the merge base, must
        # carry both keys and its value column; otherwise pd.merge would
        # raise KeyError on the missing key.
        if not all(col in df.columns for col in cols_to_merge):
            print(f"  Skipping merge for {name}: Required columns not found in the data.")
            continue

        part = df[cols_to_merge]
        if combined is None:
            combined = part
        else:
            # NOTE(review): if a frame holds several rows per
            # (REF_AREA, TIME_PERIOD), this merge multiplies rows. Confirm
            # the API returns one observation per key pair.
            combined = pd.merge(combined, part, on=merge_keys, how='outer')

    return combined


def main():
    """Fetch every configured indicator, save one CSV each, then a combined CSV.

    For each indicator the data is fetched through ``Data360APIClient``,
    reduced to REF_AREA / TIME_PERIOD / value, and written to
    ``<indicator id>.csv``. All usable frames are then outer-merged on
    REF_AREA and TIME_PERIOD and written to ``combined_wdi_data.csv``.
    """

    # Initialize API client
    client = Data360APIClient()

    # Define indicators to fetch
    indicators = [
        {
            'name': 'Urban population (% of total population)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_SP_URB_TOTL_IN_ZS'
        },
        {
            'name': 'Urban population',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_SP_URB_TOTL'
        },
        {
            'name': 'GDP (annual % growth)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_NY_GDP_MKTP_KD_ZG'
        },
        {
            'name': 'GDP (constant LCU)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_NY_GDP_MKTP_KN'
        },
        {
            'name': 'GDP per capita (current US$)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_NY_GDP_PCAP_CD'
        },
        {
            'name': 'GDP per capita (annual % growth)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_NY_GDP_PCAP_KD_ZG'
        },
        {
            'name': 'Trade (% of GDP)',
            'database_id': 'WB_WDI',
            'indicator': 'WB_WDI_NE_TRD_GNFS_ZS'
        },
        {
            'name': 'Foreign Direct Investment (FDI)',
            'database_id': 'FAO_FDI',
            'indicator': 'FAO_FDI'  # Note: You may need to adjust this indicator ID based on the actual API structure
        }
    ]

    # Set time period filter to start from 2010
    TIME_PERIOD_FROM = 2010
    TIME_PERIOD_TO = None
    REF_AREA = None

    # Fetch data for each indicator, keyed by display name
    all_dataframes = {}
    separator = '=' * 60

    for ind in indicators:
        print(f"\n{separator}")
        print(f"Processing: {ind['name']}")
        print(separator)

        df = client.get_all_data(
            database_id=ind['database_id'],
            indicator=ind['indicator'],
            ref_area=REF_AREA,
            time_period_from=TIME_PERIOD_FROM,
            time_period_to=TIME_PERIOD_TO
        )

        if not df.empty:
            df_selected = _select_indicator_columns(df, ind['name'])

            # Save individual indicator data
            filename = f"{ind['indicator']}.csv"
            df_selected.to_csv(filename, index=False)
            print(f"  Saved to: {filename}")

            # Store for combining later
            all_dataframes[ind['name']] = df_selected

        time.sleep(1)  # Be polite to the API

    # Combine all data (optional)
    if all_dataframes:
        print(f"\n{separator}")
        print("Combining all data...")
        print(separator)

        combined_df = _combine_indicator_frames(all_dataframes)

        if combined_df is None:
            # No frame had both merge keys and a value column.
            print("  No indicator data could be combined.")
        else:
            combined_df.to_csv('combined_wdi_data.csv', index=False)
            print("Combined data saved to: combined_wdi_data.csv")
            print(f"Total records: {len(combined_df)}")

        # Display summary
        print("\n" + separator)
        print("Summary by Indicator:")
        print(separator)
        for name, df in all_dataframes.items():
            print(f"{name}: {len(df)} records")

    print("\n✓ Data fetching completed!")


# Entry point: run the fetch only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Processing: Urban population (% of total population)
Fetching data for indicator: WB_WDI_SP_URB_TOTL_IN_ZS
  Fetched 1000 records (total: 1000)
  Fetched 1000 records (total: 2000)
  Fetched 1000 records (total: 3000)
  Fetched 1000 records (total: 4000)
  Fetched 1000 records (total: 5000)
  Fetched 1000 records (total: 6000)
  Fetched 1000 records (total: 7000)
  Fetched 1000 records (total: 8000)
  Fetched 1000 records (total: 9000)
  Fetched 1000 records (total: 10000)
  Fetched 1000 records (total: 11000)
  Fetched 1000 records (total: 12000)
  Fetched 1000 records (total: 13000)
  Fetched 1000 records (total: 14000)
  Fetched 1000 records (total: 15000)
  Fetched 1000 records (total: 16000)
  Fetched 1000 records (total: 17000)
  Fetched 95 records (total: 17095)
  Total records fetched: 17095
  Saved to: WB_WDI_SP_URB_TOTL_IN_ZS.csv

Processing: Urban population
Fetching data for indicator: WB_WDI_SP_URB_TOTL
  Fetched 1000 records (total: 1000)
  Fetched 1000 records (total: 2