In [None]:
# GDP Growth Analysis Project
## Day 1: Data Fetching

**Project Goal:** Analyze GDP growth across continents and countries

**Data Sources:**
- REST Countries API: Country information and continent mapping
- World Bank API: GDP data (indicator `NY.GDP.MKTP.CD`) for 2000–2023

---

In [3]:
# Import libraries (these are tools we'll use)
import pandas as pd  # For working with data tables
import requests      # For getting data from websites
import json         # For working with JSON data format
import time         # For adding delays between requests
import os           # For creating folders

# Announce the start of the run: title line followed by a 50-character rule
print("🚀 Starting GDP Growth Analysis Project!", "=" * 50, sep="\n")

# Step 0: Create data folders if they don't exist
def create_data_folders(folders=None):
    """Make sure the project's data folders exist, creating any that are missing.

    Args:
        folders: Optional iterable of folder paths to create. When omitted, the
            project defaults '../data/raw' and '../data/processed' are used.

    Returns:
        None. One status line is printed per folder.

    Raises:
        OSError: If a folder cannot be created, for example because the path
            already exists but is a regular file.
    """
    if folders is None:
        folders = ['../data/raw', '../data/processed']

    for folder in folders:
        # Work out the message first, then create with exist_ok=True so that a
        # folder appearing between the check and the call cannot raise.
        already_there = os.path.isdir(folder)
        os.makedirs(folder, exist_ok=True)
        if already_there:
            print(f"📁 Folder already exists: {folder}")
        else:
            print(f"📁 Created folder: {folder}")

# Step 1: Get data about countries and their continents (FIXED)
def fetch_country_data(timeout=30):
    """Download the list of countries from the REST Countries API.

    Only the fields named in the request (name, region, subregion, continents,
    cca2, cca3) are asked for.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        The decoded JSON payload on success, or None when the request fails,
        the server answers with a status other than 200, or the body is not
        valid JSON. Problems are reported with a printed message, not raised.
    """
    print("📍 Getting country information...")

    url = "https://restcountries.com/v3.1/all"
    params = {
        'fields': 'name,region,subregion,continents,cca2,cca3'
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)

        if response.status_code != 200:
            print(f"❌ Error: {response.status_code}")
            print(f"Response: {response.text}")
            return None

        # Decode before announcing success so a malformed body is not
        # reported as a successful download.
        countries = response.json()

    except Exception as e:
        # Deliberately broad: this function reports failures instead of raising
        print(f"❌ Something went wrong: {e}")
        return None

    print("✅ Successfully got country data!")
    return countries

# Step 2: Get GDP data from World Bank
def fetch_gdp_data(timeout=60):
    """Download GDP records (indicator NY.GDP.MKTP.CD) for all countries, 2000-2023.

    GDP = Gross Domestic Product (how much money a country makes in a year).

    The first element of each response is treated as paging metadata; when it
    reports more than one page, the remaining pages are requested too so the
    result is never silently cut off at the first page.

    Args:
        timeout: Seconds to wait for each response before giving up.

    Returns:
        A list with the records of all pages, or None when a request fails,
        a response has a status other than 200, or a page carries no data
        section. Problems are reported with a printed message, not raised.
    """
    print("💰 Getting GDP data from World Bank...")

    url = "https://api.worldbank.org/v2/country/all/indicator/NY.GDP.MKTP.CD"
    base_params = {
        'format': 'json',
        'date': '2000:2023',
        'per_page': 20000
    }

    records = []
    page = 1

    try:
        while True:
            response = requests.get(
                url, params={**base_params, 'page': page}, timeout=timeout
            )

            if response.status_code != 200:
                print(f"❌ Error: {response.status_code}")
                return None

            data = response.json()
            page_records = data[1] if len(data) > 1 else None
            if page_records is None:
                # A page without a data section means the download is unusable;
                # anything collected so far is discarded rather than returned.
                print("❌ No GDP data found")
                return None
            records.extend(page_records)

            # Keep going only while the metadata says more pages remain
            meta = data[0] if isinstance(data[0], dict) else {}
            total_pages = int(meta.get('pages') or 1)
            if page >= total_pages:
                break
            page += 1

    except Exception as e:
        # Deliberately broad: this function reports failures instead of raising
        print(f"❌ Something went wrong: {e}")
        return None

    print("✅ Successfully got GDP data!")
    print(f"📊 Retrieved {len(records)} data points")
    return records

# Step 3: Save data to files
def save_data_to_files(countries_data, gdp_data, raw_dir='../data/raw'):
    """Write the fetched datasets to JSON files so they are not lost.

    Each non-empty dataset is written as UTF-8, indented JSON:
    countries to 'countries_raw.json' and GDP to 'gdp_raw.json' inside
    `raw_dir`. The folder is created first if it does not exist.

    Args:
        countries_data: Country payload to save; skipped when empty or None.
        gdp_data: GDP payload to save; skipped when empty or None.
        raw_dir: Folder that receives the files.

    Returns:
        True when at least one file was written without error. False when
        writing failed or when there was nothing to save.
    """
    print("\n💾 Saving data to files...")

    datasets = [
        ("countries", countries_data, "countries_raw.json"),
        ("GDP", gdp_data, "gdp_raw.json"),
    ]
    saved_any = False

    try:
        os.makedirs(raw_dir, exist_ok=True)

        for label, payload, filename in datasets:
            if not payload:
                continue
            path = os.path.join(raw_dir, filename)
            with open(path, 'w', encoding='utf-8') as f:
                json.dump(payload, f, indent=2, ensure_ascii=False)
            # Report the path that was actually written
            print(f"✅ Saved {label} data to: {path}")
            saved_any = True

    except Exception as e:
        # Deliberately broad: this function reports failures instead of raising
        print(f"❌ Error saving data: {e}")
        return False

    if not saved_any:
        # Do not claim success when both inputs were empty
        print("⚠️ Nothing to save: no data was provided")
        return False

    print("💾 All data saved successfully!")
    return True

# Step 4: Load saved data (for testing)
def load_saved_data(raw_dir='../data/raw'):
    """Read the two saved JSON files back from disk (useful for testing).

    Args:
        raw_dir: Folder holding 'countries_raw.json' and 'gdp_raw.json'.

    Returns:
        A tuple (countries_data, gdp_data) with the decoded contents of the
        two files. If either file is missing or cannot be read or decoded,
        (None, None) is returned and a message is printed instead of raising.
    """
    print("\n📂 Loading saved data...")

    countries_path = os.path.join(raw_dir, 'countries_raw.json')
    gdp_path = os.path.join(raw_dir, 'gdp_raw.json')

    try:
        with open(countries_path, 'r', encoding='utf-8') as f:
            countries_data = json.load(f)
        print(f"✅ Loaded {len(countries_data)} countries from file")

        with open(gdp_path, 'r', encoding='utf-8') as f:
            gdp_data = json.load(f)
        print(f"✅ Loaded {len(gdp_data)} GDP records from file")

        return countries_data, gdp_data

    except FileNotFoundError:
        print("❌ No saved data files found. Run data fetching first.")
        return None, None
    except Exception as e:
        # Covers unreadable or malformed files; reported instead of raised
        print(f"❌ Error loading data: {e}")
        return None, None

# Step 5: Simple API test function
def _check_api(label, url, params, timeout):
    """Send one GET request, print its status, and return True only for HTTP 200."""
    try:
        response = requests.get(url, params=params, timeout=timeout)
        print(f"   Status: {response.status_code}")
        if response.status_code == 200:
            print(f"   ✅ {label} API is working!")
            return True
    except Exception as e:
        # Deliberately broad: a failed probe is reported, never raised
        print(f"   ❌ Connection error: {e}")
    return False


def test_apis(timeout=15):
    """Simple test to make sure both APIs are working.

    Sends one small request to the REST Countries API and one to the World
    Bank API and prints the outcome of each.

    Args:
        timeout: Seconds to wait for each response, so an unreachable service
            cannot stall the notebook.

    Returns:
        A dict {'rest_countries': bool, 'world_bank': bool}; a value is True
        when that API answered with status 200.
    """
    print("🧪 Testing APIs...")

    print("\n1️⃣ Testing REST Countries API...")
    rest_countries_ok = _check_api(
        "REST Countries",
        "https://restcountries.com/v3.1/all",
        {'fields': 'name'},
        timeout,
    )

    print("\n2️⃣ Testing World Bank API...")
    world_bank_ok = _check_api(
        "World Bank",
        "https://api.worldbank.org/v2/country/US/indicator/NY.GDP.MKTP.CD",
        {'format': 'json', 'date': '2022', 'per_page': 1},
        timeout,
    )

    return {'rest_countries': rest_countries_ok, 'world_bank': world_bank_ok}

# MAIN EXECUTION
# Prepare the output folders, then confirm both APIs respond before downloading
for banner, step in (
    ("\n🔧 Setting up data folders...", create_data_folders),
    ("\n🧪 Testing APIs first...", test_apis),
):
    print(banner)
    step()

print("\n🔄 Starting data collection...")

# Download both datasets, pausing two seconds between the two services
countries_data = fetch_country_data()
time.sleep(2)
gdp_data = fetch_gdp_data()

# Persist whatever came back; skipped only when both downloads returned nothing
if any((countries_data, gdp_data)):
    save_data_to_files(countries_data, gdp_data)

# Quick data preview
if countries_data:
    print("\n📋 Quick preview of first 5 countries:")
    # Number the entries from 1 and fall back to 'Unknown' for missing fields
    for position, entry in enumerate(countries_data[:5], start=1):
        common_name = entry.get('name', {}).get('common', 'Unknown')
        region_name = entry.get('region', 'Unknown')
        continent_list = entry.get('continents')
        first_continent = continent_list[0] if continent_list else 'Unknown'
        print(f"   {position}. {common_name} - {region_name} ({first_continent})")

if gdp_data:
    print("\n💰 GDP data sample:")
    shown = 0
    # Look at the first 10 records only and print at most 3 that carry a value
    for entry in gdp_data[:10]:
        amount = entry.get('value')
        if amount is None:
            continue
        label = entry.get('country', {}).get('value', 'Unknown')
        period = entry.get('date', 'Unknown')
        print(f"   {label} ({period}): ${amount:,.0f}")
        shown += 1
        if shown == 3:
            break

# Final summary
print("\n" + "="*50)
print("📊 COLLECTION STAGE COMPLETE - DATA COLLECTION SUMMARY")
print("="*50)

if countries_data:
    continents = set()
    for country in countries_data:
        # `or []` covers both a missing key and an explicit null/empty value
        continents.update(country.get('continents') or [])
    print(f"✅ Countries: {len(countries_data)} countries collected and saved")
    print(f"🌍 Continents: {len(continents)} ({', '.join(sorted(continents))})")
else:
    print("⚠️ Countries: no data collected")

if gdp_data:
    # Skip records whose country entry has no name so None is not counted as a country
    countries_in_gdp = set(
        record['country'].get('value')
        for record in gdp_data
        if record.get('country') and record['country'].get('value')
    )
    years_in_gdp = set(record.get('date') for record in gdp_data if record.get('date'))
    records_with_values = sum(1 for record in gdp_data if record.get('value') is not None)

    print(f"✅ GDP Data: {len(gdp_data)} records collected and saved")
    print(f"📈 Countries with GDP data: {len(countries_in_gdp)}")
    # min()/max() raise ValueError on an empty set, so only report a range when there is one
    if years_in_gdp:
        print(f"📅 Years covered: {min(years_in_gdp)} to {max(years_in_gdp)}")
    print(f"💰 Records with actual values: {records_with_values}")
else:
    print("⚠️ GDP Data: no data collected")

# List a file only when its dataset was fetched in this run and the file is on disk
expected_files = [
    (countries_data, 'countries_raw.json'),
    (gdp_data, 'gdp_raw.json'),
]
created_files = [
    filename
    for dataset, filename in expected_files
    if dataset and os.path.exists(os.path.join('../data/raw', filename))
]

if created_files:
    print("\n📁 Data files created:")
    for filename in created_files:
        print(f"   - data/raw/{filename}")

if len(created_files) == len(expected_files):
    print("💾 All data fetched and saved - ready for cleaning stage!")
else:
    print("⚠️ Some data is missing - fix the errors above before moving on to the cleaning stage.")

# Test loading the saved data
print("\n🔍 Testing data loading...")
test_countries, test_gdp = load_saved_data()

# Both datasets must come back non-empty for the round trip to count as working
print(
    "✅ Data loading works perfectly!"
    if all((test_countries, test_gdp))
    else "⚠️ There might be an issue with saved data."
)

🚀 Starting GDP Growth Analysis Project!

🔧 Setting up data folders...
📁 Folder already exists: ../data/raw
📁 Folder already exists: ../data/processed

🧪 Testing APIs first...
🧪 Testing APIs...

1️⃣ Testing REST Countries API...
   Status: 200
   ✅ REST Countries API is working!

2️⃣ Testing World Bank API...
   Status: 200
   ✅ World Bank API is working!

🔄 Starting data collection...
📍 Getting country information...
✅ Successfully got country data!
💰 Getting GDP data from World Bank...
✅ Successfully got GDP data!
📊 Retrieved 6384 data points

💾 Saving data to files...
✅ Saved countries data to: data/raw/countries_raw.json
✅ Saved GDP data to: data/raw/gdp_raw.json
💾 All data saved successfully!

📋 Quick preview of first 5 countries:
   1. Tunisia - Africa (Africa)
   2. Andorra - Europe (Europe)
   3. Vietnam - Asia (Asia)
   4. Ecuador - Americas (South America)
   5. Puerto Rico - Americas (North America)

💰 GDP data sample:
   Africa Eastern and Southern (2023): $1,176,909,900,789