# 📊 NOTEBOOK 1: ECONOMIC DATA COLLECTION (2018-2024)
# 🎯 OBJECTIVE: Collect economic indicators for the mortgage modeling period

## STRATEGIC CONTEXT:
- **Period**: 2018-2024 (7 years, 28 quarters)
- **Business Rationale**: Post-GFC recovery through COVID to current market
- **Key Events Captured**: 
  - 2018-2019: Stable growth period
  - 2020-2021: COVID-19 pandemic and response
  - 2022-2023: Inflation and rate hike cycle
  - 2024: Current market conditions

This timeframe provides robust data through multiple economic regimes.

In [None]:
# 📊 NOTEBOOK 1: MACROECONOMIC DATA COLLECTION
# 🎯 OBJECTIVE: Systematically gather all required economic indicators with robust error handling

# THINKING PROCESS:
# 1. We need multiple economic indicators that historically influence mortgage approvals
# 2. FRED API provides reliable, standardized economic data
# 3. We'll implement fallback mechanisms for data availability issues
# 4. Memory optimization is crucial for large time series

import pandas as pd
import numpy as np
import yfinance as yf
import pandas_datareader.data as web
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Business Context: Mortgage approvals are influenced by:
# - Overall economic health (GDP)
# - Labor market conditions (Unemployment)
# - Housing market trends (Home prices)
# - Interest rates (Mortgage rates)
# - Consumer confidence (Income growth)

print("üîÑ INITIALIZING ECONOMIC DATA COLLECTION PIPELINE")
print("=" * 60)

class EconomicDataCollector:
    """Collect macroeconomic time series from FRED via pandas-datareader.

    Every series that is obtained (from FRED, or from a fallback callable
    when the FRED request fails) is kept in ``collected_data`` under its
    human-readable name, so callers can either use the per-group return
    values or read everything back from that one mapping.

    Attributes:
        start_date: First date requested, formatted ``YYYY-MM-DD``.
        end_date: Last date requested, formatted ``YYYY-MM-DD``.
        collected_data: Mapping of series name -> fetched data.
    """

    def __init__(self, start_date='2000-01-01', end_date=None):
        """Set the request window and start with no collected data.

        Args:
            start_date: First date to request. Defaults to ``'2000-01-01'``.
            end_date: Last date to request. ``None`` (the default) means
                today's date at construction time.
        """
        self.start_date = start_date
        self.end_date = (
            end_date if end_date is not None
            else datetime.today().strftime('%Y-%m-%d')
        )
        self.collected_data = {}

    def safe_fred_fetch(self, series_id, series_name, fallback_method=None):
        """Fetch one FRED series, degrading gracefully on failure.

        Args:
            series_id: FRED series identifier passed to the data reader.
            series_name: Name used in log output and as the key in
                ``collected_data``.
            fallback_method: Optional zero-argument callable invoked when
                the FRED request fails or returns no usable data.

        Returns:
            The fetched data, the fallback's return value if the fetch
            failed and a fallback was supplied, or ``None`` otherwise.
        """
        try:
            print(f"üì° Attempting to fetch {series_name} ({series_id}) from FRED...")
            data = web.DataReader(series_id, 'fred', self.start_date, self.end_date)

            # An empty or all-NaN frame is treated as a failed fetch so that
            # the fallback path below gets a chance to run.
            if data.empty or data.isna().all().all():
                raise ValueError(f"No data returned for {series_id}")

            # Sparse series are still accepted; the warning is informational.
            coverage_pct = data.notna().mean().iloc[0]
            if coverage_pct < 0.7:
                print(f"‚ö†Ô∏è  Warning: {series_name} has only {coverage_pct:.1%} data coverage")
        except Exception as e:
            # Deliberately broad: this is the best-effort boundary, so any
            # failure (network, parsing, validation) is reported, not raised.
            print(f"‚ùå Failed to fetch {series_name}: {e}")
            data = fallback_method() if fallback_method else None
            # Record fallback data too, so collected_data reflects everything
            # that was actually obtained for this collector.
            if data is not None:
                self.collected_data[series_name] = data
            return data

        self.collected_data[series_name] = data
        print(f"‚úÖ Successfully collected {series_name} ({len(data)} observations)")
        return data

    def _fetch_group(self, indicators):
        """Fetch each ``{series_id: name}`` entry, omitting the ones that fail."""
        fetched = {}
        for series_id, name in indicators.items():
            data = self.safe_fred_fetch(series_id, name)
            if data is not None:
                fetched[name] = data
        return fetched

    def fetch_gdp_data(self):
        """Fetch the FRED series ``GDP`` and ``A191RL1Q225SBEA``.

        Returns:
            A ``(gdp, gdp_growth)`` tuple. Note this is a tuple, not a
            name-keyed dict like the other ``fetch_*`` methods return;
            either element is ``None`` if its fetch failed.
        """
        gdp = self.safe_fred_fetch('GDP', 'GDP')
        gdp_growth = self.safe_fred_fetch('A191RL1Q225SBEA', 'GDP_Growth_Quarterly')
        return gdp, gdp_growth

    def fetch_labor_market_data(self):
        """Fetch labor-market series; return ``{name: data}`` for successes."""
        return self._fetch_group({
            'UNRATE': 'Unemployment_Rate',
            'CIVPART': 'Labor_Force_Participation',
            'PAYEMS': 'Nonfarm_Payrolls',
        })

    def fetch_housing_market_data(self):
        """Fetch housing-market series; return ``{name: data}`` for successes."""
        return self._fetch_group({
            'CSUSHPINSA': 'Case_Shiller_Home_Price_Index',
            'MSPUS': 'Median_Sales_Price_Houses',
            'HOUST': 'Housing_Starts',
            'HSN1F': 'New_Home_Sales',
        })

    def fetch_interest_rates(self):
        """Fetch interest-rate series; return ``{name: data}`` for successes."""
        return self._fetch_group({
            'MORTGAGE30US': '30Y_Fixed_Mortgage_Rate',
            'FEDFUNDS': 'Federal_Funds_Rate',
            'GS10': '10Y_Treasury_Rate',
        })

    def fetch_income_consumer_data(self):
        """Fetch income/consumer series; return ``{name: data}`` for successes."""
        return self._fetch_group({
            'PCE': 'Personal_Consumption_Expenditures',
            'DSPIC96': 'Real_Disposable_Income',
            'UMCSENT': 'Consumer_Sentiment',
        })

print("üöÄ STARTING COMPREHENSIVE ECONOMIC DATA COLLECTION")
print("‚è∞ This may take 2-3 minutes due to API rate limits...")

# Initialize collector with memory optimization
collector = EconomicDataCollector()

# üéØ SYSTEMATIC DATA COLLECTION STRATEGY
print("\nüìä PHASE 1: Collecting GDP and Economic Growth Data")
gdp_data = collector.fetch_gdp_data()

print("\nüëî PHASE 2: Collecting Labor Market Indicators")
labor_data = collector.fetch_labor_market_data()

print("\nüè† PHASE 3: Collecting Housing Market Data")
housing_data = collector.fetch_housing_market_data()

print("\nüíµ PHASE 4: Collecting Interest Rate Data")
rate_data = collector.fetch_interest_rates()

print("\nüõí PHASE 5: Collecting Income and Consumer Data")
consumer_data = collector.fetch_income_consumer_data()

# Combine every collected indicator into one daily-indexed master table.
print("\nüîó MERGING ALL ECONOMIC INDICATORS INTO MASTER DATASET")

# Calendar-day index (freq='D' includes weekends) spanning the request window.
base_dates = pd.date_range(start=collector.start_date, end=collector.end_date, freq='D')

# Empty frame on the daily index; one column is added per indicator below.
master_data = pd.DataFrame(index=base_dates)

# fetch_gdp_data() returns a (gdp, gdp_growth) tuple rather than a
# name -> data dict. Give it the same shape as the other groups so the GDP
# series are merged too (a dict-only check here used to skip them silently).
if isinstance(gdp_data, dict):
    gdp_named = gdp_data
else:
    gdp_named = dict(zip(('GDP', 'GDP_Growth_Quarterly'), gdp_data))

all_datasets = [gdp_named, labor_data, housing_data, rate_data, consumer_data]
for dataset in all_datasets:
    for name, data in dataset.items():
        if data is None:
            continue
        # Each fetch yields a one-column frame; take that column as a Series
        # so the assignment below is an unambiguous single-column insert.
        series = data.iloc[:, 0] if isinstance(data, pd.DataFrame) else data
        # Forward-fill onto the daily index: each observation is carried
        # forward until the next one; dates before the first observation
        # stay NaN. This keeps original values available for later resampling.
        master_data[name] = series.reindex(base_dates, method='ffill')

# Memory optimization: store float64 columns as float32 (half the bytes per
# value, at the cost of roughly 7 significant decimal digits of precision).
print("üíæ OPTIMIZING MEMORY USAGE THROUGH DATA TYPE SELECTION")
_float64_cols = master_data.select_dtypes(include='float64').columns
master_data = master_data.astype({col: 'float32' for col in _float64_cols})

# Ensure the index is datetime-typed; done once rather than once per column.
master_data.index = pd.to_datetime(master_data.index)

# Data quality report: per-series missing share and first/last valid month.
print("\n" + "=" * 60)
print("üìä DATA COLLECTION QUALITY REPORT")
print("=" * 60)

total_series = len(master_data.columns)
missing_data_report = []

for col, _values in master_data.items():
    # first/last_valid_index() return None when the column is entirely NaN.
    _first_valid = _values.first_valid_index()
    _last_valid = _values.last_valid_index()
    missing_pct = _values.isna().mean() * 100

    _row = {
        'Series': col,
        'Missing (%)': f"{missing_pct:.1f}%",
        'Start Date': _first_valid.strftime('%Y-%m') if _first_valid else 'N/A',
        'End Date': _last_valid.strftime('%Y-%m') if _last_valid else 'N/A',
    }
    missing_data_report.append(_row)

# Render the report as a plain-text table without the row index.
quality_df = pd.DataFrame(missing_data_report)
print(quality_df.to_string(index=False))

# Persist the master table: a timestamped raw snapshot plus a fixed-name
# copy under processed/ that is overwritten on every run.
print("\nüíø SAVING COLLECTED DATA WITH VERSION CONTROL")
import os

for _out_dir in ('../data/raw', '../data/processed'):
    os.makedirs(_out_dir, exist_ok=True)

# Minute-resolution timestamp distinguishes successive raw snapshots.
timestamp = datetime.now().strftime('%Y%m%d_%H%M')
_raw_snapshot_path = f'../data/raw/economic_indicators_raw_{timestamp}.parquet'
_processed_path = '../data/processed/master_economic_data.parquet'

master_data.to_parquet(_raw_snapshot_path)
master_data.to_parquet(_processed_path)

# Run summary: series count, output location, date coverage, memory footprint.
print(f"‚úÖ SUCCESSFULLY COLLECTED {total_series} ECONOMIC INDICATORS")
print(f"üìÅ Data saved to: ../data/processed/master_economic_data.parquet")
print(f"üìÖ Coverage: {master_data.index.min().strftime('%Y-%m')} to {master_data.index.max().strftime('%Y-%m')}")
print(f"üíæ Memory usage: {master_data.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")

# Closing banner: a separator rule, then a preview of the next notebook.
for _line in (
    "\n" + "‚û°Ô∏è" * 30,
    "NEXT STEP: Data Cleaning & Transformation (Notebook 2)",
    "‚Ä¢ Handle missing values strategically",
    "‚Ä¢ Create quarterly aggregates for HMDA alignment",
    "‚Ä¢ Generate lagged features for predictive modeling",
):
    print(_line)