# Mortgage Forecasting - Data Wrangling
## HMDA Data Processing and Aggregation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import yaml
import warnings
# Silence every warning category for the rest of the session so cell output
# stays uncluttered.
# NOTE(review): this also hides pandas/matplotlib deprecation notices —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set up plotting: pick the matplotlib style sheet, set the seaborn colour
# palette, and ask IPython to render figures inline in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

## 1. Configuration and Setup

In [None]:
# Read the project settings from the YAML configuration file
with open('../config/config.yaml', 'r') as cfg_stream:
    config = yaml.safe_load(cfg_stream)

# Echo the geography and year settings that drive the rest of the notebook
print("Project Configuration:")
for label, key in (
    ("Target Geography", 'target_geography'),
    ("Geography Code", 'geography_code'),
    ("Years", 'hmda_years'),
):
    print(f"{label}: {config['data'][key]}")

## 2. HMDA Data Loading Function

In [None]:
def load_hmda_year(year, data_dir="../data/raw"):
    """Load one year of HMDA records from ``<data_dir>/hmda_<year>.csv``.

    Args:
        year: Reporting year; used only to build the file name.
        data_dir: Directory that holds the raw yearly CSV files. Defaults to
            the notebook's ``../data/raw`` folder, so existing one-argument
            calls behave exactly as before.

    Returns:
        The loaded ``DataFrame``, or ``None`` when the file does not exist or
        cannot be read. A message is printed in either failure case.
    """
    file_path = Path(data_dir) / f"hmda_{year}.csv"

    if not file_path.exists():
        print(f"File not found: {file_path}")
        return None

    print(f"Loading HMDA {year}...")

    # Explicit dtypes: the geography codes are read as strings so they are
    # not reinterpreted as numbers; amounts/income are floats so blanks can
    # be represented as NaN.
    dtype_dict = {
        'loan_amount': 'float64',
        'action_taken': 'int64',
        'state_code': 'str',
        'county_code': 'str',
        'msa_md': 'str',
        'applicant_income_000s': 'float64',
        'as_of_year': 'int64'
    }

    try:
        df = pd.read_csv(file_path, dtype=dtype_dict, low_memory=False)
    except Exception as e:
        # Deliberately best-effort: one unreadable year is reported and
        # skipped so the remaining years can still be loaded by the caller.
        print(f"  Error loading {year}: {e}")
        return None

    print(f"  Loaded {len(df):,} records")
    return df

# Load every configured year, keeping only the years that actually loaded
all_data = [
    frame
    for frame in map(load_hmda_year, config['data']['hmda_years'])
    if frame is not None
]

# Stack the yearly frames into one table, or fall back to an empty frame
if not all_data:
    print("No data loaded!")
    hmda_raw = pd.DataFrame()
else:
    hmda_raw = pd.concat(all_data, ignore_index=True)
    print(f"\nTotal combined records: {len(hmda_raw):,}")

## 3. Data Exploration and Understanding

In [None]:
# Show the shape, column names, first rows and dtypes of the combined data
if not hmda_raw.empty:
    print(
        "Data Overview:",
        f"Shape: {hmda_raw.shape}",
        "\nColumns:",
        hmda_raw.columns.tolist(),
        sep="\n",
    )

    # Rich notebook rendering for the two tabular views
    for heading, view in (
        ("\nSample data:", hmda_raw.head()),
        ("\nData types:", hmda_raw.dtypes),
    ):
        print(heading)
        display(view)

In [None]:
# Summarise how many records fall under each action_taken code
if 'action_taken' in hmda_raw.columns:
    print("Action Taken Distribution:")

    # Common HMDA action codes, numbered from 1 in the order listed
    action_map = dict(enumerate(
        (
            'Loan originated',
            'Application approved but not accepted',
            'Application denied',
            'Application withdrawn by applicant',
            'File closed for incompleteness',
            'Purchased loan',
            'Preapproval request denied',
            'Preapproval request approved but not accepted',
        ),
        start=1,
    ))

    action_counts = hmda_raw['action_taken'].value_counts().sort_index()
    total_records = len(hmda_raw)

    for code, count in action_counts.items():
        # Codes missing from the map are labelled 'Unknown'
        desc = action_map.get(code, 'Unknown')
        share = count / total_records * 100
        print(f"  {code}: {desc} - {count:,} records ({share:.1f}%)")

## 4. Data Filtering and Cleaning

In [None]:
def filter_originated_loans(df):
    """Keep only originated loans (rows where ``action_taken == 1``).

    Args:
        df: HMDA records; expected to carry an ``action_taken`` column.

    Returns:
        A copy of the rows whose ``action_taken`` equals 1. When the column
        is absent a warning is printed and ``df`` itself is returned
        unfiltered.
    """
    if 'action_taken' not in df.columns:
        print("Warning: 'action_taken' column not found")
        return df

    original_count = len(df)
    df_originated = df[df['action_taken'] == 1].copy()

    # An empty input has no meaningful share; report 0% instead of dividing
    # by zero.
    share = len(df_originated) / original_count * 100 if original_count else 0.0
    print(f"Filtered to originated loans: {len(df_originated):,} records ({share:.1f}% of total)")

    return df_originated

def filter_target_geography(df, geography_type, geography_code):
    """Keep only the rows that belong to the target geography.

    Args:
        df: HMDA records.
        geography_type: ``'msa'`` filters on the ``msa_md`` column; any other
            value filters on ``state_code``.
        geography_code: Code to match. It may be a string or a number: a code
            written without quotes in a YAML config arrives as an int, while
            the geography columns are typically strings, so the comparison is
            done on the text form unless both sides are numeric.
            NOTE: codes with leading zeros should be supplied as strings,
            otherwise the zeros are lost before they reach this function.

    Returns:
        A copy of the matching rows. When the required column is absent a
        warning is printed and ``df`` itself is returned unfiltered.
    """
    if geography_type == 'msa':
        column, label = 'msa_md', 'MSA'
    else:  # state
        column, label = 'state_code', 'state'

    if column not in df.columns:
        print(f"Warning: '{column}' column not found")
        return df

    values = df[column]
    if pd.api.types.is_numeric_dtype(values) and not isinstance(geography_code, str):
        # Both sides numeric: plain value comparison
        matches = values == geography_code
    else:
        # Mixed or textual types: compare the stripped string forms so that
        # 35620 matches '35620'
        matches = values.astype(str).str.strip() == str(geography_code).strip()

    df_geo = df[matches].copy()
    print(f"Filtered to {label} {geography_code}: {len(df_geo):,} records")

    return df_geo

# Apply filters: originated loans first, then the configured geography
if not hmda_raw.empty:
    print("Applying data filters...")

    # Filter for originated loans
    hmda_originated = filter_originated_loans(hmda_raw)

    # Filter for target geography
    hmda_filtered = filter_target_geography(
        hmda_originated,
        config['data']['geography_type'],
        config['data']['geography_code']
    )

    print(f"\nFinal filtered dataset: {len(hmda_filtered):,} records")
else:
    # Nothing was loaded. Still define the downstream names so that later
    # cells can run their own emptiness checks instead of raising NameError.
    print("No HMDA data available - skipping filters")
    hmda_originated = pd.DataFrame()
    hmda_filtered = pd.DataFrame()

## 5. Data Quality Checks

In [None]:
# Report nulls, loan-amount statistics and per-year record counts
if not hmda_filtered.empty:
    print("Data Quality Checks:")

    # List every column that has at least one missing value
    print("\nMissing Values:")
    n_rows = len(hmda_filtered)
    missing_data = hmda_filtered.isnull().sum()
    for col, missing_count in missing_data[missing_data > 0].items():
        print(f"  {col}: {missing_count:,} missing ({missing_count/n_rows*100:.1f}%)")

    # Summarise the loan amount distribution
    if 'loan_amount' in hmda_filtered.columns:
        amounts = hmda_filtered['loan_amount']
        print("\nLoan Amount Statistics:")
        for label, stat in (
            ("Min", amounts.min),
            ("Mean", amounts.mean),
            ("Median", amounts.median),
            ("Max", amounts.max),
        ):
            print(f"  {label}: ${stat():,.0f}")

        # Flag amounts that are zero or below
        zero_or_negative = amounts.le(0).sum()
        if zero_or_negative > 0:
            print(f"  WARNING: {zero_or_negative} records with zero or negative loan amounts")

    # Show how many records each year contributes
    if 'as_of_year' in hmda_filtered.columns:
        print("\nYearly Distribution:")
        year_counts = hmda_filtered['as_of_year'].value_counts().sort_index()
        for year, count in year_counts.items():
            print(f"  {year}: {count:,} records")

## 6. Time Series Aggregation

In [None]:
def create_quarterly_series(df):
    """Aggregate loan volume into a gap-free quarterly time series.

    The input carries only a reporting year (``as_of_year``), not an exact
    date, so each record is assigned to one of the four quarters of its year
    at random, using a fixed seed for reproducibility. The within-year
    quarterly pattern is therefore synthetic; only the yearly totals reflect
    the data.

    Args:
        df: Loan records with ``as_of_year`` and ``loan_amount`` columns.
            The frame is not modified and its index may be arbitrary (for
            example the leftover labels of a filtered frame).

    Returns:
        A ``DataFrame`` with one row per quarter between the first and last
        observed quarter and the columns ``date`` (end-of-quarter timestamp),
        ``total_loan_volume``, ``loan_count`` and ``avg_loan_size``. Quarters
        without loans get a volume and count of 0 and a NaN average. An empty
        input returns an empty ``DataFrame``.
    """
    if df.empty:
        return pd.DataFrame()

    # Private generator with the same seed as before: reproducible without
    # reseeding NumPy's global random state as a side effect.
    rng = np.random.RandomState(42)
    quarter_numbers = rng.choice([1, 2, 3, 4], len(df))

    # Build the quarter positionally from plain arrays. The input usually
    # comes from a filtered frame whose index is not 0..n-1, so combining it
    # with a freshly indexed Series would misalign rows and drop them.
    quarter_month = pd.to_datetime(pd.DataFrame({
        'year': df['as_of_year'].astype('int64').to_numpy(),
        'month': quarter_numbers * 3,  # last month of the assigned quarter
        'day': 1,
    }))
    records = pd.DataFrame({
        'quarter': quarter_month.dt.to_period('Q'),
        'loan_amount': df['loan_amount'].to_numpy(),
    })

    # Aggregate by quarter
    quarterly = records.groupby('quarter')['loan_amount'].agg(
        total_loan_volume='sum',
        loan_count='count',
        avg_loan_size='mean',
    )

    # Extend to every quarter between the first and the last observed one
    all_quarters = pd.period_range(
        quarterly.index.min(), quarterly.index.max(), freq='Q'
    )
    quarterly = quarterly.reindex(all_quarters)

    # Quarters with no loans: zero volume and count (the average stays NaN)
    quarterly['total_loan_volume'] = quarterly['total_loan_volume'].fillna(0)
    quarterly['loan_count'] = quarterly['loan_count'].fillna(0).astype('int64')

    # Expose the quarter as its end-of-quarter timestamp in a 'date' column
    quarterly.insert(0, 'date', quarterly.index.end_time)
    return quarterly.reset_index(drop=True)

# Create quarterly time series
if not hmda_filtered.empty:
    quarterly_series = create_quarterly_series(hmda_filtered)

    print("Quarterly Time Series:")
    print(f"Time range: {quarterly_series['date'].min()} to {quarterly_series['date'].max()}")
    print(f"Total quarters: {len(quarterly_series)}")

    display(quarterly_series.head(10))

    # Summary statistics
    print("\nQuarterly Volume Summary:")
    print(f"Total volume: ${quarterly_series['total_loan_volume'].sum():,.0f}")
    print(f"Average quarterly volume: ${quarterly_series['total_loan_volume'].mean():,.0f}")
    print(f"Max quarterly volume: ${quarterly_series['total_loan_volume'].max():,.0f}")
    print(f"Min quarterly volume: ${quarterly_series['total_loan_volume'].min():,.0f}")
else:
    # No rows survived the filters. Define an empty series so the save and
    # plot cells can skip cleanly instead of raising NameError.
    print("No filtered HMDA records - quarterly series is empty")
    quarterly_series = pd.DataFrame()

## 7. Save Processed Data

In [None]:
# Persist the quarterly series and the filtered loan-level records
if not quarterly_series.empty:
    # Both outputs live in the same folder; create it once if needed
    processed_dir = Path("../data/processed")
    processed_dir.mkdir(parents=True, exist_ok=True)

    output_path = processed_dir / "quarterly_mortgage_volume.csv"
    quarterly_series.to_csv(output_path, index=False)
    print(f"\nProcessed data saved to: {output_path}")

    # Keep the filtered HMDA rows as well, for reuse by later steps
    filtered_path = processed_dir / "filtered_hmda_data.csv"
    hmda_filtered.to_csv(filtered_path, index=False)
    print(f"Filtered HMDA data saved to: {filtered_path}")

## 8. Quick Visualization

In [None]:
# Plot quarterly volume and loan count, then save the figure to disk
if not quarterly_series.empty:
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

    # Plot 1: Total loan volume, scaled to millions of dollars
    ax1.plot(quarterly_series['date'], quarterly_series['total_loan_volume'] / 1e6,
             linewidth=2, marker='o', markersize=4)
    ax1.set_title(f'Quarterly Mortgage Origination Volume\n{config["data"]["target_geography"]}',
                 fontsize=14, fontweight='bold')
    ax1.set_ylabel('Volume (Millions $)')
    ax1.grid(True, alpha=0.3)

    # Plot 2: Loan count
    ax2.plot(quarterly_series['date'], quarterly_series['loan_count'],
             linewidth=2, marker='o', markersize=4, color='orange')
    ax2.set_title('Quarterly Loan Count', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Number of Loans')
    ax2.set_xlabel('Date')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Save the plot. Create the output folder first, as the save cell does
    # for the processed data, so a fresh checkout does not fail here.
    plot_path = Path('../outputs/data_wrangling_timeseries.png')
    plot_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(plot_path, dpi=300, bbox_inches='tight')

## Summary

In this notebook we:
1. Loaded and combined multiple years of HMDA data
2. Filtered for originated loans in our target geography
3. Performed data quality checks
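4. Aggregated to a quarterly time series (quarters are assigned at random within each year, because the code has only the reporting year to work from, so the within-year pattern is synthetic)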
5. Saved processed data for analysis
6. Created initial visualizations

The data is now ready for exploratory analysis in the next notebook.