In [5]:
import pandas as pd
import numpy as np
import os

In [6]:
def process_economic_data(files, rate_file):
    """
    Load economic series and the interest rate series, align them on one shared
    daily index, and return their quarterly means.

    Each CSV must contain a 'DATE' column plus at least one value column. Every
    series is reindexed onto a daily calendar spanning the earliest to the latest
    date found across all loaded files, gaps are filled from its first observation
    onwards, and the merged daily frame is averaged per calendar quarter. Rows
    before a series' first observation stay NA.

    Parameters:
    -----------
    files : list
        List (or any iterable) of paths to economic data files
    rate_file : str
        Path to the interest rate data file

    Returns:
    --------
    pd.DataFrame
        Quarterly means of all loaded series, indexed by quarter-end date

    Raises:
    -------
    ValueError
        If no file could be loaded, or if no loaded series could be processed
    """
    # Dictionary for more readable column names
    column_names = {
        'GDP': 'GDP',
        'INDPRO': 'Industrial_Production',
        'RSXFS': 'Retail_Sales',
        'W790RC1Q027SBEA': 'Wage_and_Salary',
        'CPIAUCSL': 'Consumer_Price_Index',
        'M2SL': 'Money_Supply_M2',
        'PCE': 'Personal_Consumption',
        'PPIACO': 'Producer_Price_Index',
        'CES0500000003': 'Average_Hourly_Earnings',
        'JTSJOL': 'Job_Openings',
        'UNRATE': 'Unemployment_Rate',
        'COMREPUSQ159N': 'Commercial_Real_Estate_Prices',
        'MSPUS': 'Median_House_Price',
        'T10Y2Y': 'Treasury_Spread_10Y_2Y',
        'NASDAQCOM': 'Nasdaq_Composite',
        'FEDFUNDS': 'Federal_Funds_Rate'
    }

    def load_single_file(file_path):
        """Load one CSV and return (series name, DATE-indexed frame), or (None, None) on failure."""
        try:
            # Series name = file name without directory or extension; both
            # separator styles are accepted so Windows-style paths also work
            name = file_path.replace('\\', '/').split('/')[-1].split('.')[0]

            # Read the CSV file
            df = pd.read_csv(file_path)

            # Ensure we have the expected columns
            if 'DATE' not in df.columns or len(df.columns) < 2:
                raise ValueError(f"File {name} missing required columns")

            # Convert DATE to datetime and set as index
            df['DATE'] = pd.to_datetime(df['DATE'])
            df = df.set_index('DATE')

            # The first remaining column is the value column; non-numeric
            # placeholders become NaN instead of raising
            value_col = df.columns[0]
            df[value_col] = pd.to_numeric(df[value_col], errors='coerce')

            # Rename only the value column, so a file carrying extra columns
            # no longer fails with a length mismatch and gets dropped
            if name in column_names:
                df = df.rename(columns={value_col: column_names[name]})

            return name, df

        except Exception as e:
            # Best effort: report the failure and let the remaining files load
            print(f"Error loading {file_path}: {str(e)}")
            return None, None

    # Load all files (list() lets callers pass a tuple or other iterable)
    dfs = {}
    all_files = list(files) + [rate_file]
    for file in all_files:
        name, df = load_single_file(file)
        if df is not None:
            dfs[name] = df
            print(f"Successfully loaded: {name} as {df.columns[0]}")

    if not dfs:
        raise ValueError("No valid data files were loaded")

    # Find global date range
    start_date = min(df.index.min() for df in dfs.values())
    end_date = max(df.index.max() for df in dfs.values())
    daily_index = pd.date_range(start=start_date, end=end_date, freq='D')

    def process_single_df(df):
        """Reindex one frame onto the shared daily index and fill gaps after its first observation."""
        # Reindex to daily frequency
        daily_df = df.reindex(daily_index)

        # Nothing to fill when the series holds no valid value at all
        first_valid_date = df.first_valid_index()
        if first_valid_date is None:
            return daily_df

        # Only rows from the first observation onwards are filled, so the
        # leading NA run is preserved
        mask = daily_df.index >= first_valid_date

        # NOTE: the forward fill also carries the last observation through to
        # the global end date, not just to the series' own final date.
        # .ffill()/.bfill() replace the deprecated fillna(method=...) calls.
        daily_df.loc[mask] = daily_df.loc[mask].ffill().bfill()

        return daily_df

    # Process each dataframe to daily frequency
    processed_dfs = {}
    for name, df in dfs.items():
        try:
            processed_dfs[name] = process_single_df(df)
            print(f"{name} processed to daily frequency")
        except Exception as e:
            print(f"Error processing {name}: {str(e)}")
            continue

    if not processed_dfs:
        raise ValueError("No data series could be processed to daily frequency")

    # Merge all dataframes
    result = pd.concat(processed_dfs.values(), axis=1)

    # Generate data quality reports
    print("\nMissing data report:")
    missing_data = result.isnull().sum()
    print(missing_data[missing_data > 0])

    print("\nData availability percentage (after first valid data point):")
    for column in result.columns:
        first_valid = result[column].first_valid_index()
        if first_valid is not None:
            valid_period = result.loc[first_valid:, column]
            availability = (valid_period.count() / len(valid_period)) * 100
            print(f"{column}: {availability:.2f}%")

    # Resample to quarterly (quarter-end) frequency using mean. 'QE' is the
    # current alias; pandas versions that predate it reject it with a
    # ValueError, in which case the legacy 'Q' alias is used.
    try:
        quarterly = result.resample('QE')
    except ValueError:
        quarterly = result.resample('Q')

    return quarterly.mean()

# Data Processing

In [7]:
# FRED series identifiers grouped by the folder holding their CSV exports.
# Dict and list order define the order of `files`, and therefore the column
# order of the merged dataset.
SERIES_BY_FOLDER = {
    "Economic Growth": ["GDP", "INDPRO", "RSXFS", "W790RC1Q027SBEA"],
    "Inflation": ["CPIAUCSL", "M2SL", "PCE", "PPIACO"],
    "Job Market": ["CES0500000003", "JTSJOL", "UNRATE"],
    "Financial Conditions": ["COMREPUSQ159N", "MSPUS", "T10Y2Y", "NASDAQCOM"],
}

# os.path.join uses the host's separator: the resulting strings are identical
# to the previous hard-coded backslash paths on Windows, and the files are
# also found on other platforms.
files = [
    os.path.join(folder, f"{series_id}.csv")
    for folder, series_ids in SERIES_BY_FOLDER.items()
    for series_id in series_ids
]
rate_file = os.path.join("Interest Rate", "FEDFUNDS.csv")

In [8]:
# Build the merged quarterly dataset from every configured series
full_data = process_economic_data(files, rate_file)

# Headline facts: covered period, row count and column names
print("\nDataset info:")
print(f"Date range: {full_data.index.min()} to {full_data.index.max()}")
print(f"Number of rows: {len(full_data)}")
print("\nColumns in dataset:")
print(full_data.columns.tolist())

# Number of rows without a value, per column
print("\nMissing values per column:")
print(full_data.isna().sum())

# First and last index label holding a non-missing value, per column
print("\nData availability by series:")
for col in full_data.columns:
    series = full_data[col]
    first_valid, last_valid = series.first_valid_index(), series.last_valid_index()
    print(f"\n{col}:")
    print(f"First available data: {first_valid}")
    print(f"Last available data: {last_valid}")

# Preview of the earliest rows
print("\nFirst few rows of data:")
print(full_data.head())

Successfully loaded: GDP as GDP
Successfully loaded: INDPRO as Industrial_Production
Successfully loaded: RSXFS as Retail_Sales
Successfully loaded: W790RC1Q027SBEA as Wage_and_Salary
Successfully loaded: CPIAUCSL as Consumer_Price_Index
Successfully loaded: M2SL as Money_Supply_M2
Successfully loaded: PCE as Personal_Consumption
Successfully loaded: PPIACO as Producer_Price_Index
Successfully loaded: CES0500000003 as Average_Hourly_Earnings
Successfully loaded: JTSJOL as Job_Openings
Successfully loaded: UNRATE as Unemployment_Rate
Successfully loaded: COMREPUSQ159N as Commercial_Real_Estate_Prices
Successfully loaded: MSPUS as Median_House_Price
Successfully loaded: T10Y2Y as Treasury_Spread_10Y_2Y
Successfully loaded: NASDAQCOM as Nasdaq_Composite
Successfully loaded: FEDFUNDS as Federal_Funds_Rate
GDP processed to daily frequency
INDPRO processed to daily frequency
RSXFS processed to daily frequency
W790RC1Q027SBEA processed to daily frequency
CPIAUCSL processed to daily frequency


  valid_period = valid_period.fillna(method='ffill').fillna(method='bfill')
  result = result.resample('Q').mean()



Dataset info:
Date range: 1913-03-31 00:00:00 to 2024-12-31 00:00:00
Number of rows: 448

Columns in dataset:
['GDP', 'Industrial_Production', 'Retail_Sales', 'Wage_and_Salary', 'Consumer_Price_Index', 'Money_Supply_M2', 'Personal_Consumption', 'Producer_Price_Index', 'Average_Hourly_Earnings', 'Job_Openings', 'Unemployment_Rate', 'Commercial_Real_Estate_Prices', 'Median_House_Price', 'Treasury_Spread_10Y_2Y', 'Nasdaq_Composite', 'Federal_Funds_Rate']

Missing values per column:
GDP                              136
Industrial_Production             24
Retail_Sales                     316
Wage_and_Salary                  188
Consumer_Price_Index             136
Money_Supply_M2                  184
Personal_Consumption             184
Producer_Price_Index               0
Average_Hourly_Earnings          372
Job_Openings                     351
Unemployment_Rate                140
Commercial_Real_Estate_Prices    368
Median_House_Price               200
Treasury_Spread_10Y_2Y           2

In [10]:
# Write the cleaned dataset to "<save_dir>/DATACLEAN.csv", creating the
# folder first if it does not exist yet
save_dir = r"Clean Data"
save_path = os.path.join(save_dir, "DATACLEAN.csv")
os.makedirs(save_dir, exist_ok=True)
full_data.to_csv(save_path)

# Confirm what was written: location, size on disk and frame dimensions
print(f"\nData saved to: {save_path}")
print(f"File size: {os.path.getsize(save_path) / (1024*1024):.2f} MB")
print(f"Number of rows: {len(full_data)}")
print(f"Number of columns: {len(full_data.columns)}")


Data saved to: Clean Data\DATACLEAN.csv
File size: 0.07 MB
Number of rows: 448
Number of columns: 16
