# Data ETL

## Imports and Setup

In [29]:
import pandas as pd
import openpyxl
from datetime import datetime
import os

# Path to your Excel file
file_path = 'data/raw/Bloomberg_Data.xlsx'

# Define which sheets use column C instead of column B
use_column_c = [
    "US_Building_Permits",
    "US _BP_Single_Housing",
    "US_Housing_Start",
    "US_New_Home_Sales",
    "US_Existing_Home _Sales",
    "US Existing_Single_Home_Sales",
    "CAD_Housing_Start"
]

# Define sheets to ignore
sheets_to_ignore = [
    "US_Population_Growth_Rate_Bloom"
]

## Function Definitions

In [30]:
def normalize_date(date):
    """Normalize date to end of month"""
    if pd.isna(date):
        return None
    
    # Convert to datetime if not already
    if not isinstance(date, datetime):
        date = pd.to_datetime(date)
    
    # Get the last day of the month
    year = date.year
    month = date.month
    
    # Create end of month date
    if month == 12:
        end_of_month = datetime(year, 12, 31)
    else:
        end_of_month = datetime(year, month + 1, 1) - pd.Timedelta(days=1)
    
    return end_of_month.date()

def extract_sheet_data(file_path, sheet_name, use_col_c):
    """Extract data from a specific sheet"""
    # Determine which column to use
    data_column = 'C' if sheet_name in use_col_c else 'B'
    
    # Read the sheet starting from row 7 (index 6 in pandas)
    df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
    
    # Extract dates from column A and values from the appropriate column
    # Row 7 in Excel is index 6 in pandas (0-indexed)
    dates = df.iloc[6:, 0]  # Column A, starting from row 7
    
    if data_column == 'C':
        values = df.iloc[6:, 2]  # Column C
    else:
        values = df.iloc[6:, 1]  # Column B
    
    # Create a temporary dataframe
    temp_df = pd.DataFrame({
        'date': dates,
        'value': values
    })
    
    # Remove rows where value is NaN or empty
    temp_df = temp_df.dropna(subset=['value'])
    
    # Remove rows where date is NaN
    temp_df = temp_df.dropna(subset=['date'])
    
    # Normalize dates to end of month
    temp_df['date'] = temp_df['date'].apply(normalize_date)
    
    # Remove any rows where date normalization failed
    temp_df = temp_df.dropna(subset=['date'])
    
    # Rename value column to sheet name
    temp_df = temp_df.rename(columns={'value': sheet_name})
    
    return temp_df

## Data Extraction and Merging

In [31]:
# Load the Excel file to get all sheet names
excel_file = pd.ExcelFile(file_path)
sheet_names = excel_file.sheet_names

print(f"Found {len(sheet_names)} sheets in the Excel file\n")

# Extract data from all sheets
all_dataframes = []

for sheet_name in sheet_names:
    # Skip ignored sheets
    if sheet_name in sheets_to_ignore:
        print(f"Skipping: {sheet_name} (ignored)")
        continue
    
    print(f"Processing: {sheet_name}...", end=' ')
    try:
        df = extract_sheet_data(file_path, sheet_name, use_column_c)
        all_dataframes.append(df)
        print(f"✓ ({len(df)} data points)")
    except Exception as e:
        print(f"✗ Error: {e}")

# Merge all dataframes on the date column
print("\nMerging all data into master dataframe...")

master_df = all_dataframes[0]
for df in all_dataframes[1:]:
    master_df = master_df.merge(df, on='date', how='outer')

# Sort by date (most recent first)
master_df = master_df.sort_values('date', ascending=False)

# Rename date column to 'Date'
master_df = master_df.rename(columns={'date': 'Date'})

# Reset index
master_df = master_df.reset_index(drop=True)

Found 24 sheets in the Excel file

Processing: USCAD_Exchange_Rate... ✗ Error: Unknown datetime string format, unable to parse: Date, at position 0
Processing: US_CPI... ✓ (239 data points)
Processing: US_GDP... ✓ (119 data points)
Processing: US_Mortgage_Interest_5Y... ✓ (80 data points)
Processing: US_Mortgage_Interest_30Y... ✓ (80 data points)
Processing: US_Building_Permits... ✓ (79 data points)
Processing: US _BP_Single_Housing... ✓ (79 data points)
Processing: US_Housing_Start... ✓ (79 data points)
Processing: US_New_Home_Sales... ✓ (79 data points)
Processing: US_Existing_Home _Sales... ✓ (79 data points)
Processing: US Existing_Single_Home_Sales... ✓ (79 data points)
Processing: US_NAHB_Housing_Market_Index... ✓ (80 data points)
Skipping: US_Population_Growth_Rate_Bloom (ignored)
Processing: US_Population_Growth_Rate_FRED... ✓ (79 data points)
Processing: US_Households_Number... ✓ (76 data points)
Processing: CAD_CPI... ✓ (79 data points)
Processing: CAD_GDP... ✓ (119 data poin

## Data Exploration

In [32]:
print(f"\n{'='*60}")
print(f"Master DataFrame Summary:")
print(f"{'='*60}")
print(f"Total rows: {len(master_df)}")
print(f"Total columns: {len(master_df.columns)} (Date + {len(master_df.columns)-1} variables)")
print(f"Date range: {master_df['Date'].min()} to {master_df['Date'].max()}")
print(f"\nColumns: {', '.join(master_df.columns.tolist())}")



Master DataFrame Summary:
Total rows: 281
Total columns: 23 (Date + 22 variables)
Date range: 1995-12-31 to 2025-09-30

Columns: Date, US_CPI, US_GDP, US_Mortgage_Interest_5Y, US_Mortgage_Interest_30Y, US_Building_Permits, US _BP_Single_Housing, US_Housing_Start, US_New_Home_Sales, US_Existing_Home _Sales, US Existing_Single_Home_Sales, US_NAHB_Housing_Market_Index, US_Population_Growth_Rate_FRED, US_Households_Number, CAD_CPI, CAD_GDP, CAD_Building Permits, CAD_BP_Single_Housing, CAD_Housing_Start, CAD_Mortgage_Interest, CAD_Wholesaler-Distributor, CAD_Tot_Invent_Wood, CAD_Export_Price_Lumber


In [33]:
master_df.head()

Unnamed: 0,Date,US_CPI,US_GDP,US_Mortgage_Interest_5Y,US_Mortgage_Interest_30Y,US_Building_Permits,US _BP_Single_Housing,US_Housing_Start,US_New_Home_Sales,US_Existing_Home _Sales,...,US_Households_Number,CAD_CPI,CAD_GDP,CAD_Building Permits,CAD_BP_Single_Housing,CAD_Housing_Start,CAD_Mortgage_Interest,CAD_Wholesaler-Distributor,CAD_Tot_Invent_Wood,CAD_Export_Price_Lumber
0,2025-09-30,,,5.77,6.31,,,,,,...,,,,,,,6.09,,,
1,2025-08-31,0.4,,,,,,,,,...,,,,,,,,,,
2,2025-07-31,0.2,,,,,,,,,...,,,,,,,,,,
3,2025-06-30,0.3,0.9,6.16,6.77,116083.333333,72166.666667,115166.666667,56333.333333,327500.0,...,,0.2,-0.4,23651.0,3979.0,23659.5,6.09,10227.44,6682487.0,107.8
4,2025-05-31,0.1,,,,,,,,,...,,,,,,,,,,


In [34]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 281 entries, 0 to 280
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Date                            281 non-null    object
 1   US_CPI                          240 non-null    object
 2   US_GDP                          120 non-null    object
 3   US_Mortgage_Interest_5Y         81 non-null     object
 4   US_Mortgage_Interest_30Y        81 non-null     object
 5   US_Building_Permits             80 non-null     object
 6   US _BP_Single_Housing           80 non-null     object
 7   US_Housing_Start                80 non-null     object
 8   US_New_Home_Sales               80 non-null     object
 9   US_Existing_Home _Sales         80 non-null     object
 10  US Existing_Single_Home_Sales   80 non-null     object
 11  US_NAHB_Housing_Market_Index    81 non-null     object
 12  US_Population_Growth_Rate_FRED  79 non-null     ob

In [35]:
master_df.describe()

Unnamed: 0,Date,US_CPI,US_GDP,US_Mortgage_Interest_5Y,US_Mortgage_Interest_30Y,US_Building_Permits,US _BP_Single_Housing,US_Housing_Start,US_New_Home_Sales,US_Existing_Home _Sales,...,US_Households_Number,CAD_CPI,CAD_GDP,CAD_Building Permits,CAD_BP_Single_Housing,CAD_Housing_Start,CAD_Mortgage_Interest,CAD_Wholesaler-Distributor,CAD_Tot_Invent_Wood,CAD_Export_Price_Lumber
count,281,240.0,120.0,81.0,81.0,80.0,80.0,80,80.0,80.0,...,76.0,80.0,120.0,58,58,80.0,81.0,80.0,80,34.0
unique,280,21.0,27.0,73.0,75.0,77.0,73.0,77,73.0,67.0,...,18.0,15.0,91.0,58,58,80.0,33.0,79.0,79,34.0
top,2005-12-31,0.2,0.6,6.11,3.88,123333.333333,61833.333333,105500,59833.333333,454166.666667,...,0.218511,0.2,0.97,23651,3979,23659.5,4.79,3730.99,4856649,107.8
freq,2,50.0,18.0,3.0,2.0,2.0,2.0,2,2.0,3.0,...,8.0,17.0,3.0,1,1,1.0,10.0,2.0,2,1.0


### Save the master dataframe to CSV or Excel

In [39]:
output_dir = 'data/processed'
os.makedirs(output_dir, exist_ok=True)

In [40]:
# Save to CSV
output_file = os.path.join(output_dir, 'bloomberg_master_dataframe.csv')
master_df.to_csv(output_file, index=False)
print(f"Master dataframe saved to: {output_file}")

Master dataframe saved to: data/processed/bloomberg_master_dataframe.csv


In [41]:
# Save to Excel
output_excel = os.path.join(output_dir, 'bloomberg_master_dataframe.xlsx')
master_df.to_excel(output_excel, index=False)
print(f"Master dataframe saved to: {output_excel}")

Master dataframe saved to: data/processed/bloomberg_master_dataframe.xlsx
