# Data ETL

## Imports and Setup

In [None]:
import pandas as pd
import openpyxl
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import os

# Location of the raw Bloomberg workbook (relative to the working directory)
file_path = "data/raw/Bloomberg_Data.xlsx"

# Sheets whose values are read from column C; every other sheet uses column B.
# NOTE(review): the irregular spaces in some names presumably mirror the
# workbook's tab names exactly - confirm before "tidying" them.
use_column_c = [
    "US_Building_Permits",
    "US _BP_Single_Housing",
    "US_Housing_Start",
    "US_New_Home_Sales",
    "US_Existing_Home _Sales",
    "US Existing_Single_Home_Sales",
    "CAD_Housing_Start",
]

# Sheets that are skipped entirely during extraction
sheets_to_ignore = ["US_Population_Growth_Rate_Bloom"]

## Function Definitions

In [None]:
def normalize_date(date):
    """Normalize a date-like value to the last calendar day of its month.

    Args:
        date: A ``datetime`` (including ``pd.Timestamp``) or any value that
            ``pd.to_datetime`` can parse, such as an ISO date string.
            Missing values (``None``, ``NaN``, ``NaT``) are accepted.

    Returns:
        A ``datetime.date`` for the final day of the input's month, or
        ``None`` when the input is missing or cannot be read as a date.
    """
    if pd.isna(date):
        return None

    if not isinstance(date, datetime):
        # Coerce instead of raising: a single stray non-date cell should
        # produce None for that row rather than abort the whole column.
        date = pd.to_datetime(date, errors='coerce')
        if pd.isna(date):
            return None

    year, month = date.year, date.month

    # December has no "next month" within the same year, so it is handled
    # directly; otherwise step back one day from the 1st of the next month.
    if month == 12:
        end_of_month = datetime(year, 12, 31)
    else:
        end_of_month = datetime(year, month + 1, 1) - pd.Timedelta(days=1)

    return end_of_month.date()

def extract_sheet_data(file_path, sheet_name, use_col_c):
    """Build a two-column (date, value) frame from one worksheet.

    Args:
        file_path: Workbook location, forwarded unchanged to ``pd.read_excel``.
        sheet_name: Worksheet to read; also used as the name of the value
            column in the result.
        use_col_c: Collection of sheet names whose values are taken from
            column C instead of column B.

    Returns:
        A DataFrame with a ``date`` column (passed through ``normalize_date``)
        and one value column named after ``sheet_name``. Rows lacking a
        value or a usable date are removed.
    """
    # Positional index of the value column: C -> 2, B -> 1
    value_pos = 2 if sheet_name in use_col_c else 1

    # The whole sheet is loaded with no header row; slicing happens below
    raw = pd.read_excel(file_path, sheet_name=sheet_name, header=None)

    # Excel row 7 is positional index 6; column A (position 0) holds the dates
    body = raw.iloc[6:]
    frame = pd.DataFrame({
        'date': body.iloc[:, 0],
        'value': body.iloc[:, value_pos],
    })

    # Discard rows with no value or no date before normalizing
    frame = frame.dropna(subset=['value', 'date'])

    # Normalize each date, then drop rows for which normalization gave nothing
    frame['date'] = frame['date'].apply(normalize_date)
    frame = frame.dropna(subset=['date'])

    # Expose the value column under the sheet's name
    return frame.rename(columns={'value': sheet_name})

## Data Extraction and Merging

In [None]:
# Open the workbook once and close it deterministically when extraction is done
with pd.ExcelFile(file_path) as excel_file:
    sheet_names = excel_file.sheet_names

    print(f"Found {len(sheet_names)} sheets in the Excel file\n")

    # Extract data from all sheets
    all_dataframes = []

    for sheet_name in sheet_names:
        # Skip ignored sheets
        if sheet_name in sheets_to_ignore:
            print(f"Skipping: {sheet_name} (ignored)")
            continue

        print(f"Processing: {sheet_name}...", end=' ')
        try:
            # extract_sheet_data forwards its first argument to pd.read_excel,
            # which accepts an open ExcelFile; reusing the handle avoids
            # re-opening and re-parsing the workbook for every sheet.
            df = extract_sheet_data(excel_file, sheet_name, use_column_c)
        except Exception as e:
            # Best effort by design: report the failing sheet and carry on
            print(f"✗ Error: {e}")
        else:
            all_dataframes.append(df)
            print(f"✓ ({len(df)} data points)")

# Fail with an explicit message instead of an IndexError on all_dataframes[0]
if not all_dataframes:
    raise ValueError(
        f"No sheets could be extracted from {file_path}; nothing to merge"
    )

# Merge all dataframes on the date column
print("\nMerging all data into master dataframe...")

master_df = all_dataframes[0]
for df in all_dataframes[1:]:
    # Outer join keeps every date that appears in at least one sheet
    master_df = master_df.merge(df, on='date', how='outer')

# Sort by date (most recent first), expose the key as 'Date', renumber rows
master_df = (
    master_df
    .sort_values('date', ascending=False)
    .rename(columns={'date': 'Date'})
    .reset_index(drop=True)
)

## Data Exploration

In [None]:
# Print a short overview of the merged frame: size, date span, column names
summary_rule = '=' * 60
column_names = master_df.columns.tolist()

print(f"\n{summary_rule}")
print("Master DataFrame Summary:")
print(summary_rule)
print(f"Total rows: {len(master_df)}")
print(f"Total columns: {len(column_names)} (Date + {len(column_names) - 1} variables)")
print(f"Date range: {master_df['Date'].min()} to {master_df['Date'].max()}")
print(f"\nColumns: {', '.join(column_names)}")


In [None]:
# Preview the first rows of the merged frame (shown as the cell's output)
master_df.head()

In [None]:
# Print column dtypes and non-null counts to see how sparse each series is
master_df.info()

In [None]:
# Summary statistics for the merged frame (shown as the cell's output)
master_df.describe()

### Save the master dataframe to CSV and Excel

In [None]:
# Directory that receives the exported files; exist_ok=True keeps this cell
# safe to re-run when the directory already exists.
output_dir = 'data/processed'
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Save to CSV
# index=False leaves the positional row index out of the exported file
output_file = os.path.join(output_dir, 'bloomberg_master_dataframe.csv')
master_df.to_csv(output_file, index=False)
print(f"Master dataframe saved to: {output_file}")

In [None]:
# Save to Excel
# index=False leaves the positional row index out of the exported sheet
output_excel = os.path.join(output_dir, 'bloomberg_master_dataframe.xlsx')
master_df.to_excel(output_excel, index=False)
print(f"Master dataframe saved to: {output_excel}")

### Feature Importance using RandomForest (for regression)

In [None]:
# Choose a target variable (replace with your actual target)
target_col = master_df.columns[1]  # Example: first variable after 'Date'

# Coerce every candidate feature to numeric so stray text cells become NaN
# instead of breaking the mean imputation.
features = master_df.drop(columns=['Date', target_col]).apply(
    pd.to_numeric, errors='coerce'
)

# SimpleImputer discards columns that are entirely missing, which would leave
# X with fewer columns than `features` and mislabel the importances below.
# Dropping them here keeps names and importances aligned.
features = features.dropna(axis=1, how='all')
if features.shape[1] == 0:
    raise ValueError(
        f"No usable feature columns left for predicting {target_col}"
    )

target = pd.to_numeric(master_df[target_col], errors='coerce')

# Impute missing values for simplicity
# NOTE(review): mean-filling the target fabricates labels for rows without an
# observation; consider dropping those rows for a real analysis.
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(features)
y = target.fillna(target.mean())

# Fit RandomForest (fixed seed for reproducible importances)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# Feature importance plot, ordered from most to least important
importances = rf.feature_importances_
feat_names = features.columns
indices = importances.argsort()[::-1]

# matplotlib is used directly: seaborn (`sns`) is never imported in this file
plt.figure(figsize=(10, 6))
plt.barh(feat_names[indices].tolist(), importances[indices])
plt.gca().invert_yaxis()  # barh draws the first entry at the bottom
plt.title(f'Feature Importance for predicting {target_col}')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()