# 01B — Index-Macro Alignment

**Purpose**: Align monthly macro data with monthly sector returns, handle missing months correctly

**Inputs**:
- `india_indices_monthly.parquet`
- `rbi_macro_all_long.parquet`
- `series_registry.parquet`

**Output**: `macro_sector_monthly_matrix.parquet`

---

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

PROCESSED_PATH = Path('../data_processed')

# Load data
indices_monthly = pd.read_parquet(PROCESSED_PATH / 'india_indices_monthly.parquet')
rbi_macro_long = pd.read_parquet(PROCESSED_PATH / 'rbi_macro_all_long.parquet')
series_registry = pd.read_parquet(PROCESSED_PATH / 'series_registry.parquet')

print(f"Indices: {indices_monthly.shape}")
print(f"RBI Macro: {rbi_macro_long.shape}")
print(f"Series Registry: {len(series_registry)}")

Indices: (140, 21)
RBI Macro: (41884, 5)
Series Registry: 80


## 1. Calculate Monthly Returns for Indices

In [2]:
# Calculate monthly returns
index_returns = indices_monthly.pct_change() * 100
index_returns = index_returns.dropna(how='all')

print(f"Index returns shape: {index_returns.shape}")
print(f"Date range: {index_returns.index.min().date()} to {index_returns.index.max().date()}")
index_returns.tail()

Index returns shape: (139, 21)
Date range: 2015-02-28 to 2026-12-31


Unnamed: 0_level_0,NIFTY_100,NIFTY_200,NIFTY_500,NIFTY_50,NIFTY_ALPHA_50,NIFTY_AUTO,NIFTY_BANK,NIFTY_COMMODITIES,NIFTY_CONSR_DURBL,NIFTY_CONSUMPTION,...,NIFTY_ENERGY,NIFTY_FIN_SERVICE,NIFTY_FMCG,NIFTY_GS_COMPSITE,NIFTY_HEALTHCARE,NIFTY_IND_DIGITAL,NIFTY_INDIA_MFG,NIFTY_INFRA,NIFTY_IT,NIFTY_LARGEMID250
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2026-06-30,-0.183929,-0.185414,-0.199306,-0.272759,0.271839,0.097598,0.123576,-0.375894,0.117515,-0.069607,...,-0.557281,0.336248,0.226137,0.095837,1.847878,-0.297472,0.005424,-1.182833,0.546052,-0.191611
2026-07-31,-0.100527,0.000684,0.025406,-0.144965,0.207521,-0.79506,-0.212165,-0.314427,1.694873,-0.201635,...,-0.238918,-0.328322,-0.127236,0.003279,0.435779,1.01687,-0.034459,-0.503558,1.873219,0.163712
2026-08-31,-1.191854,-1.3328,-1.401114,-1.009535,-1.935006,-0.966981,-0.507327,-2.403901,-0.783427,-0.761639,...,-2.890796,-0.648935,-0.904123,-0.099677,-1.377586,-1.556807,-1.729624,-1.880647,-1.986363,-1.540081
2026-09-30,-0.79225,-0.791205,-0.898434,-0.747966,-1.484933,-1.151858,-0.728724,-0.743331,-1.142685,-1.059507,...,-0.922411,-1.049775,-1.079039,0.000656,-0.921765,-0.366343,-0.721371,-0.853107,0.281907,-0.83508
2026-12-31,0.39786,0.315581,0.183659,0.416418,-0.233415,-0.270226,0.335772,1.324316,-0.257826,0.441893,...,0.186723,0.498136,0.586119,0.209069,-0.375544,-0.228129,0.264342,0.317092,-0.102427,0.114108


## 2. Convert RBI Macro to Wide Format (Monthly)

In [3]:
# Get usable RBI series from registry
usable_rbi_series = series_registry[series_registry['source'] == 'RBI']['series_name'].tolist()
print(f"Usable RBI series: {len(usable_rbi_series)}")

# Filter macro data
macro_filtered = rbi_macro_long[
    rbi_macro_long['series_name'].isin(usable_rbi_series)
].copy()

print(f"Filtered macro observations: {len(macro_filtered)}")

Usable RBI series: 66
Filtered macro observations: 30009


In [4]:
def resample_to_monthly(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert long-format macro data to wide-format monthly.
    For each series, take the last value of each month.
    """
    # Ensure Date is datetime
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    
    # For each series, resample to month-end
    monthly_dfs = []
    
    for series_name in df['series_name'].unique():
        series_data = df[df['series_name'] == series_name][['Date', 'value']].copy()
        series_data = series_data.set_index('Date').sort_index()
        
        # Resample to month-end, taking last value
        monthly = series_data.resample('ME').last()
        monthly.columns = [series_name]
        monthly_dfs.append(monthly)
    
    # Combine all series
    if monthly_dfs:
        combined = pd.concat(monthly_dfs, axis=1)
        return combined
    return pd.DataFrame()

# Convert to monthly wide format
macro_monthly_wide = resample_to_monthly(macro_filtered)
print(f"Macro monthly wide shape: {macro_monthly_wide.shape}")
print(f"Date range: {macro_monthly_wide.index.min().date()} to {macro_monthly_wide.index.max().date()}")

Macro monthly wide shape: (899, 66)
Date range: 1951-03-31 to 2026-01-31


## 3. Apply Transformations (YoY, MoM, etc.)

In [5]:
def apply_transformations(df: pd.DataFrame, registry: pd.DataFrame) -> pd.DataFrame:
    """
    Apply transformations based on series registry.
    """
    transformed = pd.DataFrame(index=df.index)
    
    for col in df.columns:
        # Look up transformation in registry
        series_info = registry[registry['series_name'] == col]
        
        if len(series_info) > 0:
            transform = series_info['transformation'].iloc[0]
        else:
            transform = 'Level'
        
        series = df[col]
        
        if 'YoY' in str(transform):
            # Year-over-year % change
            transformed[f"{col}_YoY"] = series.pct_change(12) * 100
        elif 'MoM' in str(transform):
            # Month-over-month % change
            transformed[f"{col}_MoM"] = series.pct_change(1) * 100
        elif 'Δ' in str(transform) or 'delta' in str(transform).lower():
            # First difference (for rates in bps)
            transformed[f"{col}_Level"] = series
            transformed[f"{col}_Delta"] = series.diff() * 100  # Convert to bps
        else:
            # Keep as level
            transformed[f"{col}_Level"] = series
    
    return transformed

# Apply transformations
macro_transformed = apply_transformations(macro_monthly_wide, series_registry)
print(f"Transformed macro shape: {macro_transformed.shape}")
print(f"Sample columns: {list(macro_transformed.columns[:5])}")

Transformed macro shape: (899, 74)
Sample columns: ['NSE S&P CNX NIFTY_Level', 'BSE BANKEX_Level', 'REPO RATE (OVERNIGHT)_Level', 'REPO RATE (OVERNIGHT)_Delta', 'REVERSE REPO RATE (OVERNIGHT)_Level']


## 4. Align Date Indices

In [6]:
# Find overlapping date range
common_start = max(index_returns.index.min(), macro_transformed.index.min())
common_end = min(index_returns.index.max(), macro_transformed.index.max())

print(f"Overlapping period: {common_start.date()} to {common_end.date()}")

# Filter both to common range
index_aligned = index_returns[(index_returns.index >= common_start) & (index_returns.index <= common_end)]
macro_aligned = macro_transformed[(macro_transformed.index >= common_start) & (macro_transformed.index <= common_end)]

print(f"Index aligned: {index_aligned.shape}")
print(f"Macro aligned: {macro_aligned.shape}")

Overlapping period: 2015-02-28 to 2026-01-31
Index aligned: (132, 21)
Macro aligned: (132, 74)


## 5. Handle Missing Months

In [7]:
# Create complete monthly date range
full_date_range = pd.date_range(start=common_start, end=common_end, freq='ME')
print(f"Full date range: {len(full_date_range)} months")

# Reindex both DataFrames to complete range
index_complete = index_aligned.reindex(full_date_range)
macro_complete = macro_aligned.reindex(full_date_range)

# Check missing data
index_missing = (index_complete.isna().sum() / len(index_complete) * 100).round(1)
macro_missing = (macro_complete.isna().sum() / len(macro_complete) * 100).round(1)

print(f"\nIndex missing (avg): {index_missing.mean():.1f}%")
print(f"Macro missing (avg): {macro_missing.mean():.1f}%")

Full date range: 132 months

Index missing (avg): 0.3%
Macro missing (avg): 10.4%


In [8]:
# Forward-fill macro data (macro conditions persist until new data)
macro_complete = macro_complete.ffill(limit=3)  # Fill up to 3 months

# For indices, we don't forward-fill (missing = no trading)
# But we can drop columns with too much missing data (>30%)
usable_indices = index_missing[index_missing < 30].index.tolist()
usable_macro = macro_missing[macro_missing < 30].index.tolist()

print(f"Usable index columns: {len(usable_indices)} / {len(index_missing)}")
print(f"Usable macro columns: {len(usable_macro)} / {len(macro_missing)}")

Usable index columns: 21 / 21
Usable macro columns: 64 / 74


## 6. Create Combined Matrix

In [9]:
# Combine index returns and macro variables
# Add prefix to distinguish
index_prefixed = index_complete[usable_indices].copy()
index_prefixed.columns = [f"IDX_{c}" for c in index_prefixed.columns]

macro_prefixed = macro_complete[usable_macro].copy()
macro_prefixed.columns = [f"MACRO_{c}" for c in macro_prefixed.columns]

# Merge
combined_matrix = pd.concat([index_prefixed, macro_prefixed], axis=1)

print(f"Combined matrix shape: {combined_matrix.shape}")
print(f"Index columns: {len([c for c in combined_matrix.columns if c.startswith('IDX_')])}")
print(f"Macro columns: {len([c for c in combined_matrix.columns if c.startswith('MACRO_')])}")

Combined matrix shape: (132, 85)
Index columns: 21
Macro columns: 64


## 7. Add Global Variables (Monthly)

In [10]:
try:
    # Load global data
    global_rates = pd.read_parquet(PROCESSED_PATH / 'global_rates_monthly.parquet')
    
    # Align to common range
    global_rates_aligned = global_rates.reindex(full_date_range)
    
    # Calculate changes
    global_rates_delta = global_rates_aligned.diff()
    global_rates_delta.columns = [f"GLOBAL_{c}_Delta" for c in global_rates_delta.columns]
    
    global_rates_aligned.columns = [f"GLOBAL_{c}" for c in global_rates_aligned.columns]
    
    # Add to combined matrix
    combined_matrix = pd.concat([combined_matrix, global_rates_aligned, global_rates_delta], axis=1)
    
    print(f"Added global rates. New shape: {combined_matrix.shape}")
except Exception as e:
    print(f"Note: Global data not yet available ({e})")

Added global rates. New shape: (132, 93)


## 8. Data Quality Summary

In [11]:
# Final data quality check
quality_summary = pd.DataFrame({
    'Column': combined_matrix.columns,
    'Type': ['Index' if c.startswith('IDX_') else 'Macro' if c.startswith('MACRO_') else 'Global' 
             for c in combined_matrix.columns],
    'Non-null': combined_matrix.notna().sum(),
    'Missing%': (combined_matrix.isna().sum() / len(combined_matrix) * 100).round(1)
})

print("Quality by type:")
quality_summary.groupby('Type').agg(
    columns=('Column', 'count'),
    avg_missing=('Missing%', 'mean')
).round(1)

Quality by type:


Unnamed: 0_level_0,columns,avg_missing
Type,Unnamed: 1_level_1,Unnamed: 2_level_1
Global,8,0.4
Index,21,0.3
Macro,64,6.3


## 9. Export Combined Matrix

In [12]:
# Save combined matrix
output_path = PROCESSED_PATH / 'macro_sector_monthly_matrix.parquet'
combined_matrix.to_parquet(output_path)

print(f"✓ Saved: {output_path}")
print(f"  Shape: {combined_matrix.shape}")
print(f"  Date range: {combined_matrix.index.min().date()} to {combined_matrix.index.max().date()}")
print(f"  Size: {output_path.stat().st_size / 1024:.1f} KB")

✓ Saved: ..\data_processed\macro_sector_monthly_matrix.parquet


  Shape: (132, 93)
  Date range: 2015-02-28 to 2026-01-31
  Size: 143.2 KB


## 10. Validation Complete ✓

**Output produced:**
- `macro_sector_monthly_matrix.parquet` — Aligned monthly dataset

**Contents:**
- `IDX_*` columns: Monthly sector index returns
- `MACRO_*` columns: Monthly macro variables (transformed)
- `GLOBAL_*` columns: Global rates and changes

**LAYER 1 FREQUENCY & ALIGNMENT COMPLETE**

**Next notebook:** `02A_macro_to_sector_relationship_map.ipynb`