In [12]:

import pandas as pd
import os
import sys

# Setup paths
sys.path.append(os.path.abspath(os.path.join('..')))
from src.statistics import get_clusters, find_cointegrated_pairs

# Part 1 of the pipeline: for every sector that has a "<sector>_prices.csv"
# file in raw_dir (and a matching "<sector>_returns.csv" in processed_dir),
# cluster the returns, test the clustered stocks for cointegration, and save
# all pairs found across sectors into a single checkpoint CSV.

# Input/output locations, relative to the notebook's working directory.
raw_dir = '../data/raw'
processed_dir = '../data/processed'

# One DataFrame of cointegrated pairs per sector that produced any.
all_valid_pairs = []

print("PART 1: MINING FOR PAIRS (Clustering + Cointegration)")

# os.listdir() returns entries in arbitrary order; sorting makes the sector
# processing order (and therefore the row order of the saved checkpoint)
# reproducible across runs and machines.
for filename in sorted(os.listdir(raw_dir)):
    if not filename.endswith("_prices.csv"):
        continue

    sector_name = filename.replace("_prices.csv", "")
    print(f"\n--- Sector: {sector_name.upper()} ---")

    # Load Data
    price_path = os.path.join(raw_dir, filename)
    returns_path = os.path.join(processed_dir, f"{sector_name}_returns.csv")

    if not os.path.exists(returns_path):
        print("   Missing returns file. Skipping.")
        continue

    try:
        prices = pd.read_csv(price_path, index_col=0, parse_dates=True)
        returns = pd.read_csv(returns_path, index_col=0, parse_dates=True)
    except Exception as e:
        # Deliberately broad: one unreadable sector file is reported and
        # skipped so that it does not abort the scan of the other sectors.
        print(f"   Load Error: {e}")
        continue

    # Step 1: Cluster
    # NOTE(review): get_clusters is a project helper; it is assumed to return
    # a pandas object (it is checked with `.empty` below) -- confirm.
    print("   Clustering...")
    found_clusters = get_clusters(returns, eps=0.25)
    if found_clusters.empty:
        # Report the skip explicitly instead of moving on silently.
        print("   No clusters found. Skipping.")
        continue

    # Step 2: Cointegration
    print("   Cointegration Test...")
    sector_pairs = find_cointegrated_pairs(prices, found_clusters)

    if not sector_pairs.empty:
        # Clean up names
        sector_pairs = sector_pairs.rename(columns={'Stock A': 'Stock1', 'Stock B': 'Stock2'})
        # Prefix the cluster label with the sector name so labels coming from
        # different sectors stay distinguishable once everything is combined.
        sector_pairs['Cluster'] = sector_name + "_" + sector_pairs['Cluster'].astype(str)
        sector_pairs['Sector'] = sector_name

        all_valid_pairs.append(sector_pairs)
        print(f"   -> Found {len(sector_pairs)} cointegrated pairs.")

# SAVE CHECKPOINT
if all_valid_pairs:
    master_list = pd.concat(all_valid_pairs, ignore_index=True)
    # Built from processed_dir so the checkpoint follows that setting instead
    # of repeating the directory as a second hard-coded literal.
    output_path = os.path.join(processed_dir, '01_cointegrated_pairs.csv')
    master_list.to_csv(output_path, index=False)
    print(f"\nSUCCESS! Saved checkpoint to: {output_path}")
else:
    print("\nNo pairs found.")

PART 1: MINING FOR PAIRS (Clustering + Cointegration)

--- Sector: COMMUNICATION_SERVICES ---
   Clustering...
   Cointegration Test...
Testing Cointegration on 2 clusters...
  > Checking Cluster 0 (3 stocks)...
  > Checking Cluster 1 (2 stocks)...
   -> Found 1 cointegrated pairs.

--- Sector: CONSUMER_DISCRETIONARY ---
   Clustering...
   Cointegration Test...
Testing Cointegration on 9 clusters...
  > Checking Cluster 0 (2 stocks)...
  > Checking Cluster 1 (3 stocks)...
  > Checking Cluster 2 (2 stocks)...
  > Checking Cluster 3 (8 stocks)...
  > Checking Cluster 4 (2 stocks)...
  > Checking Cluster 5 (2 stocks)...
  > Checking Cluster 6 (2 stocks)...
  > Checking Cluster 7 (2 stocks)...
  > Checking Cluster 8 (2 stocks)...
   -> Found 15 cointegrated pairs.

--- Sector: CONSUMER_STAPLES ---
   Clustering...
   Cointegration Test...
Testing Cointegration on 3 clusters...
  > Checking Cluster 0 (6 stocks)...
  > Checking Cluster 1 (7 stocks)...
  > Checking Cluster 2 (2 stocks)...
  