In [17]:
#FIND TRADABLE RELATIONSHIPS
#TRADABLE PAIRS

import pandas as pd
import os
import sys

# Setup paths
sys.path.append(os.path.abspath(os.path.join('..')))
from src.statistics import get_clusters, find_cointegrated_pairs

# --- Configuration ---
raw_dir = '../data/raw'              # scanned for "<sector>_prices.csv" files
processed_dir = '../data/processed'  # holds "<sector>_returns.csv" inputs and the output checkpoint

# Time filter: only rows whose index label falls between these two timestamps
# are used for clustering and cointegration testing.
TRAIN_START_DATE = pd.Timestamp('2021-01-01')
TRAIN_END_DATE = pd.Timestamp('2023-12-31')

# Value forwarded to get_clusters(eps=...).
# NOTE(review): presumably a neighbourhood radius for the clustering step -
# confirm against src.statistics before tuning.
CLUSTER_EPS = 0.25

PRICE_SUFFIX = "_prices.csv"

# The checkpoint lives next to the returns files, so it follows processed_dir
# if that setting is ever changed.
output_path = os.path.join(processed_dir, '01_cointegrated_pairs.csv')

# One DataFrame of cointegrated pairs per sector that produced any.
all_valid_pairs = []

print("PART 1: MINING FOR PAIRS (Clustering and Cointegration)")
print(f"TIME FILTER ACTIVE: Only analyzing data from {TRAIN_START_DATE.date()} to {TRAIN_END_DATE.date()}")
print("(Training period: 2021-2023 | Test period: 2024-2025)\n")

# os.listdir returns entries in arbitrary order; sorting keeps the log and the
# row order of the saved checkpoint reproducible between runs and machines.
for filename in sorted(os.listdir(raw_dir)):
    if not filename.endswith(PRICE_SUFFIX):
        continue

    # Strip only the trailing suffix (str.replace would also remove the same
    # text if it appeared earlier in the file name).
    sector_name = filename[:-len(PRICE_SUFFIX)]
    print(f"\n--- Sector: {sector_name.upper()} ---")

    # Load Data
    try:
        price_path = os.path.join(raw_dir, filename)
        returns_path = os.path.join(processed_dir, f"{sector_name}_returns.csv")

        if not os.path.exists(returns_path):
            print("   Missing returns file. Skipping.")
            continue

        prices = pd.read_csv(price_path, index_col=0, parse_dates=True)
        returns = pd.read_csv(returns_path, index_col=0, parse_dates=True)
    except Exception as e:
        # Best effort: one unreadable sector must not abort the whole scan.
        print(f"   Load Error: {e}")
        continue

    # Apply time filter. Sort first: label slicing on an unsorted DatetimeIndex
    # can raise KeyError, and on a descending index it returns nothing.
    prices = prices.sort_index().loc[TRAIN_START_DATE:TRAIN_END_DATE]
    returns = returns.sort_index().loc[TRAIN_START_DATE:TRAIN_END_DATE]

    if prices.empty or returns.empty:
        print("   No data inside the training window. Skipping.")
        continue

    # Step 1: Cluster
    print("   Clustering...")
    found_clusters = get_clusters(returns, eps=CLUSTER_EPS)
    if found_clusters.empty:
        print("   No clusters found. Skipping.")
        continue

    # Step 2: Cointegration
    print("   Cointegration Test...")
    sector_pairs = find_cointegrated_pairs(prices, found_clusters)

    if not sector_pairs.empty:
        # Clean up names
        sector_pairs = sector_pairs.rename(columns={'Stock A': 'Stock1', 'Stock B': 'Stock2'})
        # Prefix cluster ids with the sector so they stay distinguishable once
        # all sectors are concatenated into one table.
        sector_pairs['Cluster'] = sector_name + "_" + sector_pairs['Cluster'].astype(str)
        sector_pairs['Sector'] = sector_name

        all_valid_pairs.append(sector_pairs)
        print(f"   -> Found {len(sector_pairs)} cointegrated pairs.")

# SAVE CHECKPOINT
if all_valid_pairs:
    master_list = pd.concat(all_valid_pairs, ignore_index=True)
    master_list.to_csv(output_path, index=False)
    print(f"\nSUCCESS! Saved checkpoint to: {output_path}")
else:
    print("\nNo pairs found.")
    if os.path.exists(output_path):
        # Nothing was written in this run, so anything on disk is stale.
        print(f"WARNING: {output_path} is left over from an earlier run and was NOT updated.")

PART 1: MINING FOR PAIRS (Clustering and Cointegration)
TIME FILTER ACTIVE: Only analyzing data from 2021-01-01 to 2023-12-31
(Training period: 2021-2023 | Test period: 2024-2025)


--- Sector: COMMUNICATION_SERVICES ---
   Clustering...
   Cointegration Test...
Testing Cointegration on 1 clusters...
  > Checking Cluster 0 (2 stocks)...

--- Sector: CONSUMER_DISCRETIONARY ---
   Clustering...
   Cointegration Test...
Testing Cointegration on 8 clusters...
  > Checking Cluster 0 (2 stocks)...
  > Checking Cluster 1 (4 stocks)...
  > Checking Cluster 2 (3 stocks)...
  > Checking Cluster 3 (3 stocks)...
  > Checking Cluster 4 (2 stocks)...
  > Checking Cluster 5 (2 stocks)...
  > Checking Cluster 6 (3 stocks)...
  > Checking Cluster 7 (2 stocks)...
   -> Found 2 cointegrated pairs.

--- Sector: CONSUMER_STAPLES ---
   Clustering...
   Cointegration Test...
Testing Cointegration on 2 clusters...
  > Checking Cluster 0 (6 stocks)...
  > Checking Cluster 1 (6 stocks)...
   -> Found 8 cointeg

In [16]:
# List the sector labels present in the saved pairs checkpoint.
# The mining cell only stores sectors that produced at least one cointegrated
# pair, so a sector missing from this list may still have been processed.
# Labels are derived from the "<sector>_prices.csv" file names, so every label
# printed here corresponds to a price file in the raw data folder.
pairs_csv = '../data/processed/01_cointegrated_pairs.csv'
master_list = pd.read_csv(pairs_csv)
sector_labels = master_list['Sector'].unique()
print(sector_labels)

['consumer_discretionary' 'consumer_staples' 'financials' 'health_care'
 'industrials' 'information_technology' 'materials' 'tech' 'utilities']
