In [7]:
import pandas as pd
import os
import sys

sys.path.append(os.path.abspath(os.path.join('..')))
# We ONLY import the metrics function here
from src.statistics import calculate_tradability_metrics

# Locations used by this step: the directory holding the per-sector price
# CSVs, the checkpoint CSV this step reads, and the CSV this step writes.
raw_dir = '../data/raw'
input_path = '../data/processed/01_cointegrated_pairs.csv'
output_path = '../data/processed/02_final_tradable_pairs.csv'

print("PART 2: ANALYZING TRADABILITY (Hurst, Half-Life)")

# 1. Load the Checkpoint
# Without the checkpoint file there is nothing to analyze. Exit with a
# non-zero status so a shell or pipeline running this script sees the run
# as failed rather than as a clean success.
if not os.path.exists(input_path):
    print("Error: Run Part 1 first!")
    sys.exit(1)

pairs_df = pd.read_csv(input_path)
print(f"Loaded {len(pairs_df)} potential pairs to analyze.")

# 2. Group by Sector (Efficiency Trick)
# Instead of opening the price file for every single pair,
# we open the price file once per sector and process all pairs in that sector.
# Each pair that yields metrics contributes one dict (its original columns
# plus the metric columns) to final_results.
final_results = []
grouped = pairs_df.groupby('Sector')

for sector_name, group in grouped:
    print(f"\n--- Analyzing {len(group)} pairs in {sector_name} ---")

    # Load Prices for this sector ONCE
    price_path = os.path.join(raw_dir, f"{sector_name}_prices.csv")
    try:
        prices = pd.read_csv(price_path, index_col=0, parse_dates=True)
    except (OSError, ValueError) as load_error:
        # OSError covers a missing or unreadable file; ValueError covers
        # pandas' empty-data / parser errors and bad encodings. A bare
        # `except:` would also swallow KeyboardInterrupt and SystemExit.
        print(f"   Could not load prices for {sector_name}. Skipping.")
        print(f"   Reason: {load_error}")
        continue

    # Iterate through pairs in this sector (the row label is not needed)
    for _, row in group.iterrows():
        s1 = row['Stock1']
        s2 = row['Stock2']

        # Best-effort per pair: a failure on one pair is reported and the
        # loop moves on to the next pair.
        try:
            # CALL THE FUNCTION
            metrics = calculate_tradability_metrics(prices, s1, s2)

            if metrics:
                # Combine original info + new metrics
                # Turn row into a dict, add metrics, append to list
                full_row = row.to_dict()
                full_row.update(metrics)
                final_results.append(full_row)
            else:
                # A falsy result is treated as "no usable metrics".
                print(f"   Failed physics: {s1}-{s2} (Math Error)")

        except Exception as e:
            print(f"   CRASH on {s1}-{s2}: {e}")

# 3. Save Final Result
# When at least one pair produced metrics, write them to output_path ordered
# by P-Value and then Hurst_Exponent (both ascending) and print a short
# preview; otherwise just report that nothing was found.
if not final_results:
    print("No tradable pairs found.")
else:
    # Sort by quality
    final_df = pd.DataFrame(final_results).sort_values(
        by=['P-Value', 'Hurst_Exponent']
    )

    final_df.to_csv(output_path, index=False)

    banner = "=" * 30
    print("\n" + banner)
    print(f"DONE! Saved {len(final_df)} final pairs to {output_path}")

    # Show only the headline columns for the first few rows.
    preview_columns = ['Stock1', 'Stock2', 'Half_Life', 'Hurst_Exponent']
    print(final_df[preview_columns].head())

PART 2: ANALYZING TRADABILITY (Hurst, Half-Life)
Loaded 104 potential pairs to analyze.

--- Analyzing 2 pairs in consumer_discretionary ---

--- Analyzing 8 pairs in consumer_staples ---

--- Analyzing 21 pairs in financials ---

--- Analyzing 6 pairs in health_care ---

--- Analyzing 2 pairs in industrials ---

--- Analyzing 8 pairs in information_technology ---

--- Analyzing 1 pairs in materials ---

--- Analyzing 37 pairs in tech ---

--- Analyzing 19 pairs in utilities ---

DONE! Saved 104 final pairs to ../data/processed/02_final_tradable_pairs.csv
   Stock1 Stock2  Half_Life  Hurst_Exponent
53   ADSK     IT     128.07          0.3975
51   ADSK   FICO      25.08          0.3522
58   ADSK    PTC      34.99          0.3461
83    TER    ADI      47.83          0.4538
52   ADSK    MSI      20.12          0.3280
