# Step 5: Compute Moving Targets Signal

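This notebook computes the Moving Targets (MT) signal for each firm-quarter:
- $MT_t = |T_{t-4} \setminus T_t| \,/\, |T_{t-4}|$, where $T_t$ is the set of normalized targets for quarter $t$; i.e. the fraction of targets from four quarters ago that are missing in the current quarter (undefined when there is no $t-4$ observation or $T_{t-4}$ is empty)
- Also computes a persistent MT (the same ratio restricted to targets present in both $t-4$ and $t-8$) and a simple complexity measure (the number of targets at $t-4$)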


In [14]:
import pandas as pd
import numpy as np
from pathlib import Path
import json

# Project layout: configs and intermediate data hang off a single base directory
BASE_DIR = Path('/Users/david/Desktop/MATH-GA 2707/Moving Target')
CONFIG_DIR = BASE_DIR.joinpath('configs')
INTERMEDIATE_DIR = BASE_DIR.joinpath('data', 'intermediate')

# Parse the base configuration file
with (CONFIG_DIR / 'base.json').open('r') as config_file:
    config = json.load(config_file)

# Convert every value under config['data'] into a Path object
config['data'] = {name: Path(location) for name, location in config['data'].items()}

# Read the firm-quarter target panel from the configured location
df_panel = pd.read_parquet(config['data']['targets_panel'])

# Convert targets_norm from numpy array/list to set for set operations
# When parquet is read, lists may become numpy arrays
def to_set(x):
    """Convert a ``targets_norm`` cell into a set of targets.

    Parameters
    ----------
    x : object
        The raw cell value. NumPy arrays, lists, tuples, sets and frozensets
        are treated as collections of targets.

    Returns
    -------
    set
        The targets as a set. A ``set`` input is returned as-is (not copied).
        Anything that is not one of the recognised collection types
        (``None``, NaN, a bare string, a number, ...) yields an empty set.
    """
    if isinstance(x, set):
        return x
    if isinstance(x, np.ndarray):
        # tolist() turns NumPy scalars into plain Python objects before hashing
        return set(x.tolist())
    if isinstance(x, (list, tuple, frozenset)):
        # Tuples/frozensets are collections too; mapping them to an empty set
        # would silently discard their targets.
        return set(x)
    # Deliberately do not iterate strings: a bare string is not a target list.
    return set()

# Materialise each row's targets as a Python set so set algebra can be used later
df_panel['targets_set'] = df_panel['targets_norm'].apply(to_set)

# Short summary of what was loaded
n_observations = len(df_panel)
print(f"Loaded {n_observations} firm-quarter observations")

column_names = list(df_panel.columns)
print(f"Columns: {column_names}")

fiscal_years = df_panel['fyearq']
print(f"Date range: {fiscal_years.min()} to {fiscal_years.max()}")

n_with_targets = df_panel['n_targets'].gt(0).sum()
print(f"Firm-quarters with targets: {n_with_targets}")


Loaded 53395 firm-quarter observations
Columns: ['firm_quarter_id', 'firm_id', 'ticker', 'call_date', 'fyearq', 'fqtr', 'targets_norm', 'n_targets', 'targets_set']
Date range: 2010 to 2024
Firm-quarters with targets: 30124


In [15]:
# Sanity-check the container types using the first row of the panel
print("Data verification:")
first_norm = df_panel['targets_norm'].iloc[0]
print(f"  Sample targets_norm type: {type(first_norm)}")
first_set = df_panel['targets_set'].iloc[0]
print(f"  Sample targets_set type: {type(first_set)}")
print(f"  Sample targets_set size: {len(first_set)}")

# Preview the first five firm-quarters that have at least one target
has_targets = df_panel['n_targets'] > 0
df_with_targets = df_panel.loc[has_targets].head(5)
if len(df_with_targets):
    print(f"\nSample firm-quarters with targets:")
    for _, sample in df_with_targets.iterrows():
        sample_targets = sample['targets_set']
        print(f"  {sample['ticker']} {sample['fyearq']}Q{sample['fqtr']}: {len(sample_targets)} targets")
        preview = list(sample_targets)[:5]  # at most five targets per row
        print(f"    Targets: {preview}")

df_panel.head()

Data verification:
  Sample targets_norm type: <class 'numpy.ndarray'>
  Sample targets_set type: <class 'set'>
  Sample targets_set size: 4

Sample firm-quarters with targets:
  AA 2010Q1: 4 targets
    Targets: ['cash on hand', 'debt to cap', 'cost of goods sold', 'sg&a percentage of sales']
  AAL 2010Q1: 1 targets
    Targets: ['traffic load factor']
  ABG 2010Q1: 1 targets
    Targets: ['earnings per share']
  ACN 2010Q1: 2 targets
    Targets: ['free cash flow', 'earnings per share']
  ADBE 2010Q1: 1 targets
    Targets: ['operating model targets']


Unnamed: 0,firm_quarter_id,firm_id,ticker,call_date,fyearq,fqtr,targets_norm,n_targets,targets_set
0,AA_2010_Q1,AA,AA,2010-04-12,2010,1,"[debt to cap, cash on hand, cost of goods sold...",4,"{cash on hand, debt to cap, cost of goods sold..."
1,AAL_2010_Q1,AAL,AAL,2010-04-21,2010,1,[traffic load factor],1,{traffic load factor}
2,AAPL_2010_Q1,AAPL,AAPL,2010-01-25,2010,1,[],0,{}
3,ABG_2010_Q1,ABG,ABG,2010-05-02,2010,1,[earnings per share],1,{earnings per share}
4,ABT_2010_Q1,ABT,ABT,2010-04-21,2010,1,[],0,{}


In [16]:
# Sort by firm and quarter so firms are processed in a deterministic order and
# each firm's rows are chronological
df_panel = df_panel.sort_values(['firm_id', 'fyearq', 'fqtr']).reset_index(drop=True)

# Compute Moving Targets signal
# MT_t = len(T_{t-4} - T_t) / len(T_{t-4})
# This measures the fraction of targets from 4 quarters ago that are missing in current quarter

LAG_QUARTERS = 4  # compare against the same fiscal quarter one year earlier

# Column order of the output frame (also used so an empty panel still yields
# a frame with the expected columns)
SIGNAL_COLUMNS = [
    'firm_id', 'ticker', 'fyearq', 'fqtr', 'call_date', 'firm_quarter_id',
    'n_targets', 'n_targets_lag4', 'MT', 'MT_persistent',
    'n_persistent_targets', 'complexity',
]


def _quarters_back(year, qtr, n_quarters):
    """Return the (fiscal year, fiscal quarter) lying n_quarters before (year, qtr)."""
    qtr = qtr - n_quarters
    # Handle year rollover
    while qtr <= 0:
        qtr += 4
        year -= 1
    return year, qtr


def _dropped_fraction(reference, current):
    """Return the share of `reference` targets absent from `current` (NaN if `reference` is empty)."""
    if not reference:
        return np.nan
    return len(reference - current) / len(reference)


print("Computing Moving Targets signal...")
print(f"Processing {df_panel['firm_id'].nunique()} firms...")

results = []
firms_with_valid_mt = set()
total_valid_mt = 0
total_processed = 0

unique_firms = df_panel['firm_id'].unique()
print(f"Total firms to process: {len(unique_firms)}")

# One pass over the panel: groupby(sort=False) yields firms in order of first
# appearance (i.e. the sorted order above) and keeps row order within a firm,
# so the output rows come out in the same order as the sorted panel.
for firm_idx, (firm_id, firm_data) in enumerate(df_panel.groupby('firm_id', sort=False)):
    if (firm_idx + 1) % 100 == 0:
        print(f"  Processed {firm_idx + 1}/{len(unique_firms)} firms...")

    # Index this firm's target sets by (fiscal year, fiscal quarter) so each lag
    # lookup is a dict access instead of a scan of the firm's rows.
    # setdefault keeps the first row if a (year, quarter) pair is duplicated.
    targets_by_quarter = {}
    for year, qtr, targets in zip(firm_data['fyearq'], firm_data['fqtr'], firm_data['targets_set']):
        targets_by_quarter.setdefault((year, qtr), targets)

    firm_rows = zip(
        firm_data['ticker'],
        firm_data['fyearq'],
        firm_data['fqtr'],
        firm_data['call_date'],
        firm_data['firm_quarter_id'],
        firm_data['targets_set'],
    )
    for ticker, fyearq_t, fqtr_t, call_date, firm_quarter_id, T_t in firm_rows:
        # Locate quarters t-4 and t-8; a missing observation is treated as an
        # empty target set, which makes the corresponding ratio NaN below.
        lag4_key = _quarters_back(fyearq_t, fqtr_t, LAG_QUARTERS)
        lag8_key = _quarters_back(*lag4_key, LAG_QUARTERS)
        T_t_minus_4 = targets_by_quarter.get(lag4_key, set())
        T_t_minus_8 = targets_by_quarter.get(lag8_key, set())

        # NOTE: n_targets_lag4 (and hence 'complexity') is 0 both when the t-4
        # quarter has no targets and when no t-4 row exists for the firm.
        n_targets_lag4 = len(T_t_minus_4)

        # MT: share of t-4 targets that are missing in the current quarter.
        # NOTE: a current quarter with no targets at all gives MT = 1.0 whenever
        # the t-4 quarter had any.
        MT = _dropped_fraction(T_t_minus_4, T_t)

        # Persistent targets: present in both t-4 and t-8
        persistent_targets = T_t_minus_4 & T_t_minus_8
        MT_persistent = _dropped_fraction(persistent_targets, T_t)

        # Track valid MT values for debugging
        if not np.isnan(MT):
            total_valid_mt += 1
            firms_with_valid_mt.add(firm_id)

        total_processed += 1

        results.append({
            'firm_id': firm_id,
            'ticker': ticker,
            'fyearq': fyearq_t,
            'fqtr': fqtr_t,
            'call_date': call_date,
            'firm_quarter_id': firm_quarter_id,
            'n_targets': len(T_t),
            'n_targets_lag4': n_targets_lag4,
            'MT': MT,
            'MT_persistent': MT_persistent,
            'n_persistent_targets': len(persistent_targets),
            'complexity': n_targets_lag4  # Simple complexity measure
        })

df_signal = pd.DataFrame(results, columns=SIGNAL_COLUMNS)

# Guard the percentage against an empty panel (would otherwise divide by zero)
n_signal_rows = len(df_signal)
valid_pct = 100 * total_valid_mt / n_signal_rows if n_signal_rows else 0.0

print(f"\n✓ Computed MT signal for {n_signal_rows} firm-quarters")
print(f"  Valid MT observations: {total_valid_mt:,} ({valid_pct:.1f}%)")
print(f"  Firms with at least one valid MT: {len(firms_with_valid_mt):,}")
if total_valid_mt > 0:
    print(f"  Average MT (valid only): {df_signal['MT'].mean():.4f}")
    print(f"  MT range: [{df_signal['MT'].min():.4f}, {df_signal['MT'].max():.4f}]")


Computing Moving Targets signal...
Processing 1472 firms...
Total firms to process: 1472
  Processed 100/1472 firms...
  Processed 200/1472 firms...
  Processed 300/1472 firms...
  Processed 400/1472 firms...
  Processed 500/1472 firms...
  Processed 600/1472 firms...
  Processed 700/1472 firms...
  Processed 800/1472 firms...
  Processed 900/1472 firms...
  Processed 1000/1472 firms...
  Processed 1100/1472 firms...
  Processed 1200/1472 firms...
  Processed 1300/1472 firms...
  Processed 1400/1472 firms...

✓ Computed MT signal for 53395 firm-quarters
  Valid MT observations: 24,133 (45.2%)
  Firms with at least one valid MT: 1,411
  Average MT (valid only): 0.8624
  MT range: [0.0000, 1.0000]


In [17]:
# Summary statistics for the plain MT signal
mt_values = df_signal['MT']
print("Moving Targets Signal Statistics:")
print(f"  Valid MT observations: {mt_values.notna().sum()}")
print(f"  Average MT: {mt_values.mean():.4f}")
print(f"  Median MT: {mt_values.median():.4f}")
print(f"  MT distribution:")
print(mt_values.describe())

# Summary statistics for the persistent variant
mt_persistent_values = df_signal['MT_persistent']
print(f"\n  Valid Persistent MT observations: {mt_persistent_values.notna().sum()}")
print(f"  Average Persistent MT: {mt_persistent_values.mean():.4f}")

# Write the signal to the location configured under config['data']
output_file = config['data']['firm_quarter_signal']
df_signal.to_parquet(output_file, engine='pyarrow', index=False)
print(f"\nSaved MT signal to: {output_file}")


Moving Targets Signal Statistics:
  Valid MT observations: 24133
  Average MT: 0.8624
  Median MT: 1.0000
  MT distribution:
count    24133.000000
mean         0.862414
std          0.283573
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: MT, dtype: float64

  Valid Persistent MT observations: 4793
  Average Persistent MT: 0.5727

Saved MT signal to: /Users/david/Desktop/MATH-GA 2707/Moving Target/data/intermediate/firm_quarter_signal.parquet
