# Step 4: Build Firm-Quarter Target Sets

This notebook:
1. Aggregates targets from calls to firm-quarter level
2. Creates target sets T_t for each firm-quarter
3. Handles multiple calls per quarter (merges their targets into one de-duplicated list and keeps the latest call date)


In [59]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
from collections import Counter

# Project directory layout (all paths hang off BASE_DIR)
BASE_DIR = Path('/Users/david/Desktop/MATH-GA 2707/Moving Target')
CONFIG_DIR = BASE_DIR / 'configs'
INTERMEDIATE_DIR = BASE_DIR / 'data' / 'intermediate'

# Parse the base configuration file
config = json.loads((CONFIG_DIR / 'base.json').read_text())

# Turn every entry under config['data'] into a Path object
# (values are replaced in place; the set of keys is not changed)
for key, raw_path in config['data'].items():
    config['data'][key] = Path(raw_path)

# Read the call-level target extractions referenced by the config
targets_path = config['data']['targets_extracted']
df_targets = pd.read_parquet(targets_path)

n_calls = len(df_targets)
print(f"Loaded {n_calls} call-level target extractions")
print(f"Columns: {list(df_targets.columns)}")


Loaded 53395 call-level target extractions
Columns: ['call_id', 'ticker', 'firm_id', 'call_date', 'fyearq', 'fqtr', 'firm_quarter_id', 'targets_norm', 'targets_raw', 'n_targets']


In [None]:
# Combine targets_norm by firm-quarter and sort by time

def combine_targets_for_group(group):
    """Combine all targets from the calls in a group into one de-duplicated list.

    Parameters
    ----------
    group : mapping-like (e.g. a DataFrame)
        Must support ``group['targets_norm']`` and yield one entry per call.
        Each entry is expected to be an iterable of hashable targets
        (list, tuple, numpy array, ...).

    Returns
    -------
    list
        Every distinct target, in order of first appearance across the calls.

    Notes
    -----
    * ``None`` and non-iterable placeholders (such as a float NaN) are skipped
      instead of raising ``TypeError``.
    * A bare string is treated as a single target rather than being split
      into its characters.
    """
    # A dict keeps insertion order, so it works as an "ordered set":
    # re-inserting an existing key leaves its original position untouched.
    ordered_unique = {}

    for targets_list in group['targets_norm']:
        if isinstance(targets_list, str):
            # Iterating a string would yield characters, never what we want.
            ordered_unique.setdefault(targets_list, None)
            continue

        if targets_list is None or not hasattr(targets_list, '__iter__'):
            # Missing value for this call: contributes no targets.
            continue

        ordered_unique.update(dict.fromkeys(targets_list))

    return list(ordered_unique)

# Aggregate by firm_quarter_id: one output row per firm-quarter
print("Combining targets_norm by firm-quarter...")
df_panel = df_targets.groupby('firm_quarter_id').agg({
    'firm_id': 'first',
    'ticker': 'first',
    'call_date': 'max',  # Use latest call date for the quarter
    'fyearq': 'first',
    'fqtr': 'first',
}).reset_index()

# Merge the targets of every call in the quarter into one list.
# Only the targets_norm column is selected so that apply() does not operate
# on the grouping column (pandas deprecates that and warns about it).
combined_targets = (
    df_targets.groupby('firm_quarter_id')[['targets_norm']]
    .apply(combine_targets_for_group)
)

# Attach by firm_quarter_id rather than by position, so the result does not
# depend on the two groupby calls producing rows in the same order.
df_panel['targets_norm'] = df_panel['firm_quarter_id'].map(combined_targets)

# Sanity checks: exactly one row, and one combined target list, per firm-quarter
assert df_panel['firm_quarter_id'].is_unique, "duplicate firm_quarter_id in panel"
assert len(combined_targets) == len(df_panel), "target lists do not match panel rows"

# Count targets (anything that is not a list counts as zero targets)
df_panel['n_targets'] = df_panel['targets_norm'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Sort by time: first by fiscal year, then by fiscal quarter
df_panel = df_panel.sort_values(['fyearq', 'fqtr', 'ticker']).reset_index(drop=True)

print(f"Combined {len(df_targets)} call-level records into {len(df_panel)} firm-quarter records")
print(f"Firm-quarters with targets: {(df_panel['n_targets'] > 0).sum()}")
print("\nFirst few rows (sorted by time):")
print(df_panel[['ticker', 'fyearq', 'fqtr', 'call_date', 'n_targets']].head(10))

df_panel

Combining targets_norm by firm-quarter...


Combined 53395 call-level records into 53395 firm-quarter records
Firm-quarters with targets: 30124

First few rows (sorted by time):
  ticker  fyearq  fqtr  call_date  n_targets
0     AA    2010     1 2010-04-12          4
1    AAL    2010     1 2010-04-21          1
2   AAPL    2010     1 2010-01-25          0
3    ABG    2010     1 2010-05-02          1
4    ABT    2010     1 2010-04-21          0
5   ACCO    2010     1 2010-04-28          0
6    ACN    2010     1 2009-12-17          2
7   ADBE    2010     1 2010-03-24          1
8    AEP    2010     1 2010-04-29          1
9    AFL    2010     1 2010-04-29          0


  combined_targets = df_targets.groupby('firm_quarter_id').apply(combine_targets_for_group)


Unnamed: 0,firm_quarter_id,firm_id,ticker,call_date,fyearq,fqtr,targets_norm,n_targets
0,AA_2010_Q1,AA,AA,2010-04-12,2010,1,"[debt to cap, cash on hand, cost of goods sold...",4
1,AAL_2010_Q1,AAL,AAL,2010-04-21,2010,1,[traffic load factor],1
2,AAPL_2010_Q1,AAPL,AAPL,2010-01-25,2010,1,[],0
3,ABG_2010_Q1,ABG,ABG,2010-05-02,2010,1,[earnings per share],1
4,ABT_2010_Q1,ABT,ABT,2010-04-21,2010,1,[],0
...,...,...,...,...,...,...,...,...
53390,MUSA_2024_Q4,MUSA,MUSA,2025-02-06,2024,4,[],0
53391,MVIS_2024_Q4,MVIS,MVIS,2025-03-26,2024,4,"[custom development proposals, automotive oem ...",4
53392,MVST_2024_Q4,MVST,MVST,2025-03-31,2024,4,[],0
53393,MXCT_2024_Q4,MXCT,MXCT,2025-03-11,2024,4,[core revenue growth],1


In [61]:
# Save the panel data to disk
print("="*80)
print("Saving results...")
print("="*80)

# Statistics before saving
has_targets = df_panel['n_targets'] > 0
print(f"\nPanel Statistics:")
print(f"  Total firm-quarters: {len(df_panel):,}")
print(f"  Firm-quarters with targets: {has_targets.sum():,}")
print(f"  Firm-quarters with zero targets: {(df_panel['n_targets'] == 0).sum():,}")
print(f"  Average targets per firm-quarter: {df_panel['n_targets'].mean():.2f}")
print(f"  Median targets per firm-quarter: {df_panel['n_targets'].median():.0f}")
print(f"  Max targets in a firm-quarter: {df_panel['n_targets'].max()}")

# Time range: earliest quarter of the first fiscal year, latest quarter of the last
first_year = df_panel['fyearq'].min()
last_year = df_panel['fyearq'].max()
first_qtr = df_panel.loc[df_panel['fyearq'] == first_year, 'fqtr'].min()
last_qtr = df_panel.loc[df_panel['fyearq'] == last_year, 'fqtr'].max()
print(f"\nTime Range:")
print(f"  Earliest: {first_year} Q{first_qtr}")
print(f"  Latest: {last_year} Q{last_qtr}")

# Save to parquet
output_file = config['data']['targets_panel']
print(f"\nSaving to: {output_file}")

# to_parquet does not create missing directories, so make sure the target exists
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# Ensure targets_norm is a list (not set) for parquet storage;
# any other value is stored as an empty list
df_panel_save = df_panel.copy()
df_panel_save['targets_norm'] = df_panel_save['targets_norm'].apply(lambda x: list(x) if isinstance(x, set) else (x if isinstance(x, list) else []))

df_panel_save.to_parquet(output_file, index=False, engine='pyarrow')
print(f"✓ Saved {len(df_panel_save):,} firm-quarter target sets")

# Verify the save: reload the file and actually compare it with what was written
print(f"\nVerifying saved file...")
df_verify = pd.read_parquet(output_file)
assert len(df_verify) == len(df_panel_save), (
    f"row count mismatch: wrote {len(df_panel_save)}, read back {len(df_verify)}"
)
assert list(df_verify.columns) == list(df_panel_save.columns), "column mismatch after reload"
# Note: list columns come back from parquet as numpy arrays (see the printed type below)
assert (df_verify['targets_norm'].map(len) == df_verify['n_targets']).all(), (
    "n_targets does not match the length of the reloaded targets_norm"
)
print(f"✓ Verified: {len(df_verify):,} rows loaded from saved file")
print(f"  Columns: {list(df_verify.columns)}")
print(f"  Sample targets_norm type: {type(df_verify['targets_norm'].iloc[0])}")


Saving results...

Panel Statistics:
  Total firm-quarters: 53,395
  Firm-quarters with targets: 30,124
  Firm-quarters with zero targets: 23,271
  Average targets per firm-quarter: 1.31
  Median targets per firm-quarter: 1
  Max targets in a firm-quarter: 14

Time Range:
  Earliest: 2010 Q1
  Latest: 2024 Q4

Saving to: /Users/david/Desktop/MATH-GA 2707/Moving Target/data/intermediate/targets_panel.parquet
✓ Saved 53,395 firm-quarter target sets

Verifying saved file...
✓ Verified: 53,395 rows loaded from saved file
  Columns: ['firm_quarter_id', 'firm_id', 'ticker', 'call_date', 'fyearq', 'fqtr', 'targets_norm', 'n_targets']
  Sample targets_norm type: <class 'numpy.ndarray'>
