# Step 3: Target Extraction (GPT API)

This notebook extracts "targets" from earnings call transcripts using the OpenAI GPT API. It:
1. Uses OpenAI GPT API (gpt-4o-mini by default) for target extraction
2. Extracts financial and strategic targets (revenue, margins, growth metrics, etc.)
3. Normalizes target strings to canonical forms (lowercase, consistent naming)
4. Uses parallel processing (20 workers) for fast extraction
5. Implements checkpointing to resume interrupted extractions
6. Outputs normalized target sets per call with diagnostics



In [None]:
import json
import os
import re
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from getpass import getpass
from pathlib import Path
from threading import Lock

import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# Load config
# The project root defaults to the original local path but can be overridden
# with the MOVING_TARGET_BASE_DIR environment variable so the notebook can run
# on another machine without editing this cell.
BASE_DIR = Path(os.getenv('MOVING_TARGET_BASE_DIR', '/Users/david/Desktop/MATH-GA 2707/Moving Target'))
CONFIG_DIR = BASE_DIR / 'configs'
INTERMEDIATE_DIR = BASE_DIR / 'data' / 'intermediate'

with open(CONFIG_DIR / 'base.json', 'r') as f:
    config = json.load(f)

# Every entry under config['data'] is a file location; convert them to Path objects
for key in config['data']:
    config['data'][key] = Path(config['data'][key])

# Initialize OpenAI client
# Key lookup order:
#   1. OPENAI_API_KEY environment variable (the name the messages below refer to)
#   2. GPT_API environment variable (legacy name, kept for backward compatibility)
#   3. config['api']['openai_api_key']
#   4. interactive prompt
api_key = os.getenv('OPENAI_API_KEY') or os.getenv('GPT_API')
if not api_key:
    # Try to get from config if it exists
    api_key = config.get('api', {}).get('openai_api_key', None)
    if not api_key:
        # getpass (instead of input) keeps the typed key out of the saved notebook output
        api_key = getpass("Enter your OpenAI API key (or set OPENAI_API_KEY environment variable): ").strip()

if not api_key:
    raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable or provide it in config.")

client = OpenAI(api_key=api_key)
print("✓ OpenAI client initialized")

# Configuration for GPT extraction
GPT_MODEL = config.get('nlp', {}).get('gpt_model', 'gpt-4o-mini')  # Use cheaper model by default
MAX_TOKENS = 1000  # For target extraction response
# Note: gpt-5 models only support temperature=1 (default), older models can use 0.0
TEMPERATURE = 0.0  # For older models (gpt-4, etc.), will be adjusted for gpt-5 models
print(f"Using model: {GPT_MODEL}")
print(f"Max completion tokens: {MAX_TOKENS}")


✓ OpenAI client initialized
Using model: gpt-4o-mini
Max completion tokens: 1000


In [None]:
# Load transcripts
# Polars is used for the parquet read when it can be imported; any ImportError
# raised along that path falls back to a plain pandas read.
try:
    import polars as pl
    USE_POLARS = True
    print("Using Polars for faster loading...")
    df_pl = pl.read_parquet(config['data']['transcripts_clean'])
    df = df_pl.to_pandas()
    print(f"✓ Loaded {len(df):,} transcripts with Polars")
except ImportError:
    USE_POLARS = False
    df = pd.read_parquet(config['data']['transcripts_clean'])
    print(f"✓ Loaded {len(df):,} transcripts with pandas")

# Focus on presentation text (as per paper); rows without a presentation
# section fall back to the full transcript text.
df['text_to_process'] = df['text_presentation'].fillna(df['text_full'])

# Keep only rows whose text is longer than 100 characters
has_enough_text = df['text_to_process'].str.len() > 100
df = df[has_enough_text].copy()
print(f"After filtering: {len(df):,} transcripts with sufficient text")

# Cap the text sent to the API at a fixed number of characters to limit token cost
MAX_TEXT_LENGTH = 3000  # Characters - reduced to save API costs
df['text_to_process'] = df['text_to_process'].str.slice(stop=MAX_TEXT_LENGTH)
print(f"Texts truncated to max {MAX_TEXT_LENGTH} characters for API efficiency")


In [56]:
# System prompt for consistent target extraction.
# Sent as the "system" message on every request made by extract_targets_with_gpt.
# It defines what counts as a "target", asks for lowercase canonical names, and
# requires a JSON object with a "targets" array, which the response parser reads.
# NOTE(review): this text is runtime input to the model; editing it changes
# extraction results, so re-run the extraction after any change.
SYSTEM_PROMPT = """You are a financial analyst extracting "targets" from earnings call transcripts.

A "target" is a specific metric, goal, or objective that management discusses. Examples:
- Financial metrics: revenue, sales, earnings, margin, EBITDA, EPS, ROE
- Product targets: iPhone sales, cloud revenue, subscription growth
- Strategic goals: market share, customer acquisition, geographic expansion
- Operational metrics: same-store sales, organic growth, free cash flow

Extract ALL targets mentioned in the transcript as much as possible. For each target:
1. Use the CANONICAL form (e.g., "revenue" not "revenues", "gross margin" not "gross margins")
2. Use lowercase
3. Remove articles and unnecessary words
4. Be specific but concise (around 5 words)

Return ONLY a JSON object with a "targets" array of normalized target strings. Example:
{"targets": ["revenue", "gross margin", "iphone sales", "subscription growth", "ebitda"]}

Important: Be consistent with naming. Use standard financial terms."""

# Filler words stripped (once) from the start and end of a normalized target
_TARGET_STOPWORDS = frozenset(['the', 'a', 'an', 'for', 'of', 'in', 'on', 'at', 'to', 'and', 'or'])
# Hard cap on the number of words kept in a normalized target
_MAX_TARGET_WORDS = 10


def _parse_targets_payload(content):
    """Return the raw target list found in the model's response text.

    Accepts either ``{"targets": [...]}`` or a bare JSON array. If the text is
    not valid JSON, falls back to the first ``[...]`` span found in it.
    Anything else yields an empty list.
    """
    try:
        result = json.loads(content)
    except json.JSONDecodeError:
        # Fallback: try to extract array from text
        array_match = re.search(r'\[.*?\]', content, re.DOTALL)
        return json.loads(array_match.group()) if array_match else []

    if isinstance(result, dict) and 'targets' in result:
        targets = result['targets']
        if isinstance(targets, str):
            # A lone string would otherwise be iterated character by character
            return [targets]
        return targets if isinstance(targets, list) else []
    if isinstance(result, list):
        return result
    return []


def _normalize_targets(targets, min_words, max_chars):
    """Normalize, filter and de-duplicate raw target strings.

    Each string is lowercased, whitespace-collapsed, dropped if it has fewer
    than ``min_words`` words, truncated to ``_MAX_TARGET_WORDS`` words, dropped
    if the result is longer than ``max_chars`` characters, and finally stripped
    of one leading and one trailing stopword. Non-string entries are ignored.

    Returns ``(normalized, raw_pairs)``: parallel lists in first-seen order,
    where each raw pair is ``('GPT', original_string)``.
    """
    normalized = []
    raw_pairs = []
    seen = set()

    for target in targets:
        if not isinstance(target, str):
            continue

        words = target.lower().split()
        if len(words) < min_words:
            continue
        words = words[:_MAX_TARGET_WORDS]
        if len(' '.join(words)) > max_chars:
            continue

        # Remove common stopwords at start/end
        if words and words[0] in _TARGET_STOPWORDS:
            words = words[1:]
        if words and words[-1] in _TARGET_STOPWORDS:
            words = words[:-1]

        target_norm = ' '.join(words)
        if target_norm and target_norm not in seen:
            seen.add(target_norm)
            normalized.append(target_norm)
            raw_pairs.append(('GPT', target))

    return normalized, raw_pairs


def extract_targets_with_gpt(text, max_retries=3):
    """Extract targets from text using GPT API with consistent normalization.

    Args:
        text: Transcript text. Anything that is not a string of at least 100
            characters (None, NaN, short text) yields empty results without
            calling the API.
        max_retries: Maximum number of API attempts. Failed attempts are
            retried with exponential backoff (1s, 2s, ...).

    Returns:
        ``(targets_norm, targets_raw)``: the de-duplicated normalized target
        strings and the matching ``('GPT', original_string)`` pairs.

    Raises:
        Exception: the API error is re-raised immediately when it reports an
            exhausted quota (retrying cannot help), and after the last attempt
            for any other error. Failures are raised rather than returned as
            ``([], [])`` so callers can record them instead of storing a
            transcript as having zero targets.
    """
    if not isinstance(text, str) or len(text) < 100:
        return [], []

    user_prompt = f"""Extract all financial and strategic targets mentioned in this earnings call transcript.

Transcript:
{text}

Return a JSON object with a "targets" array of normalized target strings. Use canonical forms (e.g., "revenue" not "revenues")."""

    for attempt in range(max_retries):
        try:
            api_params = {
                "model": GPT_MODEL,
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_prompt}
                ],
                "response_format": {"type": "json_object"}  # Force JSON output
            }

            # Newer models take max_completion_tokens and only support the
            # default temperature; older models take max_tokens + temperature.
            if 'gpt-5' in GPT_MODEL or 'o1' in GPT_MODEL:
                api_params["max_completion_tokens"] = MAX_TOKENS
            else:
                api_params["max_tokens"] = MAX_TOKENS
                api_params["temperature"] = TEMPERATURE

            response = client.chat.completions.create(**api_params)

            content = response.choices[0].message.content
            if content is None:
                # Treated like any other failed attempt: retry, then raise
                raise ValueError("GPT response contained no message content")

            targets = _parse_targets_payload(content.strip())
            return _normalize_targets(
                targets,
                config['nlp']['min_target_length'],
                config['nlp']['max_target_length'],
            )

        except Exception as e:
            error_str = str(e)
            # Only an exhausted quota is fatal. A plain HTTP 429 rate limit is
            # transient and goes through the normal backoff below.
            if 'quota' in error_str.lower():
                print(f"⚠️  Quota exceeded. Please check your OpenAI billing. Error: {error_str[:200]}")
                raise  # Re-raise to stop processing
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
                continue
            print(f"Error extracting targets: {e}")
            raise

    # Only reached when max_retries <= 0 (no attempt was made)
    return [], []

print("✓ GPT target extraction function defined")


✓ GPT target extraction function defined


In [57]:
# Extract targets from all transcripts using GPT API (PARALLEL PROCESSING)
# Check for existing checkpoint
checkpoint_file = INTERMEDIATE_DIR / 'targets_extraction_checkpoint.parquet'
results = []

# Load checkpoint if exists
if checkpoint_file.exists():
    print(f"Loading checkpoint from {checkpoint_file}...")
    df_checkpoint = pd.read_parquet(checkpoint_file)
    # Parquet hands list columns back as numpy arrays. Restore the Python
    # list / tuple shapes that freshly extracted rows have, so resumed rows are
    # treated the same way downstream (e.g. isinstance(..., list) checks).
    if 'targets_norm' in df_checkpoint.columns:
        df_checkpoint['targets_norm'] = df_checkpoint['targets_norm'].apply(
            lambda targets: [] if targets is None else list(targets)
        )
    if 'targets_raw' in df_checkpoint.columns:
        df_checkpoint['targets_raw'] = df_checkpoint['targets_raw'].apply(
            lambda pairs: [] if pairs is None else [tuple(pair) for pair in pairs]
        )
    processed_call_ids = set(df_checkpoint['call_id'].unique())
    results = df_checkpoint.to_dict('records')
    print(f"  Resuming from checkpoint: {len(results):,} transcripts already processed")
else:
    processed_call_ids = set()
    print("Starting fresh extraction...")

# Create call_id if it doesn't exist (ticker + fiscal year + fiscal quarter)
if 'call_id' not in df.columns:
    df['call_id'] = df['ticker'].astype(str) + '_' + df['fyearq'].astype(str) + '_Q' + df['fqtr'].astype(str)

# Filter to unprocessed transcripts
df_to_process = df[~df['call_id'].isin(processed_call_ids)].copy()
print(f"  Remaining to process: {len(df_to_process):,} transcripts")

# Parallel processing configuration
MAX_WORKERS = 20  # Number of parallel API calls (adjust based on your API limits)
CHECKPOINT_INTERVAL = 500  # Save checkpoint every N transcripts

print(f"\nExtracting targets using {GPT_MODEL}...")
print(f"Parallel workers: {MAX_WORKERS}")
print(f"Checkpoint interval: {CHECKPOINT_INTERVAL} transcripts")

# Locks guarding the shared result / failure lists
results_lock = Lock()
failed_requests = []
failed_lock = Lock()

def process_single_transcript(row_data):
    """Run target extraction for one transcript row.

    Args:
        row_data: ``(index, row)`` pair as produced by ``DataFrame.iterrows()``.
            The index is not used.

    Returns:
        A result record (identifiers, normalized targets, raw targets and the
        target count) on success. On any exception the failure is appended to
        ``failed_requests`` (error text capped at 200 characters) and ``None``
        is returned.
    """
    _, row = row_data
    text = row['text_to_process']
    call_id = row['call_id']

    try:
        targets_norm, targets_raw = extract_targets_with_gpt(text)

        ticker = row['ticker']
        record = {
            'call_id': call_id,
            'ticker': ticker,
            # firm_id / firm_quarter_id fall back to ticker-based values when absent
            'firm_id': row.get('firm_id', ticker),
            'call_date': row['call_date'],
            'fyearq': row['fyearq'],
            'fqtr': row['fqtr'],
            'firm_quarter_id': row.get('firm_quarter_id', f"{ticker}_{row['fyearq']}_Q{row['fqtr']}"),
            'targets_norm': targets_norm,
            'targets_raw': targets_raw,
            'n_targets': len(targets_norm),
        }
        return record
    except Exception as exc:
        with failed_lock:
            failure = {
                'call_id': call_id,
                'ticker': row['ticker'],
                'error': str(exc)[:200],
            }
            failed_requests.append(failure)
        return None

# Process in parallel
def _write_checkpoint(records):
    """Persist `records` to checkpoint_file without ever leaving a partial file.

    The data is written to a sibling temp file first and then moved over the
    checkpoint, so an interruption mid-write cannot corrupt an existing checkpoint.
    """
    tmp_file = checkpoint_file.with_name(checkpoint_file.name + '.tmp')
    pd.DataFrame(records).to_parquet(tmp_file, index=False, engine='pyarrow')
    tmp_file.replace(checkpoint_file)


start_time = time.time()
rows_to_process = list(df_to_process.iterrows())

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit all tasks
    future_to_row = {
        executor.submit(process_single_transcript, row_data): row_data
        for row_data in rows_to_process
    }

    # Process completed tasks with progress bar
    completed = 0
    for future in tqdm(as_completed(future_to_row), total=len(rows_to_process), desc="Extracting targets"):
        try:
            result = future.result()
            if result:
                with results_lock:
                    results.append(result)

            completed += 1

            # Save checkpoint periodically
            if completed % CHECKPOINT_INTERVAL == 0:
                with results_lock:
                    if results:
                        _write_checkpoint(results)
                        elapsed = time.time() - start_time
                        rate = completed / elapsed if elapsed > 0 else 0
                        print(f"\n  Checkpoint saved: {len(results):,} transcripts processed ({rate:.1f} transcripts/sec)")
        except Exception as e:
            # Only reached if the worker itself raised; record it like any other failure
            row_data = future_to_row[future]
            with failed_lock:
                failed_requests.append({
                    'call_id': row_data[1]['call_id'],
                    'ticker': row_data[1]['ticker'],
                    'error': str(e)[:200]
                })

# Final save
df_targets = pd.DataFrame(results)
elapsed_total = time.time() - start_time
print(f"\n✓ Extraction complete!")
print(f"  Total transcripts processed: {len(df_targets):,}")
print(f"  Time elapsed: {elapsed_total/60:.1f} minutes")
if elapsed_total > 0:
    print(f"  Average rate: {len(df_to_process)/elapsed_total:.1f} transcripts/sec")

if failed_requests:
    print(f"  Failed requests: {len(failed_requests)}")
    df_failed = pd.DataFrame(failed_requests)
    failed_file = INTERMEDIATE_DIR / 'targets_extraction_failed.parquet'
    df_failed.to_parquet(failed_file, index=False, engine='pyarrow')
    print(f"  Failed requests saved to: {failed_file}")
    # Some transcripts are still missing: flush everything gathered so far and
    # keep the checkpoint, so re-running this cell only retries the failures
    # instead of starting the whole extraction again.
    if results:
        _write_checkpoint(results)
        print(f"  Checkpoint kept for retry of failed transcripts: {checkpoint_file}")
elif checkpoint_file.exists():
    # Remove checkpoint file after successful completion
    checkpoint_file.unlink()
    print(f"  Checkpoint file cleaned up")


Starting fresh extraction...
  Remaining to process: 53,395 transcripts

Extracting targets using gpt-4o-mini...
Parallel workers: 20
Checkpoint interval: 500 transcripts


Python(4361) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Extracting targets:   1%|          | 508/53395 [00:23<40:06, 21.98it/s]  


  Checkpoint saved: 500 transcripts processed (16.5 transcripts/sec)


Extracting targets:   2%|▏         | 1003/53395 [00:45<42:19, 20.63it/s] 


  Checkpoint saved: 1,000 transcripts processed (18.9 transcripts/sec)


Extracting targets:   3%|▎         | 1501/53395 [01:10<34:50, 24.82it/s]  


  Checkpoint saved: 1,500 transcripts processed (19.3 transcripts/sec)


Extracting targets:   4%|▍         | 2004/53395 [01:34<30:30, 28.07it/s]  


  Checkpoint saved: 2,000 transcripts processed (19.7 transcripts/sec)


Extracting targets:   5%|▍         | 2504/53395 [01:57<36:31, 23.22it/s]  


  Checkpoint saved: 2,500 transcripts processed (20.1 transcripts/sec)


Extracting targets:   6%|▌         | 3000/53395 [02:18<37:12, 22.57it/s]  


  Checkpoint saved: 3,000 transcripts processed (20.5 transcripts/sec)


Extracting targets:   7%|▋         | 3504/53395 [02:39<39:08, 21.24it/s]


  Checkpoint saved: 3,500 transcripts processed (21.0 transcripts/sec)


Extracting targets:   8%|▊         | 4005/53395 [02:58<31:33, 26.08it/s]


  Checkpoint saved: 4,000 transcripts processed (21.5 transcripts/sec)


Extracting targets:   8%|▊         | 4504/53395 [03:19<41:22, 19.70it/s]


  Checkpoint saved: 4,500 transcripts processed (21.8 transcripts/sec)


Extracting targets:   9%|▉         | 5005/53395 [03:39<36:58, 21.81it/s]


  Checkpoint saved: 5,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  10%|█         | 5502/53395 [04:03<45:05, 17.70it/s]  


  Checkpoint saved: 5,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  11%|█         | 6003/53395 [04:24<24:38, 32.05it/s]  


  Checkpoint saved: 6,000 transcripts processed (22.1 transcripts/sec)


Extracting targets:  12%|█▏        | 6500/53395 [04:47<37:30, 20.83it/s]  


  Checkpoint saved: 6,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  13%|█▎        | 7002/53395 [05:12<33:53, 22.81it/s]  


  Checkpoint saved: 7,000 transcripts processed (21.9 transcripts/sec)


Extracting targets:  14%|█▍        | 7505/53395 [05:33<25:59, 29.43it/s]  


  Checkpoint saved: 7,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  15%|█▍        | 8004/53395 [05:55<38:37, 19.58it/s]


  Checkpoint saved: 8,000 transcripts processed (22.1 transcripts/sec)


Extracting targets:  16%|█▌        | 8505/53395 [06:19<28:10, 26.55it/s]


  Checkpoint saved: 8,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  17%|█▋        | 9005/53395 [06:39<23:08, 31.97it/s]


  Checkpoint saved: 9,000 transcripts processed (22.1 transcripts/sec)


Extracting targets:  18%|█▊        | 9500/53395 [07:02<45:18, 16.15it/s]  


  Checkpoint saved: 9,500 transcripts processed (22.1 transcripts/sec)


Extracting targets:  19%|█▊        | 10005/53395 [07:25<31:30, 22.95it/s]


  Checkpoint saved: 10,000 transcripts processed (22.1 transcripts/sec)


Extracting targets:  20%|█▉        | 10507/53395 [07:47<27:21, 26.13it/s]


  Checkpoint saved: 10,500 transcripts processed (22.1 transcripts/sec)


Extracting targets:  21%|██        | 11001/53395 [08:09<55:58, 12.62it/s]


  Checkpoint saved: 11,000 transcripts processed (22.1 transcripts/sec)


Extracting targets:  22%|██▏       | 11505/53395 [08:31<31:11, 22.38it/s]  


  Checkpoint saved: 11,500 transcripts processed (22.2 transcripts/sec)


Extracting targets:  22%|██▏       | 12000/53395 [08:55<36:38, 18.83it/s]


  Checkpoint saved: 12,000 transcripts processed (22.1 transcripts/sec)


Extracting targets:  23%|██▎       | 12504/53395 [09:18<30:57, 22.02it/s]


  Checkpoint saved: 12,500 transcripts processed (22.1 transcripts/sec)


Extracting targets:  24%|██▍       | 13003/53395 [09:42<28:54, 23.29it/s]


  Checkpoint saved: 13,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  25%|██▌       | 13503/53395 [10:09<38:23, 17.32it/s]


  Checkpoint saved: 13,500 transcripts processed (21.9 transcripts/sec)


Extracting targets:  26%|██▌       | 14000/53395 [10:30<20:41, 31.74it/s]


  Checkpoint saved: 14,000 transcripts processed (21.9 transcripts/sec)


Extracting targets:  27%|██▋       | 14508/53395 [10:55<21:22, 30.32it/s]


  Checkpoint saved: 14,500 transcripts processed (21.9 transcripts/sec)


Extracting targets:  28%|██▊       | 15006/53395 [11:18<26:01, 24.58it/s]


  Checkpoint saved: 15,000 transcripts processed (21.9 transcripts/sec)


Extracting targets:  29%|██▉       | 15505/53395 [11:41<25:20, 24.92it/s]


  Checkpoint saved: 15,500 transcripts processed (21.9 transcripts/sec)


Extracting targets:  30%|██▉       | 16007/53395 [12:04<31:31, 19.76it/s]


  Checkpoint saved: 16,000 transcripts processed (21.9 transcripts/sec)


Extracting targets:  31%|███       | 16510/53395 [12:27<19:01, 32.31it/s]


  Checkpoint saved: 16,500 transcripts processed (21.9 transcripts/sec)


Extracting targets:  32%|███▏      | 17002/53395 [12:47<23:21, 25.98it/s]


  Checkpoint saved: 17,000 transcripts processed (21.9 transcripts/sec)


Extracting targets:  33%|███▎      | 17504/53395 [13:12<22:30, 26.58it/s]  


  Checkpoint saved: 17,500 transcripts processed (21.9 transcripts/sec)


Extracting targets:  34%|███▎      | 18007/53395 [13:35<20:40, 28.52it/s]


  Checkpoint saved: 18,000 transcripts processed (21.9 transcripts/sec)


Extracting targets:  35%|███▍      | 18506/53395 [13:57<25:32, 22.77it/s]


  Checkpoint saved: 18,500 transcripts processed (21.9 transcripts/sec)


Extracting targets:  36%|███▌      | 19009/53395 [14:17<17:05, 33.53it/s]


  Checkpoint saved: 19,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  37%|███▋      | 19506/53395 [14:39<26:02, 21.70it/s]


  Checkpoint saved: 19,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  37%|███▋      | 20012/53395 [15:00<19:48, 28.09it/s]


  Checkpoint saved: 20,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  38%|███▊      | 20501/53395 [15:22<43:33, 12.59it/s]


  Checkpoint saved: 20,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  39%|███▉      | 21007/53395 [15:46<17:41, 30.52it/s]


  Checkpoint saved: 21,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  40%|████      | 21505/53395 [16:08<23:01, 23.08it/s]


  Checkpoint saved: 21,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  41%|████      | 22005/53395 [16:29<25:51, 20.23it/s]


  Checkpoint saved: 22,000 transcripts processed (22.1 transcripts/sec)


Extracting targets:  42%|████▏     | 22507/53395 [16:54<23:20, 22.05it/s]


  Checkpoint saved: 22,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  43%|████▎     | 23010/53395 [17:17<14:42, 34.42it/s]


  Checkpoint saved: 23,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  44%|████▍     | 23503/53395 [17:41<33:08, 15.03it/s]


  Checkpoint saved: 23,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  45%|████▍     | 24003/53395 [18:05<21:57, 22.30it/s]


  Checkpoint saved: 24,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  46%|████▌     | 24505/53395 [18:26<21:42, 22.19it/s]


  Checkpoint saved: 24,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  47%|████▋     | 25005/53395 [18:50<18:48, 25.15it/s]


  Checkpoint saved: 25,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  48%|████▊     | 25506/53395 [19:11<22:50, 20.35it/s]


  Checkpoint saved: 25,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  49%|████▊     | 26002/53395 [19:32<17:47, 25.67it/s]


  Checkpoint saved: 26,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  50%|████▉     | 26510/53395 [19:55<13:05, 34.25it/s]


  Checkpoint saved: 26,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  51%|█████     | 27009/53395 [20:18<12:25, 35.38it/s]


  Checkpoint saved: 27,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  52%|█████▏    | 27506/53395 [20:39<19:43, 21.88it/s]


  Checkpoint saved: 27,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  52%|█████▏    | 28005/53395 [21:01<26:21, 16.05it/s]


  Checkpoint saved: 28,000 transcripts processed (22.1 transcripts/sec)


Extracting targets:  53%|█████▎    | 28505/53395 [21:26<22:20, 18.57it/s]


  Checkpoint saved: 28,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  54%|█████▍    | 29006/53395 [21:50<15:11, 26.77it/s]


  Checkpoint saved: 29,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  55%|█████▌    | 29504/53395 [22:14<24:04, 16.53it/s]


  Checkpoint saved: 29,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  56%|█████▌    | 30006/53395 [22:36<15:41, 24.83it/s]


  Checkpoint saved: 30,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  57%|█████▋    | 30510/53395 [23:01<13:19, 28.61it/s]


  Checkpoint saved: 30,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  58%|█████▊    | 31002/53395 [23:23<23:04, 16.17it/s]


  Checkpoint saved: 31,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  59%|█████▉    | 31504/53395 [23:47<25:13, 14.46it/s]


  Checkpoint saved: 31,500 transcripts processed (22.0 transcripts/sec)


Extracting targets:  60%|█████▉    | 32007/53395 [24:10<17:49, 19.99it/s]


  Checkpoint saved: 32,000 transcripts processed (22.0 transcripts/sec)


Extracting targets:  61%|██████    | 32506/53395 [24:34<17:30, 19.88it/s]


  Checkpoint saved: 32,500 transcripts processed (21.9 transcripts/sec)


Extracting targets:  62%|██████▏   | 33008/53395 [24:58<12:05, 28.12it/s]


  Checkpoint saved: 33,000 transcripts processed (21.9 transcripts/sec)


Extracting targets:  63%|██████▎   | 33506/53395 [25:23<15:30, 21.38it/s]


  Checkpoint saved: 33,500 transcripts processed (21.9 transcripts/sec)


Extracting targets:  64%|██████▎   | 34002/53395 [25:48<18:39, 17.32it/s]


  Checkpoint saved: 34,000 transcripts processed (21.8 transcripts/sec)


Extracting targets:  65%|██████▍   | 34506/53395 [26:14<15:56, 19.74it/s]


  Checkpoint saved: 34,500 transcripts processed (21.8 transcripts/sec)


Extracting targets:  66%|██████▌   | 35004/53395 [26:43<14:23, 21.29it/s]


  Checkpoint saved: 35,000 transcripts processed (21.7 transcripts/sec)


Extracting targets:  66%|██████▋   | 35502/53395 [27:07<19:12, 15.53it/s]


  Checkpoint saved: 35,500 transcripts processed (21.7 transcripts/sec)


Extracting targets:  67%|██████▋   | 36004/53395 [27:29<19:34, 14.81it/s]


  Checkpoint saved: 36,000 transcripts processed (21.7 transcripts/sec)


Extracting targets:  68%|██████▊   | 36511/53395 [27:54<07:21, 38.20it/s]


  Checkpoint saved: 36,500 transcripts processed (21.7 transcripts/sec)


Extracting targets:  69%|██████▉   | 37009/53395 [28:20<10:27, 26.09it/s]


  Checkpoint saved: 37,000 transcripts processed (21.7 transcripts/sec)


Extracting targets:  70%|███████   | 37505/53395 [28:44<14:43, 17.98it/s]


  Checkpoint saved: 37,500 transcripts processed (21.7 transcripts/sec)


Extracting targets:  71%|███████   | 38005/53395 [29:10<14:16, 17.97it/s]


  Checkpoint saved: 38,000 transcripts processed (21.6 transcripts/sec)


Extracting targets:  72%|███████▏  | 38505/53395 [29:38<12:18, 20.17it/s]


  Checkpoint saved: 38,500 transcripts processed (21.6 transcripts/sec)


Extracting targets:  73%|███████▎  | 39007/53395 [30:03<12:46, 18.76it/s]


  Checkpoint saved: 39,000 transcripts processed (21.5 transcripts/sec)


Extracting targets:  74%|███████▍  | 39507/53395 [30:26<11:01, 20.98it/s]


  Checkpoint saved: 39,500 transcripts processed (21.5 transcripts/sec)


Extracting targets:  75%|███████▍  | 40008/53395 [30:53<09:23, 23.78it/s]


  Checkpoint saved: 40,000 transcripts processed (21.5 transcripts/sec)


Extracting targets:  76%|███████▌  | 40506/53395 [31:18<15:54, 13.51it/s]


  Checkpoint saved: 40,500 transcripts processed (21.5 transcripts/sec)


Extracting targets:  77%|███████▋  | 41008/53395 [31:44<07:37, 27.05it/s]


  Checkpoint saved: 41,000 transcripts processed (21.4 transcripts/sec)


Extracting targets:  78%|███████▊  | 41505/53395 [32:09<11:46, 16.83it/s]


  Checkpoint saved: 41,500 transcripts processed (21.4 transcripts/sec)


Extracting targets:  79%|███████▊  | 42004/53395 [32:32<10:06, 18.78it/s]


  Checkpoint saved: 42,000 transcripts processed (21.4 transcripts/sec)


Extracting targets:  80%|███████▉  | 42507/53395 [32:56<08:14, 22.04it/s]


  Checkpoint saved: 42,500 transcripts processed (21.4 transcripts/sec)


Extracting targets:  81%|████████  | 43007/53395 [33:20<10:07, 17.10it/s]


  Checkpoint saved: 43,000 transcripts processed (21.4 transcripts/sec)


Extracting targets:  81%|████████▏ | 43506/53395 [33:46<08:37, 19.12it/s]


  Checkpoint saved: 43,500 transcripts processed (21.4 transcripts/sec)


Extracting targets:  82%|████████▏ | 44004/53395 [34:13<08:21, 18.71it/s]


  Checkpoint saved: 44,000 transcripts processed (21.3 transcripts/sec)


Extracting targets:  83%|████████▎ | 44512/53395 [34:38<04:32, 32.63it/s]


  Checkpoint saved: 44,500 transcripts processed (21.3 transcripts/sec)


Extracting targets:  84%|████████▍ | 45000/53395 [35:02<09:38, 14.51it/s]


  Checkpoint saved: 45,000 transcripts processed (21.3 transcripts/sec)


Extracting targets:  85%|████████▌ | 45508/53395 [35:28<05:51, 22.45it/s]


  Checkpoint saved: 45,500 transcripts processed (21.3 transcripts/sec)


Extracting targets:  86%|████████▌ | 46006/53395 [35:52<06:09, 20.01it/s]


  Checkpoint saved: 46,000 transcripts processed (21.3 transcripts/sec)


Extracting targets:  87%|████████▋ | 46501/53395 [36:18<07:17, 15.74it/s]


  Checkpoint saved: 46,500 transcripts processed (21.3 transcripts/sec)


Extracting targets:  88%|████████▊ | 47012/53395 [36:44<03:24, 31.25it/s]


  Checkpoint saved: 47,000 transcripts processed (21.3 transcripts/sec)


Extracting targets:  89%|████████▉ | 47512/53395 [37:07<03:59, 24.59it/s]


  Checkpoint saved: 47,500 transcripts processed (21.3 transcripts/sec)


Extracting targets:  90%|████████▉ | 48007/53395 [37:29<03:18, 27.13it/s]


  Checkpoint saved: 48,000 transcripts processed (21.3 transcripts/sec)


Extracting targets:  91%|█████████ | 48509/53395 [37:51<04:10, 19.51it/s]


  Checkpoint saved: 48,500 transcripts processed (21.3 transcripts/sec)


Extracting targets:  92%|█████████▏| 49015/53395 [38:14<01:56, 37.64it/s]


  Checkpoint saved: 49,000 transcripts processed (21.3 transcripts/sec)


Extracting targets:  93%|█████████▎| 49505/53395 [38:39<03:20, 19.45it/s]


  Checkpoint saved: 49,500 transcripts processed (21.3 transcripts/sec)


Extracting targets:  94%|█████████▎| 50017/53395 [39:01<02:03, 27.46it/s]


  Checkpoint saved: 50,000 transcripts processed (21.3 transcripts/sec)


Extracting targets:  95%|█████████▍| 50510/53395 [39:25<01:56, 24.68it/s]


  Checkpoint saved: 50,500 transcripts processed (21.3 transcripts/sec)


Extracting targets:  96%|█████████▌| 51011/53395 [39:46<01:36, 24.73it/s]


  Checkpoint saved: 51,000 transcripts processed (21.3 transcripts/sec)


Extracting targets:  96%|█████████▋| 51501/53395 [40:08<02:09, 14.60it/s]


  Checkpoint saved: 51,500 transcripts processed (21.3 transcripts/sec)


Extracting targets:  97%|█████████▋| 52009/53395 [40:30<00:51, 27.18it/s]


  Checkpoint saved: 52,000 transcripts processed (21.3 transcripts/sec)


Extracting targets:  98%|█████████▊| 52505/53395 [40:52<00:45, 19.39it/s]


  Checkpoint saved: 52,500 transcripts processed (21.3 transcripts/sec)


Extracting targets:  99%|█████████▉| 53010/53395 [41:15<00:17, 22.51it/s]


  Checkpoint saved: 53,000 transcripts processed (21.4 transcripts/sec)


Extracting targets: 100%|██████████| 53395/53395 [41:33<00:00, 21.42it/s]


✓ Extraction complete!
  Total transcripts processed: 53,395
  Time elapsed: 41.7 minutes
  Average rate: 21.4 transcripts/sec
  Checkpoint file cleaned up





In [59]:
# Diagnostics
print("Target Extraction Diagnostics:")
print(f"  Transcripts with zero targets: {(df_targets['n_targets'] == 0).sum():,} ({(df_targets['n_targets'] == 0).mean()*100:.1f}%)")
print(f"  Average targets per transcript: {df_targets['n_targets'].mean():.2f}")
print(f"  Median targets per transcript: {df_targets['n_targets'].median():.0f}")
print(f"  Max targets in a transcript: {df_targets['n_targets'].max()}")

# Distribution of target counts
print(f"\nDistribution of n_targets:")
print(df_targets['n_targets'].describe())

# Top targets overall
all_targets = []
for targets_list in df_targets['targets_norm']:
    # Rows restored from a parquet checkpoint hold numpy arrays rather than
    # lists; count those too so resumed runs are not under-reported.
    if isinstance(targets_list, (list, tuple, np.ndarray)):
        all_targets.extend(targets_list)

target_counts = Counter(all_targets)
print(f"\nTop 50 targets overall (showing consistency):")
for target, count in target_counts.most_common(50):
    print(f"  {target}: {count}")

# Check for consistency (same targets should have same names)
print(f"\nConsistency check:")
print(f"  Total unique targets: {len(target_counts):,}")
print(f"  Targets appearing in >10 transcripts: {sum(1 for c in target_counts.values() if c > 10):,}")
print(f"  Targets appearing in >100 transcripts: {sum(1 for c in target_counts.values() if c > 100):,}")



Target Extraction Diagnostics:
  Transcripts with zero targets: 23,271 (43.6%)
  Average targets per transcript: 1.31
  Median targets per transcript: 1
  Max targets in a transcript: 14

Distribution of n_targets:
count    53395.000000
mean         1.306920
std          1.575188
min          0.000000
25%          0.000000
50%          1.000000
75%          2.000000
max         14.000000
Name: n_targets, dtype: float64

Top 50 targets overall (showing consistency):
  earnings per share: 4340
  free cash flow: 2575
  adjusted earnings per share: 939
  adjusted net income: 674
  net interest margin: 668
  adjusted ebitda margin: 654
  diluted earnings per share: 593
  operating cash flow: 574
  organic revenue growth: 462
  net interest income: 390
  return on equity: 364
  top line growth: 285
  return on average assets: 256
  organic sales growth: 254
  earnings per diluted share: 228
  adjusted operating income: 214
  gross profit margin: 213
  adjusted diluted earnings per share: 210

In [60]:
# Write the per-transcript targets to the configured parquet location
output_file = config['data']['targets_extracted']
df_targets.to_parquet(output_file, index=False, engine='pyarrow')

# Summary of what was written
summary_lines = (
    f"\n✓ Saved extracted targets to: {output_file}",
    f"  Total transcripts: {len(df_targets):,}",
    f"  Total unique targets: {len(target_counts):,}",
    f"  Average targets per transcript: {df_targets['n_targets'].mean():.2f}",
)
for line in summary_lines:
    print(line)

# Also store the target vocabulary (counts in descending frequency order)
# together with the model name and a timestamp, for reference
vocab_file = INTERMEDIATE_DIR / 'target_vocabulary.json'
vocab_data = {
    'target_counts': dict(target_counts.most_common()),
    'total_unique_targets': len(target_counts),
    'extraction_model': GPT_MODEL,
    'extraction_date': pd.Timestamp.now().isoformat()
}
with open(vocab_file, 'w') as f:
    f.write(json.dumps(vocab_data, indent=2))
print(f"  Target vocabulary saved to: {vocab_file}")



✓ Saved extracted targets to: /Users/david/Desktop/MATH-GA 2707/Moving Target/data/intermediate/targets_extracted.parquet
  Total transcripts: 53,395
  Total unique targets: 29,295
  Average targets per transcript: 1.31
  Target vocabulary saved to: /Users/david/Desktop/MATH-GA 2707/Moving Target/data/intermediate/target_vocabulary.json
