# Step 0: Environment Setup & Configuration

This notebook sets up the environment, dependencies, and configuration for the Moving Targets strategy implementation.

Based on: "Moving Targets" by Cohen & Nguyen (2024) - SSRN 4736129


In [None]:
# Install required packages (run once)
# !pip install pandas numpy pyarrow scipy statsmodels matplotlib seaborn
# !pip install polars  # Much faster than pandas for large datasets (10-100x speedup)
# !pip install spacy tqdm joblib
# !python -m spacy download en_core_web_sm
# !pip install certifi requests


In [13]:
import pandas as pd
import numpy as np
import os
import json
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set up paths
BASE_DIR = Path('/Users/david/Desktop/MATH-GA 2707/Moving Target')
DATA_DIR = BASE_DIR / 'data'
INTERMEDIATE_DIR = DATA_DIR / 'intermediate'
PROCESSED_DIR = DATA_DIR / 'processed'
CONFIG_DIR = BASE_DIR / 'configs'

# Create directories
for dir_path in [DATA_DIR, INTERMEDIATE_DIR, PROCESSED_DIR, CONFIG_DIR]:
    dir_path.mkdir(parents=True, exist_ok=True)

print("Directory structure created:")
print(f"  Base: {BASE_DIR}")
print(f"  Data: {DATA_DIR}")
print(f"  Intermediate: {INTERMEDIATE_DIR}")
print(f"  Processed: {PROCESSED_DIR}")
print(f"  Configs: {CONFIG_DIR}")


Directory structure created:
  Base: /Users/david/Desktop/MATH-GA 2707/Moving Target
  Data: /Users/david/Desktop/MATH-GA 2707/Moving Target/data
  Intermediate: /Users/david/Desktop/MATH-GA 2707/Moving Target/data/intermediate
  Processed: /Users/david/Desktop/MATH-GA 2707/Moving Target/data/processed
  Configs: /Users/david/Desktop/MATH-GA 2707/Moving Target/configs


In [None]:
# Configuration dictionary
config = {
    'api': {
        'fmp_api_key': 'API_KEY',  # Replace with your API key
        'base_url': 'https://financialmodelingprep.com/stable'
    },
    'data': {
        'russell_3000_file': BASE_DIR / 'russell-3000.csv',
        'transcripts_raw': INTERMEDIATE_DIR / 'transcripts_raw.parquet',
        'transcripts_clean': INTERMEDIATE_DIR / 'transcripts_clean.parquet',
        'targets_extracted': INTERMEDIATE_DIR / 'targets_extracted.parquet',
        'targets_panel': INTERMEDIATE_DIR / 'targets_panel.parquet',
        'firm_quarter_signal': INTERMEDIATE_DIR / 'firm_quarter_signal.parquet',
        'monthly_signal': INTERMEDIATE_DIR / 'monthly_signal.parquet',
        'universe_monthly': INTERMEDIATE_DIR / 'universe_monthly.parquet',
        'backtest_returns': PROCESSED_DIR / 'backtest_returns.parquet',
        'holdings': PROCESSED_DIR / 'holdings.parquet'
    },
    'nlp': {
        'model': 'en_core_web_sm',
        'batch_size': 100,
        'min_target_length': 3,
        'max_target_length': 50
    },
    'trading': {
        'universe': 'russell_3000',
        'min_price': 5.0,
        'quantiles': 10,  # deciles
        'rebalance_freq': 'monthly'
    },
    'dates': {
        'start_year': 2010,
        'end_year': 2024,
        'lookback_quarters': 4  # For MT calculation (t-4)
    }
}

# Save config to JSON
config_file = CONFIG_DIR / 'base.json'
with open(config_file, 'w') as f:
    json.dump({k: str(v) if isinstance(v, Path) else v for k, v in config.items()}, f, indent=2, default=str)

print("Configuration saved to:", config_file)
print("\nKey settings:")
print(f"  API Key: {config['api']['fmp_api_key'][:10]}...")
print(f"  Date range: {config['dates']['start_year']}-{config['dates']['end_year']}")
print(f"  Trading universe: {config['trading']['universe']}")
print(f"  Quantiles: {config['trading']['quantiles']}")


Configuration saved to: /Users/david/Desktop/MATH-GA 2707/Moving Target/configs/base.json

Key settings:
  API Key: wSb1mJ4mrG...
  Date range: 2010-2024
  Trading universe: russell_3000
  Quantiles: 10


In [15]:
# Load Russell 3000 universe
# Header is on line 10 (1-indexed), so skip 9 rows (0-indexed) to keep header
russell_df = pd.read_csv(config['data']['russell_3000_file'], skiprows=9)
# Check if Ticker column exists, if not try alternative column names
if 'Ticker' not in russell_df.columns:
    # Try to find the ticker column (might be first column or have different name)
    print(f"Available columns: {list(russell_df.columns)}")
    # If first column looks like tickers, use it
    if len(russell_df.columns) > 0:
        russell_df.rename(columns={russell_df.columns[0]: 'Ticker'}, inplace=True)

russell_df = russell_df[russell_df['Ticker'].notna()].copy()
# Clean ticker column - remove quotes and whitespace
russell_df['Ticker'] = russell_df['Ticker'].astype(str).str.strip().str.replace('"', '')
# Filter out any empty strings
russell_df = russell_df[russell_df['Ticker'] != ''].copy()
russell_tickers = set(russell_df['Ticker'].unique())

print(f"Russell 3000 universe loaded: {len(russell_tickers)} unique tickers")
print(f"\nSample tickers: {sorted(list(russell_tickers))[:20]}")

# Save ticker list for later use
ticker_file = INTERMEDIATE_DIR / 'russell_3000_tickers.json'
with open(ticker_file, 'w') as f:
    json.dump(sorted(list(russell_tickers)), f, indent=2)
print(f"\nTicker list saved to: {ticker_file}")


Russell 3000 universe loaded: 2611 unique tickers

Sample tickers: ['-', 'A', 'AA', 'AAL', 'AAN', 'AAON', 'AAP', 'AAPL', 'AAT', 'AAWW', 'ABBV', 'ABC', 'ABCB', 'ABCL', 'ABG', 'ABM', 'ABNB', 'ABR', 'ABT', 'ABUS']

Ticker list saved to: /Users/david/Desktop/MATH-GA 2707/Moving Target/data/intermediate/russell_3000_tickers.json
