# LLM-Augmented Technical Analysis - Colab Demo

This notebook runs the complete pipeline on Google Colab using:
- **OpenAI GPT-4o-mini** for technical analysis
- **FinBERT** for text embeddings
- **ResNet18** fusion model for prediction

**Dataset**: SPY 2015-2016 (reduced size for cost savings)

## 1. Setup

In [None]:
# Install dependencies
!pip install -q yfinance pandas numpy scipy matplotlib mplfinance pillow scikit-learn torch torchvision transformers accelerate tqdm pyyaml

In [None]:
# Clone the repo (or upload files)
!git clone https://github.com/YOUR_USERNAME/technical-analysis.git 2>/dev/null || echo "Repo exists or using uploaded files"

import os
import sys

# Change to project directory
if os.path.exists('technical-analysis'):
    os.chdir('technical-analysis')

sys.path.insert(0, '.')
print(f"Working directory: {os.getcwd()}")

In [None]:
# Set up OpenAI API Key
# Get your API key at: https://platform.openai.com/api-keys

import getpass
import os

# Option 1: Use Colab secrets (recommended - more secure)
# Go to: Settings (gear icon) > Secrets > Add new secret
# Name: OPENAI_API_KEY, Value: your-key-here
try:
    # Imported inside the try so the cell also works outside Colab.
    from google.colab import userdata

    os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
    print("✓ API key loaded from Colab secrets")
except Exception as exc:
    # Option 2: Prompt for the key. Nothing is written into the notebook, and
    # a key that is already present in the environment is left untouched.
    if not os.environ.get('OPENAI_API_KEY'):
        os.environ['OPENAI_API_KEY'] = getpass.getpass("Enter your OpenAI API key: ")
    print(f"⚠ Colab secret unavailable ({type(exc).__name__}); "
          "OPENAI_API_KEY set manually - consider using Colab secrets instead")

In [None]:
# Create necessary directories
# Pure-Python equivalent of `mkdir -p`: creates missing parents and is a no-op
# for directories that already exist, without depending on a shell.
import os

for directory in (
    'data/raw',
    'data/samples',
    'data/images',
    'data/llm',
    'data/embeddings',
    'data/models',
    'configs',
):
    os.makedirs(directory, exist_ok=True)

## 2. Configuration

Using reduced dataset to minimize LLM API calls:
- Train: 2015 (~50 samples with stride=5)
- Val: Jan-Jun 2016 (~25 samples)
- Test: Jul-Dec 2016 (~25 samples)
- Total: ~100 samples = ~100 OpenAI API calls

In [None]:
import yaml

# The pipeline configuration is assembled section by section; the resulting
# `config` dict is written to configs/config.yaml and reused by later cells.
config = {}

# Data source
config.update(
    ticker='SPY',
    start_date='2014-01-01',
    end_date='2017-12-31',
)

# Dataset parameters
config.update(
    lookback_days=30,
    horizon_days=5,
    image_size=112,
    stride=5,  # Larger stride = fewer samples = fewer API calls
    label_rule='forward_return_gt_0',
)

# Splits - smaller dataset
config.update(
    train_start='2015-01-01',
    train_end='2015-12-31',
    val_start='2016-01-01',
    val_end='2016-06-30',
    test_start='2016-07-01',
    test_end='2016-12-31',
)

# Chart rendering
config.update(
    include_volume=True,
    include_rsi_panel=True,
    overlay_sma20=True,
    overlay_bollinger=True,
    bb_window=20,
    bb_k=2,
    rsi_window=14,
    hide_axes_labels=True,
    no_titles=True,
)

# LLM - using OpenAI
config.update(
    llm_enabled=True,
    llm_provider='openai',
    llm_model='gpt-4o-mini',
    temperature=0,
    max_tokens=220,
    cache_path='data/llm/spy_analysis.jsonl',
    llm_rate_limit_delay=0.5,  # OpenAI has higher rate limits
)

# FinBERT
config.update(
    finbert_model='ProsusAI/finbert',
    embedding_dim=768,
)

# Training
config.update(
    batch_size=16,
    epochs=15,
    lr=1e-4,
    weight_decay=1e-4,
    early_stop_patience=5,
    seed=42,
)

# Fusion model
config.update(
    use_numeric_features=True,
    use_text_embeddings=True,
    cnn_backbone='resnet18',
    dropout=0.3,
)

# Save config (block-style YAML) for the `python -m src.*` steps to read
with open('configs/config.yaml', 'w') as config_file:
    config_file.write(yaml.dump(config, default_flow_style=False))

# Echo the settings that matter most for cost and data coverage
print("\n".join([
    "Config saved. Key settings:",
    f"  Stride: {config['stride']} (reduces samples by ~{config['stride']}x)",
    f"  Train: {config['train_start']} to {config['train_end']}",
    f"  Test: {config['test_start']} to {config['test_end']}",
    f"  LLM: {config['llm_provider']} / {config['llm_model']}",
]))

## 3. Fetch Data & Compute Indicators

In [None]:
from src.data_fetch import fetch_ohlcv, save_ohlcv
from src.features import compute_all_indicators
from pathlib import Path

ticker = config['ticker']
raw_dir = "data/raw"

# Download the OHLCV history for the configured ticker/date range and keep a
# CSV copy of the raw download.
df = fetch_ohlcv(ticker, config['start_date'], config['end_date'])
save_ohlcv(df, f"{raw_dir}/{ticker}.csv")

# Derive the indicator columns and persist them next to the raw data.
df_indicators = compute_all_indicators(df, config)
df_indicators.to_csv(f"{raw_dir}/{ticker}_indicators.csv", index=False)

print(f"\nData shape: {df_indicators.shape}")

# Last expression of the cell: preview the most recent rows.
preview_columns = ['Date', 'Close', 'sma20', 'rsi14', 'bb_percent_b']
df_indicators[preview_columns].tail()

## 4. Build Samples & Render Charts

In [None]:
from src.render_charts import build_samples, render_candlestick_chart
from tqdm import tqdm
import pandas as pd

# Build samples and persist them for the downstream `python -m src.*` steps
samples_df = build_samples(df_indicators, config)
samples_df.to_parquet("data/samples/samples.parquet")

print(f"Total samples: {len(samples_df)}")
print("\nSplit distribution:")
split_counts = samples_df['split'].value_counts()
for split in ['train', 'val', 'test']:
    # .get(..., 0) so a split with no samples is reported as 0
    print(f"  {split}: {split_counts.get(split, 0)}")

# Derive the time estimate from the configured delay instead of a hard-coded
# per-call figure, so it stays consistent with the LLM settings above.
# NOTE(review): assumes src.llm_analyze sleeps `llm_rate_limit_delay` seconds
# per call - confirm. API latency comes on top of this lower bound.
call_delay = config['llm_rate_limit_delay']
print(f"\nEstimated LLM API calls: {len(samples_df)}")
print(
    f"Minimum time at {call_delay}s/call rate-limit delay: "
    f"{len(samples_df) * call_delay / 60:.1f} minutes (API latency not included)"
)

In [None]:
# Render chart images
# One PNG per sample: slice the indicator frame to the sample's window
# (end_idx is inclusive, hence the +1) and hand it to the renderer.
print("Rendering chart images...")
sample_windows = zip(
    samples_df['sample_id'],
    samples_df['start_idx'],
    samples_df['end_idx'],
)
for sample_id, start_idx, end_idx in tqdm(sample_windows, total=len(samples_df)):
    window_df = df_indicators.iloc[start_idx:end_idx + 1].copy()
    render_candlestick_chart(window_df, config, f"data/images/{sample_id}.png")

print(f"\nRendered {len(samples_df)} chart images")

In [None]:
# Display sample chart
%matplotlib inline
from PIL import Image
import matplotlib.pyplot as plt
import os

# Check if images were created
image_dir = "data/images"
image_files = [f for f in os.listdir(image_dir) if f.endswith('.png')] if os.path.exists(image_dir) else []
print(f"Images created: {len(image_files)}")

if len(image_files) == 0:
    print("No images found! Make sure the previous cell ran successfully.")
else:
    # Display a sample image
    sample_id = samples_df.iloc[len(samples_df)//2]['sample_id']  # Middle sample
    img_path = f"data/images/{sample_id}.png"
    
    if os.path.exists(img_path):
        img = Image.open(img_path)
        fig, ax = plt.subplots(figsize=(5, 5))
        ax.imshow(img)
        ax.axis('off')
        ax.set_title(f"Sample {sample_id} - No text, dates, or future info")
        plt.tight_layout()
        plt.show()
        print(f"\nImage size: {img.size}")
    else:
        print(f"Image not found at: {img_path}")
        print(f"Available images: {image_files[:5]}...")

## 5. LLM Technical Analysis (OpenAI)

This step calls the OpenAI API for each sample. With ~100 samples and 0.5s delay, this takes ~1-2 minutes.

**Note**: Results are cached in `data/llm/spy_analysis.jsonl`. If you rerun, it will skip already-processed samples.

In [None]:
# Test OpenAI API connection first
# A single cheap request before the full run, so a bad or missing key is
# caught here rather than in the middle of the per-sample analysis.
from src.llm_analyze import call_openai

# NOTE(review): the two positional strings look like (user prompt, system
# prompt) - confirm against call_openai's signature.
test_response = call_openai(
    "Return a JSON object with key 'status' and value 'ok'",
    "You are a helpful assistant. Return only valid JSON.",
    model=config['llm_model']
)

# A falsy response (None / empty) is treated as a failed call.
if test_response:
    print(f"✓ OpenAI API working! Response: {test_response[:100]}...")
else:
    print("✗ ERROR: OpenAI API not working. Check your OPENAI_API_KEY.")

In [None]:
# Run LLM analysis
# Executes the project's src.llm_analyze module as a script in a subprocess,
# pointing it at the config file written in section 2.
# NOTE: a non-zero exit from a `!` command does not raise in the notebook -
# check the output before running the following cells.
!python -m src.llm_analyze --config configs/config.yaml

In [None]:
# Check LLM outputs
import json

# The cache file holds one JSON record per line; count the records.
with open('data/llm/spy_analysis.jsonl', 'r') as f:
    lines = list(f)

print(f"LLM analyses generated: {len(lines)}")

# Show a sample (the record in the middle of the file)
if lines:
    middle = len(lines) // 2
    sample = json.loads(lines[middle])
    print(f"\nSample analysis (id={sample['sample_id']}):")
    analysis = sample.get('analysis_json')
    # Records without a (truthy) analysis_json print only the header line.
    if analysis:
        print(json.dumps(analysis, indent=2))

## 6. FinBERT Embeddings

In [None]:
# Run the project's src.finbert_embed module as a script with the saved config.
# NOTE(review): presumably embeds the cached LLM analyses with the configured
# FinBERT model - confirm against src/finbert_embed.py.
!python -m src.finbert_embed --config configs/config.yaml

## 7. Train Model

In [None]:
# Run the project's src.train module as a script with the saved config.
# The results cell in section 10 reads data/models/metrics.json, which is
# presumably written by this step - verify against src/train.py.
!python -m src.train --config configs/config.yaml

## 8. Evaluate

In [None]:
# Run the project's src.eval module as a script with the saved config.
# The results cell in section 10 reads data/models/eval_results.json, which is
# presumably written by this step - verify against src/eval.py.
!python -m src.eval --config configs/config.yaml

## 9. Backtest

In [None]:
# Run the project's src.backtest module as a script with the saved config.
# The results cell in section 10 reads data/models/backtest_results.json, which
# is presumably written by this step - verify against src/backtest.py.
!python -m src.backtest --config configs/config.yaml

## 10. Results Summary

In [None]:
import json

# Load all results produced by the train / eval / backtest steps
with open('data/models/metrics.json') as f:
    train_metrics = json.load(f)

with open('data/models/eval_results.json') as f:
    eval_results = json.load(f)

with open('data/models/backtest_results.json') as f:
    backtest = json.load(f)

print("="*60)
print("FINAL RESULTS")
print("="*60)

# Section headers use real emoji; the previous literals were mis-decoded
# UTF-8 and printed as garbage characters.
print("\n📊 Model Performance:")
print(f"  Validation AUC: {train_metrics['best_val_auc']:.4f}")
print(f"  Test AUC:       {eval_results['test']['auc']:.4f}")
print(f"  Test Accuracy:  {eval_results['test']['accuracy']:.4f}")

# Ratios stored as fractions are shown as percentages (x100).
print("\n💰 Backtest Results:")
strat = backtest['strategy']
print(f"  CAGR:           {strat['cagr']*100:.2f}%")
print(f"  Sharpe Ratio:   {strat['sharpe']:.2f}")
print(f"  Max Drawdown:   {strat['max_drawdown']*100:.2f}%")
print(f"  Hit Rate:       {strat['hit_rate']*100:.2f}%")
print(f"  Trades:         {strat['num_trades']}")

bh = backtest['buy_and_hold']
print("\n📈 Buy & Hold Benchmark:")
print(f"  Total Return:   {bh['total_return']*100:.2f}%")

# Strategy return minus benchmark return, printed with an explicit sign
excess = strat['total_return'] - bh['total_return']
print(f"\n🎯 Excess Return: {excess*100:+.2f}%")

In [None]:
# Plot equity curve
import matplotlib.pyplot as plt
import numpy as np

equity = np.array(backtest['equity_curve'])

# Explicit figure/axes interface; the x axis is the position in the curve.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(equity, label='Strategy')
# NOTE(review): 10000 is assumed to be the backtest's starting capital -
# confirm against src/backtest.py.
ax.axhline(y=10000, color='gray', linestyle='--', alpha=0.5, label='Initial Capital')
ax.set_xlabel('Time')
ax.set_ylabel('Equity ($)')
ax.set_title('Backtest Equity Curve')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()