# 01 — Data Collection & Exploration

This notebook handles data acquisition and initial exploration for the StockBuddy Forecast project. We will:
1. Fetch S&P 500 constituents
2. Select a manageable subset of diverse stocks
3. Download historical price data and fundamentals
4. Build price and returns matrices
5. Perform initial exploratory analysis
6. Save processed data for downstream tasks

In [None]:
import os
import sys
from pathlib import Path

# Must run before the data_loader import below so the project's src/ folder is importable.
sys.path.insert(0, '../src')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from data_loader import get_sp500_tickers, fetch_multiple_stocks, fetch_multiple_fundamentals, build_price_matrix, build_returns_matrix

# Notebook display limits: show up to 100 columns and 100 rows before pandas truncates.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
%matplotlib inline

## 1. Fetch S&P 500 Constituents

In [None]:
# Load the S&P 500 ticker collection through the data_loader helper.
sp500 = get_sp500_tickers()

constituent_count = len(sp500)
print(f"Total S&P 500 stocks: {constituent_count}")

# Last expression of the cell: preview the first ten entries.
sp500.head(10)

## 2. Select Stocks for Analysis

We select a manageable subset of ~30-40 diverse stocks across different sectors and market capitalizations. This balances computational efficiency with sector representation for building robust predictive models.

In [None]:
# Hand-picked analysis universe (~30-40 names), grouped by broad sector label.
# The labels are informal groupings used only to organise this list.
_TICKERS_BY_SECTOR = {
    'Technology': ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'META', 'AMD', 'INTC', 'CRM', 'NFLX'],
    'Financial Services': ['JPM', 'GS', 'BAC', 'WFC', 'V', 'MA'],
    'Healthcare & Pharma': ['JNJ', 'UNH', 'LLY', 'ABBV', 'MRK', 'PFE'],
    'Energy': ['XOM', 'CVX', 'COP', 'MPC'],
    'Consumer Staples & Discretionary': ['PG', 'KO', 'PEP', 'WMT', 'HD', 'DIS'],
    'Industrial & Other': ['BA', 'GE', 'CAT', 'AVGO', 'TSLA'],
}

# Flatten in declaration order (dicts preserve insertion order) into the flat
# list the rest of the notebook consumes.
SELECTED_TICKERS = [
    ticker
    for sector_tickers in _TICKERS_BY_SECTOR.values()
    for ticker in sector_tickers
]

# Full constituent list, retained in case a later step wants the whole index
# rather than the hand-picked subset.
# NOTE(review): .tolist() requires sp500 to be Series/Index-like rather than a
# DataFrame — confirm against get_sp500_tickers.
ALL_TICKERS = sp500.tolist()

selected_count = len(SELECTED_TICKERS)
print(
    f"Selected {selected_count} stocks for analysis",
    f"Tickers: {SELECTED_TICKERS}",
    sep="\n",
)

## 3. Download Historical Price Data

In [None]:
# Download price history for every selected ticker. The result is used below as
# a mapping of ticker -> per-ticker frame.
# NOTE(review): period="2y" is presumably a two-year lookback — confirm in data_loader.
stock_data = fetch_multiple_stocks(SELECTED_TICKERS, period="2y")

# Surface any ticker the fetch did not return, so a partial download is visible
# here instead of showing up later as a KeyError or a missing matrix column.
missing_tickers = [ticker for ticker in SELECTED_TICKERS if ticker not in stock_data]
if missing_tickers:
    print(f"WARNING: no data returned for {len(missing_tickers)} ticker(s): {missing_tickers}")

print("\nData shapes by ticker:")
for ticker, df in stock_data.items():
    print(f"{ticker}: {df.shape}")

# Show sample data for one stock; guard the lookup so a failed AAPL download
# does not abort the cell.
if 'AAPL' in stock_data:
    print("\nSample data for AAPL:")
    print(stock_data['AAPL'].head())
else:
    print("\nSample data for AAPL: unavailable (ticker missing from fetch results)")

## 4. Download Fundamental Data

In [None]:
# Fetch the fundamentals table for the same ticker universe used for prices.
fundamentals_df = fetch_multiple_fundamentals(SELECTED_TICKERS)

fundamentals_shape = fundamentals_df.shape
fundamentals_columns = fundamentals_df.columns.tolist()
print(
    f"Fundamentals shape: {fundamentals_shape}",
    f"\nColumns: {fundamentals_columns}",
    sep="\n",
)

# Last expression of the cell: rendered as the table preview.
fundamentals_df.head()

## 5. Build Price & Returns Matrices

In [None]:
# Combine the per-ticker downloads into the two matrices used downstream.
price_matrix = build_price_matrix(stock_data)
returns_matrix = build_returns_matrix(stock_data)

# Report both shapes first, then a five-row preview of each matrix.
print(
    f"Price matrix shape: {price_matrix.shape}",
    f"Returns matrix shape: {returns_matrix.shape}",
    sep="\n",
)

price_preview = price_matrix.head()
print("\nPrice matrix (first 5 rows):", price_preview, sep="\n")

returns_preview = returns_matrix.head()
print("\nReturns matrix (first 5 rows):", returns_preview, sep="\n")

## 6. Initial Data Exploration

In [None]:
# Two stacked panels: rebased price paths on top, AAPL return histogram below.
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# Rebase each sample ticker to 100 at its own first available price.
# Dividing by the raw first row would turn an entire series into NaN (so it
# silently disappears from the chart) whenever that ticker has no value on the
# first date. Back-filling before taking the base row avoids that; when the
# first row is complete the result is identical to dividing by `.iloc[0]`.
sample_tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'TSLA']
sample_prices = price_matrix[sample_tickers]
base_prices = sample_prices.bfill().iloc[0]
normalized_prices = sample_prices.div(base_prices) * 100

normalized_prices.plot(ax=axes[0], linewidth=2)
axes[0].set_title('Normalized Stock Prices (Base = 100)', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Normalized Price')
axes[0].legend(loc='best')
axes[0].grid(alpha=0.3)

# Plot daily returns distribution for AAPL
returns_matrix['AAPL'].hist(bins=50, ax=axes[1], edgecolor='black', alpha=0.7)
axes[1].set_title('AAPL Daily Returns Distribution', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Daily Return')
axes[1].set_ylabel('Frequency')
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Per-ticker summary statistics of the returns matrix.
print("\nDaily Returns Summary Statistics:")
print(returns_matrix.describe())

## 7. Save Data for Next Steps

In [None]:
# Single source of truth for the output location; every write below is built
# from it so the directory creation and the saved paths cannot drift apart.
RAW_DATA_DIR = Path('../data/raw')

# Create data directories if they don't exist
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

# File name -> frame for everything stored as Parquet. The same mapping drives
# both the writes and the summary printout, so the report always matches what
# was actually written.
parquet_outputs = {
    'price_matrix.parquet': price_matrix,
    'returns_matrix.parquet': returns_matrix,
    'fundamentals.parquet': fundamentals_df,
}
for filename, frame in parquet_outputs.items():
    frame.to_parquet(RAW_DATA_DIR / filename)

# Save selected tickers as a one-column CSV (no index column).
tickers_filename = 'selected_tickers.csv'
pd.DataFrame({'ticker': SELECTED_TICKERS}).to_csv(RAW_DATA_DIR / tickers_filename, index=False)

print("Data saved successfully!")
for filename, frame in parquet_outputs.items():
    print(f"  - {filename} ({frame.shape})")
print(f"  - {tickers_filename} ({len(SELECTED_TICKERS)} tickers)")