# 01 — Data Collection & Exploration

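This notebook fetches the S&P 500 constituent list, downloads historical price and fundamental data for a selected subset of stocks, builds price and returns matrices, and saves the results to `data/raw/`.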

In [None]:
# Put the parent of the current working directory at the front of sys.path so the
# `src` package imported below can be resolved. This must run before the
# `from src.data_loader import ...` line.
# NOTE(review): assumes the notebook is executed from a folder directly under the
# project root — confirm.
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.data_loader import get_sp500_tickers, fetch_multiple_stocks, fetch_multiple_fundamentals, build_price_matrix, build_returns_matrix

# Raise pandas' display limits to 100 columns and 100 rows.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Fetch S&P 500 Constituents

In [None]:
# Retrieve the S&P 500 constituent listing through the project's data loader.
sp500 = get_sp500_tickers()

# Report how many entries came back, then preview the first ten as the cell output.
constituent_count = len(sp500)
print(f"Total S&P 500 stocks: {constituent_count}")
sp500.head(10)

## 2. Select Stocks for Analysis

We select 30 stocks spanning multiple sectors to keep the analysis manageable.

In [None]:
# Analysis universe: a fixed, hand-picked list of tickers. The order is kept
# exactly as originally written; it is only wrapped across lines for readability.
SELECTED_TICKERS = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "NVDA", "META", "TSLA",
    "JPM", "V", "MA", "GS",
    "JNJ", "UNH", "LLY", "MRK", "ABBV",
    "XOM", "CVX",
    "PG", "KO", "PEP", "WMT", "HD",
    "CRM", "AMD", "NFLX", "DIS", "INTC",
    "BA", "GE",
]

# Echo the size and contents of the selection for the reader.
ticker_count = len(SELECTED_TICKERS)
print(f"Selected {ticker_count} stocks for analysis")
print(f"Tickers: {SELECTED_TICKERS}")

## 3. Download Historical Price Data

In [None]:
# Download price history for every selected ticker. The result is used below as
# a mapping whose values are per-ticker frames (it is read via len() and .values()).
# NOTE(review): period="2y" presumably requests two years of history — confirm
# against fetch_multiple_stocks.
stock_data = fetch_multiple_stocks(SELECTED_TICKERS, period="2y")
print(f"\nLoaded {len(stock_data)} stocks")

# Fail with an actionable message instead of an opaque IndexError further down
# when no ticker could be downloaded.
if len(stock_data) == 0:
    raise RuntimeError(
        "fetch_multiple_stocks returned no data; "
        "check network access and the ticker symbols before continuing."
    )

# Use the first downloaded frame as a representative sample. Take it once rather
# than rebuilding a list of all frames for every print statement.
sample_frame = next(iter(stock_data.values()))
print(f"Date range: {sample_frame.index[0]} to {sample_frame.index[-1]}")
print(f"Sample data shape: {sample_frame.shape}")
sample_frame.head()

## 4. Download Fundamental Data

In [None]:
# Pull fundamental data for the same ticker selection used for the price download.
fundamentals_df = fetch_multiple_fundamentals(SELECTED_TICKERS)

# Show the table's dimensions, then preview its first ten rows as the cell output.
fundamentals_shape = fundamentals_df.shape
print(f"Fundamentals shape: {fundamentals_shape}")
fundamentals_df.head(10)

## 5. Build Price & Returns Matrices

In [None]:
# Derive both matrices from the same per-ticker price history.
price_matrix = build_price_matrix(stock_data)
returns_matrix = build_returns_matrix(stock_data)

# Report the dimensions of each matrix, then preview the price matrix.
price_shape, returns_shape = price_matrix.shape, returns_matrix.shape
print(f"Price matrix: {price_shape}")
print(f"Returns matrix: {returns_shape}")
price_matrix.head()

## 6. Initial Data Exploration

In [None]:
# Two stacked panels: normalized price history on top, daily-return histograms below.
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# Normalize prices to base 100: divide each ticker's column by its first-row value
# so every line starts at 100 and the series are directly comparable.
sample_tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'TSLA']
sample_prices = price_matrix[sample_tickers]
normalized_prices = sample_prices.div(sample_prices.iloc[0]) * 100
normalized_prices.plot(ax=axes[0], linewidth=2)
axes[0].set_title('Normalized Stock Prices (Base = 100)', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Normalized Price')
axes[0].legend(loc='best')
axes[0].grid(alpha=0.3)

# Plot returns distribution for 3 stocks (overlaid, semi-transparent histograms).
returns_matrix[['AAPL', 'MSFT', 'GOOGL']].plot(kind='hist', bins=50, ax=axes[1], alpha=0.6, edgecolor='black')
axes[1].set_title('Daily Returns Distribution (AAPL, MSFT, GOOGL)', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Daily Return')
axes[1].set_ylabel('Frequency')
axes[1].grid(alpha=0.3)

plt.tight_layout()

# savefig does not create missing directories, so make sure the output folder
# exists first; otherwise a fresh checkout fails here with FileNotFoundError.
os.makedirs("../figures", exist_ok=True)
plt.savefig("../figures/01_price_overview.png", dpi=150, bbox_inches="tight")
plt.show()

## 7. Save Data

In [None]:
# Make sure the output folder exists before anything is written into it.
os.makedirs("../data/raw", exist_ok=True)

# Write each table to its own parquet file, in the same order as before:
# prices, returns, then fundamentals.
for frame, destination in (
    (price_matrix, "../data/raw/price_matrix.parquet"),
    (returns_matrix, "../data/raw/returns_matrix.parquet"),
    (fundamentals_df, "../data/raw/fundamentals.parquet"),
):
    frame.to_parquet(destination)

print("Saved: price_matrix, returns_matrix, fundamentals to data/raw/")