# Data Collection and Preprocessing

This notebook downloads stock data using yfinance and preprocesses it for portfolio analysis.

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Universe of tickers to analyse (a small, hand-picked set of large caps).
tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA', 'NVDA', 'META', 'NFLX', 'BABA', 'ORCL']

# Download historical data for the whole universe in a single request.
start_date = '2018-01-01'
end_date = '2023-01-01'
# auto_adjust=False is passed explicitly: recent yfinance releases default to
# auto_adjust=True, which folds the adjustment into 'Close' and omits the
# 'Adj Close' column selected below (the lookup would raise KeyError).
data = yf.download(tickers, start=start_date, end=end_date, auto_adjust=False)

# Focus on adjusted close prices (one column per ticker).
prices = data['Adj Close']

# Calculate simple period-over-period returns.
# Missing prices are forward-filled explicitly and pct_change is told not to
# fill on its own: this reproduces the legacy pandas default
# (fill_method='pad') without depending on a deprecated implicit default, so
# the result is the same on old and new pandas versions. A carried-forward
# price yields a 0% return for that row.
# dropna() then removes every row that still has a NaN: the first row (no
# previous price) and any leading rows before a ticker has its first quote.
returns = prices.ffill().pct_change(fill_method=None).dropna()

# Handle missing values
# No further filling is needed: after dropna() the frame contains no NaN.
# (The previous fillna(method='ffill').fillna(0) step was a no-op here and
# used a keyword that newer pandas versions deprecate.)
if returns.empty:
    # Fail with an actionable message instead of an opaque scaler error,
    # e.g. when the download failed or one ticker has no data in the range.
    raise ValueError(
        'No return observations left after preprocessing; '
        'check that every ticker was downloaded for the requested date range.'
    )

# Normalize returns: StandardScaler standardises each ticker column to zero
# mean and unit variance; the original index and column labels are kept.
scaler = StandardScaler()
returns_scaled = pd.DataFrame(scaler.fit_transform(returns), index=returns.index, columns=returns.columns)

print('Data shape:', returns.shape)
returns.head()