In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.optimize import minimize
from datetime import datetime, timedelta

In [3]:
import numpy as np
import scipy
import sklearn

print("Numpy version:", np.__version__)
print("Scipy version:", scipy.__version__)
print("Scikit-learn version:", sklearn.__version__)


Numpy version: 1.24.3
Scipy version: 1.10.1
Scikit-learn version: 1.3.0


In [4]:
# ==========================
# Step 1: Data Collection (Live Data Integration)
# ==========================

def fetch_sp500_tickers():
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    sp500_table = pd.read_html(url)[0]
    return sp500_table[['Symbol', 'GICS Sector']]

def fetch_stock_data(tickers, start_date, end_date):
    data = yf.download(tickers, start=start_date, end=end_date)['Adj Close']
    return data.dropna(axis=1)

sp500_companies = fetch_sp500_tickers()
tickers = sp500_companies['Symbol'].tolist()

start_date = "2018-01-01"
end_date = datetime.today().strftime('%Y-%m-%d')  # Get latest available data
data = fetch_stock_data(tickers, start_date, end_date)

[**********************93%********************   ]  467 of 503 completedCould not get exchangeTimezoneName for ticker 'ETR' reason: 'chart'
[*********************100%***********************]  503 of 503 completed

3 Failed downloads:
['BRK.B', 'ETR']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
['BF.B']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 2018-01-01 -> 2025-02-09)')


In [5]:
# ==========================
# Step 2: Feature Engineering (Risk, Liquidity & Macro)
# ==========================

returns = data.pct_change().dropna()
volatility = returns.rolling(window=20).std().dropna()

def compute_beta(stock_returns, market_returns):
    covariance = np.cov(stock_returns, market_returns)[0, 1]
    market_variance = np.var(market_returns)
    return covariance / market_variance

sp500_returns = returns.mean(axis=1)  # Approximate S&P 500 market return

features = pd.DataFrame({
    "mean_return": returns.mean() * 252,
    "volatility": volatility.mean(),
    "cumulative_return": (data.iloc[-1] / data.iloc[0]) - 1,
    "sharpe_ratio": returns.mean() / returns.std(),
    "sortino_ratio": returns.mean() / returns[returns < 0].std(),
    "max_drawdown": (data / data.cummax() - 1).min(),
    "var_95": returns.quantile(0.05),
    "beta": [compute_beta(returns[ticker], sp500_returns) for ticker in returns.columns],
    "liquidity": data.mean(),  # Approximate by average price
})

# Standardize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

ValueError: Found array with 0 sample(s) (shape=(0, 9)) while a minimum of 1 is required by StandardScaler.