In [None]:
from math import sqrt

import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
plt.rc("font", size=10)

from openbb_terminal.sdk import openbb

In [None]:
# Build the list of Dow Jones Industrial Average tickers from Wikipedia and
# download their price history for 2016-01-01 .. 2020-01-01.

# Tickers dropped before the download.
# NOTE(review): presumably excluded because they lack data for the whole
# window -- confirm the reason before extending this set.
DJI_EXCLUDED = {"DOW"}

# The constituents table is picked by its position on the page ([1]); this
# index may need adjusting if the Wikipedia article layout changes.
dji = pd.read_html('https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average')[1]

# Filter rather than list.remove(): remove() raises ValueError as soon as an
# excluded ticker is no longer present in the live table.
dji_symbols = [symbol for symbol in dji.Symbol.tolist() if symbol not in DJI_EXCLUDED]

dji_data = openbb.stocks.ca.hist(
    dji_symbols,
    start_date="2016-01-01",
    end_date="2020-01-01",
)

In [None]:
# Build the list of S&P 500 tickers from Wikipedia and download their price
# history for 2016-01-01 .. 2020-01-01.

# Tickers dropped before the download.
# NOTE(review): presumably these either lack data for the whole window or use
# a symbol format the data source does not accept (e.g. "BRK.B") -- confirm.
SP500_EXCLUDED = {
    'BRK.B', 'BF.B', 'CARR', 'CDAY', 'CEG', 'CTVA', 'DOW', 'FTV', 'FOXA', 'FOX',
    'GEHC', 'HWM', 'IR', 'INVH', 'LW', 'MRNA', 'OGN', 'OTIS', 'ROP', 'VICI',
}

# The constituents table is the first table on the page ([0]).
sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

# Filter rather than calling list.remove() once per ticker: remove() raises
# ValueError (aborting the whole cell) as soon as any excluded ticker has
# left the index or been renamed in the live table.
sp500_symbols = [
    symbol for symbol in sp500.Symbol.tolist() if symbol not in SP500_EXCLUDED
]

sp500_data = openbb.stocks.ca.hist(
    sp500_symbols,
    start_date="2016-01-01",
    end_date="2020-01-01",
)

In [None]:
# Build the list of S&P 100 tickers from Wikipedia and download their price
# history for 2016-01-01 .. 2020-01-01.

# Tickers dropped before the download.
# NOTE(review): presumably excluded because of missing data or an unsupported
# symbol format -- confirm the reason before extending this set.
SP100_EXCLUDED = {'BRK.B', 'DOW'}
# Tickers added on top of the scraped constituents.
# NOTE(review): the reason for including 'CP' is not evident here -- confirm.
SP100_EXTRA = ['CP']

# The constituents table is picked by its position on the page ([2]); this
# index may need adjusting if the Wikipedia article layout changes.
sp100 = pd.read_html('https://en.wikipedia.org/wiki/S%26P_100')[2]

# Filter rather than list.remove(): remove() raises ValueError as soon as an
# excluded ticker is no longer present in the live table.
sp100_symbols = [
    symbol for symbol in sp100.Symbol.tolist() if symbol not in SP100_EXCLUDED
] + SP100_EXTRA

sp100_data = openbb.stocks.ca.hist(
    sp100_symbols,
    start_date="2016-01-01",
    end_date="2020-01-01",
)

In [None]:
# Per-ticker mean and standard deviation of period-over-period percentage
# changes, scaled by 252 (mean) and sqrt(252) (std).
# NOTE(review): 252 is presumably the number of trading days per year, i.e.
# this annualises daily figures -- confirm sp100_data is daily.
periods_per_year = 252

period_returns = sp100_data.pct_change()
summary = period_returns.describe().T[["mean", "std"]]
summary = summary.rename(columns={"mean": "returns", "std": "vol"})

# Column-wise scaling: "returns" by the period count, "vol" by its square root.
moments = summary * [periods_per_year, sqrt(periods_per_year)]

In [None]:
# Elbow curve: fit K-means for each candidate cluster count and plot the
# inertia (sum of squared distances to the closest centroid) of every fit.
cluster_counts = range(2, 15)

# random_state is pinned so the curve is identical on every run; without it
# each Restart & Run All produces a slightly different plot.
sse = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(moments).inertia_
    for k in cluster_counts
]

plt.plot(cluster_counts, sse)
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")
plt.title("Elbow Curve");

In [None]:
# Cluster the tickers in `moments` by (returns, vol), draw them as a scatter
# plot coloured by cluster, and collect the tickers of each cluster in
# `asset_groups` (cluster id -> list of tickers).
N_CLUSTERS = 3
# Tickers whose name and cluster id are written next to their point.
LABELLED_TICKERS = {"BLK"}

# random_state is pinned so cluster ids -- and therefore the contents of each
# asset_groups entry -- stay the same across re-runs.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=11, random_state=42).fit(moments)

plt.scatter(
    moments.returns,
    moments.vol,
    c=kmeans.labels_,
    cmap="rainbow",
)
# Title is derived from the data actually plotted (moments comes from
# sp100_data) and from the cluster count actually used.
plt.title(f"S&P 100 stocks by return and volatility (K={N_CLUSTERS})")
plt.xlabel("returns")
plt.ylabel("vol")

# One bucket per cluster id, sized from N_CLUSTERS instead of a hand-written
# list of keys, so changing N_CLUSTERS can never cause a KeyError below.
asset_groups = {cluster: [] for cluster in range(N_CLUSTERS)}

for ticker, cluster, ret, vol in zip(
    moments.index, kmeans.labels_, moments.returns, moments.vol
):
    asset_groups[int(cluster)].append(ticker)
    if ticker in LABELLED_TICKERS:
        # Nudge the label slightly above the point so it does not overlap it.
        plt.annotate(f"{ticker} ({cluster})", (ret, vol + 0.01))
    

In [None]:
# Show the tickers assigned to one cluster; change the id to inspect another.
selected_cluster = 1
print(asset_groups[selected_cluster])