[Reference](https://wire.insiderfinance.io/clustering-stock-price-data-26b097d1ae53)

In [1]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from concurrent import futures
import numpy as np
import pandas_datareader.data as web
from scipy.stats import gaussian_kde
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, set_link_color_palette
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
# Folder where the per-ticker price CSVs are written and later read back.
data_dir = "./data/stock_data_for_clustering"
# Create the folder (and any missing parents); an existing folder is left as is.
os.makedirs(name=data_dir, exist_ok=True)

# Download Dataset (concurrent processing)


In [3]:
# Scrape the S&P 500 constituents from Wikipedia (needs network access).
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
# NOTE(review): assumes the constituents table is the first table on the page — confirm if the page layout changes.
first_table = tables[0]
print(first_table.shape)
# Rename symbols to avoid symbol errors downstream: "." becomes "-" (the download log shows e.g. BRK-B, BF-B).
# Literal (non-regex) vectorised replacement; missing values pass through instead of raising.
first_table["Symbol"] = first_table["Symbol"].str.replace(".", "-", regex=False)
sp500_tickers = list(first_table["Symbol"])
first_table.head()

(504, 9)


Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
1,AOS,A. O. Smith,reports,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
3,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981


In [7]:
def download_stock(stock):
    """Download daily prices for one ticker and save them as ``{data_dir}/{stock}.csv``.

    Relies on the notebook-level names ``web``, ``start_time``, ``end_time``,
    ``data_dir`` and ``bad_names``; they must be defined before this is called.

    Args:
        stock: Ticker symbol passed to the 'yahoo' data reader.

    Returns:
        None. A failed download does not raise: the ticker is appended to
        ``bad_names`` and the reason is printed.
    """
    try:
        print(stock)
        # NOTE(review): the recorded output shows every request failing — confirm the
        # 'yahoo' source still works with the installed pandas_datareader.
        stock_df = web.DataReader(stock, 'yahoo', start_time, end_time)
        stock_df['Name'] = stock
        output_name = f"{data_dir}/{stock}.csv"
        stock_df.to_csv(output_name)
    except Exception as exc:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still
        # stop the run, and report the reason so failures can be diagnosed.
        bad_names.append(stock)
        print(f'bad: {stock} ({exc!r})')
""" set the download window """
# Download window read by download_stock().
start_time = dt.datetime(2021, 12, 1)
end_time = dt.datetime(2022, 3, 1)
bad_names = []  # to keep track of failed queries
# Set the maximum thread number.
max_workers = 20
now = dt.datetime.now()
# Use fewer threads when fewer tickers than max_workers were passed in, but never
# fewer than one: ThreadPoolExecutor rejects a worker count of 0.
workers = max(1, min(max_workers, len(sp500_tickers)))
with futures.ThreadPoolExecutor(workers) as executor:
    # Drain the lazy result iterator so an exception that escapes a worker is
    # re-raised here instead of being silently dropped.
    res = list(executor.map(download_stock, sp500_tickers))
# Save failed queries to a text file to retry.
if bad_names:
    with open(f'{data_dir}/failed_queries.txt', 'w') as outfile:
        outfile.writelines(f"{name}\n" for name in bad_names)
finish_time = dt.datetime.now()
duration = finish_time - now
# total_seconds() covers the whole elapsed time; .seconds alone drops whole days.
minutes, seconds = divmod(int(duration.total_seconds()), 60)
print(f'The threaded script took {minutes} minutes and {seconds} seconds to run.')
print(f"{len(bad_names)} stocks failed: ", bad_names)

MMM
AOS
ABT
ABBV
ABMD
ACN
ATVI
ADM
ADBE
ADP
AAP
AES
AFL
A
AIG
APD
AKAM
ALKALB

ARE
bad: ABBV
ALGN
bad: ACN
bad: AFL
ALLE
LNT
bad: MMMbad: AOS
ALL

bad: ALB
bad: A
bad: APD
GOOGL
GOOG
bad: AAP
bad: ATVI
MO
AMZN
AMCR
bad: ABMD
bad: ADMbad: ADBE
AMD
AEE

bad: ADPAAL
bad: AIG
bad: AES
AEP
AXP

bad: ALK
AMTbad: ABT
AWK
AMP

ABC
bad: ARE
bad: AKAM
AMEAMGN

bad: ALGNbad: ALLE

APH
ADI
bad: GOOGLbad: GOOG

ANSSANTM

bad: ALL
AON
bad: AAL
bad: AMCR
APA
AAPLbad: LNT

AMAT
bad: AEE
bad: AMZN
bad: AMPAPTV
ANET
bad: AMT
bad: AMGN
bad: MO
AIZ

TATO
bad: AWK
ADSK
bad: AXP
AZO

AVB
bad: AMD
bad: AEPAVY

bad: AME
BKRbad: ABCBLL

bad: ADI
BAC

BBWI
bad: AON
BAX
bad: ANTMbad: APH
BDX
bad: ANSS

WRB
BRK-B
bad: APA
BBY
bad: ANET
bad: AMATBIO

bad: APTV
TECH
bad: AIZbad: AAPL
BIIB

bad: ATOBLK
BK

BAbad: AVB

BKNG
bad: AZO
bad: ADSKBWA
bad: BLL
bad: T
BXP

bad: BKR
BSX
BMY
AVGO
bad: BAC
BR
bad: BBWI
bad: BRK-B
bad: BDXbad: BAX
BRO
BF-B

bad: TECHbad: BBY
CHRW
CDNS
CZR

CPT
bad: AVY
CPB
bad: WRB
COF
bad: BII

# Preprocessing data


In [9]:
# Sorted so the column order of the final table (and everything clustered from it)
# does not depend on the file-system listing order.
historical_stock_data_files = sorted(glob.glob(os.path.join(data_dir, "*.csv")))
reference_day = "2021-12-31"  # closing price on this day is the 0 % baseline
start_day = "2022-01-03"
midterm_day = "2022-01-18"
end_day = "2022-01-31"
price_change_list = []
tickers_to_ignore = []
for csv_path in historical_stock_data_files:
    ticker = os.path.splitext(os.path.basename(csv_path))[0]
    prices = pd.read_csv(csv_path, index_col=["Date"])
    try:
        price_close = prices.loc[reference_day:end_day, ["Close"]]
        # Percentage change of each close relative to the reference-day close.
        price_change = (price_close / price_close.loc[reference_day, "Close"] - 1) * 100
    except KeyError:
        # some stocks started trading after 2021-12-31
        print(ticker)
        tickers_to_ignore.append(ticker)
        continue
    # Drop the first row (the reference day itself, always 0 %) and name the column after the ticker.
    price_change = price_change.iloc[1:, :].rename(columns={"Close": ticker})
    price_change_list.append(price_change)

if not price_change_list:
    # pd.concat would raise a bare "No objects to concatenate"; say what is actually wrong.
    raise ValueError(f"no usable price CSVs found in {data_dir!r}; run the download cell first")

# One column per ticker, one row per day after the reference day.
df = pd.concat(price_change_list, axis=1)
print(df.shape)
df.head()

In [10]:
# One feature per stock: the return on the last day only.
data_1 = df.loc[[end_day]]
display(data_1.head())

# Three features per stock: returns on the start, midterm and end days.
data_2 = df.loc[[start_day, midterm_day, end_day]]
display(data_2.head())

# Every available day as a feature (same object as df, not a copy).
data_3 = df
display(data_3.head())

# Exploratory Data Analysis (EDA)

In [11]:
# Keep only constituents that actually have a column in the price table. This drops
# the tickers listed in tickers_to_ignore and also those whose download failed (no CSV);
# the latter would make the .loc lookup below raise KeyError.
first_table = first_table[first_table["Symbol"].isin(data_1.columns)]
industry_list = list(first_table["GICS Sector"].unique())
# One array of end-day returns per sector, in the same order as industry_list.
performance_by_industry = [
    data_1.loc[:, first_table.loc[first_table["GICS Sector"] == sector, "Symbol"]].values[0]
    for sector in industry_list
]
plt.boxplot(performance_by_industry, labels=industry_list)
plt.xticks(rotation=90)
plt.xlabel("sector")
plt.ylabel(f"performance(%) at {end_day}")
plt.show()

In [12]:
# Draw the full return series of a few hand-picked tickers on one chart.
examples = ["AAPL", "TSLA", "BMY"]
for x in examples:
    series = data_3[x]
    plt.plot(series, label=x)
plt.ylabel("performance(%)")
plt.xticks(rotation=90)
plt.legend(loc="upper right")
plt.show()

# Clustering


## One-dimensional data


In [13]:
# Overlay a Gaussian kernel density estimate on the histogram of end-day returns.
kde = gaussian_kde(data_1.values)
x = np.linspace(-35, 35, 1000)  # evaluation grid for the density curve
y = kde(x)
end_day_returns = data_1.loc[end_day, :]
plt.hist(end_day_returns, bins=20, density=True)
plt.plot(x, y)
plt.xlabel(f"performance(%) at {end_day}")
plt.show()

In [14]:
# Split the end-day returns by sign and draw one histogram per group.
# Masking keeps the frame's shape; values that fail the condition become NaN.
losers = data_1[data_1 < 0]
gainers = data_1[data_1 >= 0]
for group in (losers, gainers):
    plt.hist(group.loc[end_day, :], bins=10)
plt.xlabel(f"performance(%) at {end_day}")
plt.show()

In [15]:
# K-means on the single end-day return per stock; one histogram per cluster.
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(data_1.T.values)  # transpose so each stock is one sample
for i in range(n_clusters):
    in_cluster = kmeans.labels_ == i
    plt.hist(data_1.loc[end_day, in_cluster], bins=8)
plt.xlabel(f"performance(%) at {end_day}")
plt.show()

In [16]:
# Synthetic 1-D data: three normal blobs centred at 0, -8 and +8 (std 1, 100 points each).
# A seeded generator makes the sample, and the clustering run on it, identical on every run.
rng = np.random.default_rng(0)
dummy_zero = rng.normal(0, 1, 100)
dummy_minus = rng.normal(-8, 1, 100)
dummy_plus = rng.normal(8, 1, 100)
dummy = np.concatenate([dummy_zero, dummy_minus, dummy_plus])
plt.hist(dummy, bins=25)
plt.xlabel("performance(%)")
plt.show()

In [17]:
# K-means on the synthetic sample; one histogram per recovered cluster.
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(dummy.reshape(-1, 1))  # column vector: one sample per row
for i in range(n_clusters):
    members = dummy[kmeans.labels_ == i]
    plt.hist(members, bins=8)
plt.xlabel("performance(%)")
plt.show()

## Two-dimensional data


In [18]:
def scatter_hist(x, y, ax, ax_histx, ax_histy, binwidth=0.25):
    """Draw a scatter plot of ``x`` vs ``y`` with marginal histograms.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        ax: Axes that receives the scatter plot.
        ax_histx: Axes that receives the histogram of ``x``.
        ax_histy: Axes that receives the horizontal histogram of ``y``.
        binwidth: Width of each histogram bin (default 0.25).

    Both histograms share the same symmetric bin edges, spanning ``-lim`` to
    ``lim`` where ``lim`` is the largest absolute value in ``x`` or ``y``
    rounded up to the next multiple of ``binwidth``.
    """
    # no labels
    ax_histx.tick_params(axis="x", labelbottom=False)
    ax_histy.tick_params(axis="y", labelleft=False)
    # the scatter plot:
    ax.scatter(x, y, alpha=0.5)
    # now determine nice limits by hand:
    xymax = max(np.max(np.abs(x)), np.max(np.abs(y)))
    lim = (int(xymax / binwidth) + 1) * binwidth
    bins = np.arange(-lim, lim + binwidth, binwidth)
    ax_histx.hist(x, bins=bins)
    ax_histy.hist(y, bins=bins, orientation='horizontal')
# Joint view of midterm-day vs end-day returns: scatter in the large lower-left
# panel, marginal histograms above it and to its right.
fig = plt.figure(figsize=(8, 8))
gs = fig.add_gridspec(
    2, 2,
    width_ratios=(7, 2), height_ratios=(2, 7),
    left=0.1, right=0.9, bottom=0.1, top=0.9,
    wspace=0.05, hspace=0.05,
)
ax = fig.add_subplot(gs[1, 0])
ax_histx = fig.add_subplot(gs[0, 0], sharex=ax)  # top margin shares the x-axis
ax_histy = fig.add_subplot(gs[1, 1], sharey=ax)  # right margin shares the y-axis
midterm_returns = data_2.loc[midterm_day, :]
end_returns = data_2.loc[end_day, :]
scatter_hist(midterm_returns, end_returns, ax, ax_histx, ax_histy)
ax.set_xlabel(f"performance(%) at {midterm_day}")
ax.set_ylabel(f"performance(%) at {end_day}")
plt.show()

In [19]:
# Joint view of start-day vs end-day returns: scatter in the large lower-left
# panel, marginal histograms above it and to its right.
fig = plt.figure(figsize=(8, 8))
gs = fig.add_gridspec(
    2, 2,
    width_ratios=(7, 2), height_ratios=(2, 7),
    left=0.1, right=0.9, bottom=0.1, top=0.9,
    wspace=0.05, hspace=0.05,
)
ax = fig.add_subplot(gs[1, 0])
ax_histx = fig.add_subplot(gs[0, 0], sharex=ax)  # top margin shares the x-axis
ax_histy = fig.add_subplot(gs[1, 1], sharey=ax)  # right margin shares the y-axis
start_returns = data_2.loc[start_day, :]
end_returns = data_2.loc[end_day, :]
scatter_hist(start_returns, end_returns, ax, ax_histx, ax_histy)
ax.set_xlabel(f"performance(%) at {start_day}")
ax.set_ylabel(f"performance(%) at {end_day}")
plt.show()

In [20]:
# K-means on the three selected days per stock, shown in the midterm/end plane.
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(data_2.T.values)  # transpose so each stock is one sample
for i in range(n_clusters):
    in_cluster = kmeans.labels_ == i
    plt.scatter(
        data_2.loc[midterm_day, in_cluster],
        data_2.loc[end_day, in_cluster],
        alpha=0.5,
    )
plt.xlabel(f"performance(%) at {midterm_day}")
plt.ylabel(f"performance(%) at {end_day}")
plt.show()

In [21]:
# Average-linkage hierarchical clustering on the midterm-day and end-day returns
# (the label slice keeps the rows from midterm_day onward); each stock is one observation.
result = linkage(data_2.loc[midterm_day:, :].T.values, metric='euclidean', method='average')
# Colours used for the dendrogram links and reused by the scatter plot that follows.
color_list = ["orange", "green"]
set_link_color_palette(color_list)
dendrogram(result)
plt.title("Dendrogram")  # fixed typo: was "Dedrogram"
plt.ylabel("Distance")
plt.show()

In [22]:
# Cut the hierarchy at distance 20 to get one flat cluster label per stock.
labels = fcluster(result, t=20, criterion="distance")
# Sorted ids give a stable colour/legend order. The colour index wraps around so a
# cut that yields more clusters than colours does not raise IndexError.
for i, x in enumerate(sorted(set(labels))):
    in_cluster = labels == x
    plt.scatter(data_2.loc[midterm_day, in_cluster],
        data_2.loc[end_day, in_cluster],
        alpha=0.5, color=color_list[i % len(color_list)], label=f"Cluster {x}")
plt.xlabel(f"performance at {midterm_day}")
plt.ylabel(f"performance at {end_day}")
plt.legend()
plt.show()

In [23]:
# Sector make-up of flat cluster 1: count its member tickers per GICS sector.
cluster_1 = data_2.columns[labels == 1]
in_cluster_1 = first_table["Symbol"].isin(cluster_1)
sector_counts = first_table.loc[in_cluster_1, "GICS Sector"].value_counts()
sectors = sector_counts.index
plt.figure(figsize=(5, 3))
plt.bar(sectors, sector_counts)
plt.xticks(rotation=90)
plt.show()

In [24]:
# Sector make-up of flat cluster 2: count its member tickers per GICS sector.
cluster_2 = data_2.columns[labels == 2]
in_cluster_2 = first_table["Symbol"].isin(cluster_2)
sector_counts = first_table.loc[in_cluster_2, "GICS Sector"].value_counts()
sectors = sector_counts.index
plt.figure(figsize=(5, 3))
plt.bar(sectors, sector_counts)
plt.xticks(rotation=90)
plt.show()

## Three-dimensional data


In [25]:
# Average-linkage hierarchical clustering on all three selected days
# (start, midterm, end); each stock is one observation.
result = linkage(data_2.T.values, metric='euclidean', method='average')
# Colours used for the dendrogram links and reused by the scatter plots that follow.
color_list = ["orange", "green", "red"]
set_link_color_palette(color_list)
dendrogram(result)
plt.title("Dendrogram")  # fixed typo: was "Dedrogram"
plt.ylabel("Distance")
plt.show()

In [26]:
# Cut the hierarchy at distance 20 to get one flat cluster label per stock.
labels = fcluster(result, t=20, criterion="distance")
# Sorted ids give a stable colour/legend order. The colour index wraps around so a
# cut that yields more clusters than colours does not raise IndexError.
for i, x in enumerate(sorted(set(labels))):
    in_cluster = labels == x
    plt.scatter(data_2.loc[midterm_day, in_cluster],
        data_2.loc[end_day, in_cluster],
        alpha=0.5, color=color_list[i % len(color_list)], label=f"Cluster {x}")
plt.xlabel(f"performance at {midterm_day}")
plt.ylabel(f"performance at {end_day}")
plt.legend()
plt.show()

In [27]:
# Compute the flat labels first so the centroids and the scatter below are
# guaranteed to use the same assignment (no dependence on an earlier cell's labels).
labels = fcluster(result, t=20, criterion="distance")
cluster_ids = sorted(set(labels))

# Centroid of a cluster = mean return of its member stocks on each selected day.
centroids_list = [
    pd.DataFrame({x: data_2.loc[:, labels == x].mean(axis=1)}) for x in cluster_ids
]
centroids_df = pd.concat(centroids_list, axis=1)
print("Centroids of each cluster.")
display(centroids_df)

for i, x in enumerate(cluster_ids):
    in_cluster = labels == x
    # The colour index wraps around so extra clusters cannot raise IndexError.
    plt.scatter(data_2.loc[midterm_day, in_cluster],
        data_2.loc[end_day, in_cluster],
        alpha=0.5, color=color_list[i % len(color_list)], label=f"Cluster {x}")
    # Mark the cluster centroid with a black star.
    plt.scatter(centroids_df.loc[midterm_day, x],
        centroids_df.loc[end_day, x], color="black", marker="*")
plt.xlabel(f"performance at {midterm_day}")
plt.ylabel(f"performance at {end_day}")
plt.legend()
plt.show()

## 20-dimensional data


In [28]:
# Project every stock's full return series down to 2-D with t-SNE for plotting.
tsne = TSNE(n_components=2, random_state=41)
X_reduced = tsne.fit_transform(data_3.T)  # transpose so each stock is one sample
print(X_reduced.shape)
plt.figure(figsize=(13, 7))
first_component = X_reduced[:, 0]
second_component = X_reduced[:, 1]
plt.scatter(first_component, second_component, alpha=0.5)
plt.show()

In [29]:
# Colour the t-SNE embedding by the flat hierarchical-cluster labels.
plt.figure(figsize=(13, 7))
for x in set(labels):
    members = labels == x
    plt.scatter(
        X_reduced[members, 0],
        X_reduced[members, 1],
        alpha=0.8,
        label=f"Cluster {x}",
    )
plt.legend()
plt.show()

In [30]:
# Run DBSCAN on the 2-D embedding and colour the points by the labels it assigns.
clustering = DBSCAN(eps=1.7, min_samples=3)
clustering.fit(X_reduced)
labels_dbscan = clustering.labels_
print(set(labels_dbscan))
plt.figure(figsize=(13, 7))
for x in set(labels_dbscan):
    members = labels_dbscan == x
    plt.scatter(
        X_reduced[members, 0],
        X_reduced[members, 1],
        alpha=0.8,
        label=f"Cluster {x}",
    )
plt.legend()
plt.show()

In [31]:
cluster_ids = sorted(set(labels_dbscan))

# Centroid of a cluster = mean return of its member stocks on each day.
centroids_list = [
    pd.DataFrame({x: data_3.loc[:, labels_dbscan == x].mean(axis=1)}) for x in cluster_ids
]
centroids_df = pd.concat(centroids_list, axis=1)
print("Centroids of each cluster.")
# For ease of thinking, only 3 days are shown.
display(centroids_df.loc[[start_day, midterm_day, end_day], :])

# Plot whatever clusters DBSCAN actually produced instead of assuming exactly 0..4,
# which raised KeyError with fewer clusters and silently dropped any beyond the fifth.
for x in cluster_ids:
    if x == -1:
        # NOTE(review): -1 is presumably DBSCAN's noise label (the other cells skip it too) — confirm.
        continue
    plt.plot(centroids_df.loc[:, x], label=f"Cluster {x}")
plt.xticks(rotation=90)
plt.legend(loc="lower left")
plt.ylim(-20, 20)
plt.ylabel("performance(%)")
plt.show()

In [32]:
# One panel per cluster; label -1 gets no panel (the original cell hid it as well).
cluster_ids = [x for x in sorted(set(labels_dbscan)) if x != -1]
ncols = 2
nrows = max(1, -(-len(cluster_ids) // ncols))  # ceiling division, at least one row
# Size the grid to the number of clusters (a fixed 3x2 grid raised IndexError for
# more than six labels). 4 inches per row reproduces the original 15x12 for three rows.
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 4 * nrows), squeeze=False)
# axes.T.flat walks the grid column by column, matching the original axes[i%3][i//3] layout.
for i, ax in enumerate(axes.T.flat):
    if i >= len(cluster_ids):
        ax.axis('off')  # hide panels that have no cluster
        continue
    x = cluster_ids[i]
    ax.plot(data_3.loc[:, labels_dbscan==x])
    ax.set_title(f"Cluster {x}")
    ax.set_ylim(-40, 40)
    ax.get_xaxis().set_visible(False)
    ax.grid(axis='y',linestyle='dotted', color='b')
plt.show()

In [33]:
# Print up to five randomly chosen member tickers for each cluster (label -1 is skipped).
for x in sorted(set(labels_dbscan)):
    if x == -1:
        continue
    members = data_3.loc[:, labels_dbscan==x].T
    print(f"Cluster {x}")
    # Clusters can have fewer than 5 members (DBSCAN was run with min_samples=3);
    # sampling more rows than exist raises ValueError, so cap the sample size.
    print(list(members.sample(min(5, len(members))).index))
# Cluster 0
# ['LRCX', 'EXR', 'NDAQ', 'ROK', 'CDW']
# Cluster 1
# ['TDY', 'UAL', 'ILMN', 'MMM', 'AIZ']
# Cluster 2
# ['APA', 'BKR', 'HAL', 'VTRS', 'WFC']
# Cluster 3
# ['COF', 'VNO', 'CCL', 'CPB', 'AIG']
# Cluster 4
# ['QCOM', 'HAS', 'ALB', 'SIVB', 'TEL']