In [11]:
import pandas as pd
import numpy as np
from pandas_datareader import data, wb
from datetime import datetime
import matplotlib.pyplot as plt
from scipy import stats, integrate
from pypfopt.expected_returns import mean_historical_return
from pypfopt.risk_models import CovarianceShrinkage
import bs4 as bs
import requests
import yfinance as yf
import seaborn as sns

import cvxpy as cp

## Get Universe - Current S&P 500 stocks that existed in 2002

In [12]:
import os

# Only hit the network when no cached price panel exists on disk.
if not os.path.exists('SP_Close.pkl'):
    # Scrape the current S&P 500 constituents table from Wikipedia.
    resp = requests.get(
        'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
        # Identify the client explicitly; some sites reject the default library User-Agent.
        headers={'User-Agent': 'Mozilla/5.0 (compatible; sp500-universe-notebook)'},
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    resp.raise_for_status()
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        # Fall back to the table id in case extra CSS classes were added to the table.
        table = soup.find('table', {'id': 'constituents'})
    if table is None:
        raise RuntimeError('Could not locate the S&P 500 constituents table on the Wikipedia page')

    # First row is the header; the ticker symbol is the first cell of each data row.
    tickers = []
    for row in table.findAll('tr')[1:]:
        cells = row.findAll('td')
        if not cells:
            continue  # skip rows without data cells
        tickers.append(cells[0].text.strip())

    # NOTE(review): symbols such as "BRK.B" are passed to Yahoo unchanged; Yahoo may expect
    # a dash instead of a dot, in which case those columns come back empty - confirm.
    start = datetime(2000,1,1)
    end = datetime(2022,1,1)
    # auto_adjust=False keeps the separate 'Adj Close' column that the next cell selects.
    data = yf.download(tickers, start=start, end=end, auto_adjust=False)

In [13]:
# Load the cached adjusted-close panel when present; otherwise derive it from the download.
if os.path.exists('SP_Close.pkl'):
    SP_Close = pd.read_pickle('SP_Close.pkl')
else:
    data.index = pd.to_datetime(data.index)
    data = data.sort_index()
    # Drop dates on which every ticker is missing, then drop every ticker with any gap.
    SP_Close = data['Adj Close'].dropna(axis=0, how='all').dropna(axis=1)

pd.set_option('display.max_rows', 20)
SP_Close

Unnamed: 0_level_0,A,AAPL,ABC,ABMD,ABT,ADBE,ADI,ADM,ADP,ADSK,...,WMT,WRB,WST,WY,XEL,XOM,XRAY,YUM,ZBRA,ZION
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-03,44.221344,0.854541,3.025873,18.250000,9.347846,16.274675,29.523603,6.713816,25.847561,8.052906,...,44.580296,1.278264,5.451855,12.421493,7.445602,19.671473,6.852272,4.861766,25.027779,38.801506
2000-01-04,40.843311,0.782494,2.819287,17.812500,9.080765,14.909398,28.030031,6.643880,25.847561,7.660817,...,42.912186,1.238933,5.508176,11.954857,7.617051,19.294680,6.852272,4.764044,24.666668,36.922596
2000-01-05,38.309803,0.793945,3.038025,18.000000,9.064073,15.204173,28.439224,6.538982,25.599024,7.178244,...,42.036430,1.219267,5.485648,12.588147,7.910958,20.346552,6.970727,4.788472,25.138889,36.878899
2000-01-06,36.851112,0.725238,3.268915,18.031250,9.381230,15.328291,27.661741,6.573946,25.940754,6.740914,...,42.495152,1.290063,5.429327,13.177001,7.837479,21.398418,6.979842,4.747756,23.777779,37.403233
2000-01-07,39.922039,0.759592,3.694239,17.937500,9.481387,16.072985,28.439224,6.678852,26.531029,7.540174,...,45.706272,1.270398,5.485648,12.743696,7.837479,21.335625,6.961617,4.641887,23.513889,37.490639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27,158.282700,179.836319,131.471329,357.829987,140.403183,577.679993,174.617828,65.720932,245.068527,284.299988,...,139.709854,54.160683,465.444122,38.111694,66.356621,61.211414,55.700321,137.372253,606.330017,62.687782
2021-12-28,158.721420,178.799164,131.929840,357.440002,139.420593,569.359985,173.533127,66.576385,244.849625,282.769989,...,141.714783,54.566757,464.425049,38.303886,67.151077,61.013607,55.779961,137.342392,597.320007,62.787273
2021-12-29,160.187180,178.888916,132.906647,361.839996,140.135193,569.289978,174.667587,66.655968,246.779999,282.570007,...,141.645309,54.932884,467.542084,39.091873,67.488716,60.479530,56.397194,138.019257,601.119995,63.125534
2021-12-30,160.416534,177.712143,133.335266,362.059998,139.946625,570.530029,173.931168,66.506760,244.342148,281.709991,...,142.101868,54.273849,470.888885,39.370552,67.756844,60.123474,56.058052,137.750504,597.539978,62.807171


In [14]:
# Persist the cleaned price panel once; later runs load it from this pickle.
if not os.path.exists('SP_Close.pkl'):
    SP_Close.to_pickle('SP_Close.pkl')

In [15]:
# Sanity check: per-column dtypes followed by the (rows, columns) shape.
print(SP_Close.dtypes, SP_Close.shape, sep='\n')

A       float64
AAPL    float64
ABC     float64
ABMD    float64
ABT     float64
         ...   
XOM     float64
XRAY    float64
YUM     float64
ZBRA    float64
ZION    float64
Length: 367, dtype: object
(5536, 367)


In [52]:
from sklearn.cluster import AgglomerativeClustering

class constrained_clusterings_new:
    """Precompute Ward clusterings of a correlation matrix for every cluster count.

    Usage::

        cc = constrained_clusterings_new(correlation_matrix)
        optimal_num_clusters = cc.calcthresholds(corr_threshold, pct_threshold)
        labels = cc.get_clusters(optimal_num_clusters)

    The optimal number of clusters is the minimum number of clusters such that
    the fraction of stocks sitting in clusters whose internal correlation is
    less than ``corr_threshold`` is itself less than ``pct_threshold``.
    """

    def __init__(self, corr):
        """Store the correlation matrix and precompute all clusterings.

        Parameters
        ----------
        corr : pandas.DataFrame
            Square correlation matrix; ``corr.columns`` supplies the stock names.
        """
        self.corr = corr                     # correlation matrix
        self.sz = corr.shape[0]              # number of stocks

        self.__precompute()

        self.names = pd.DataFrame(corr.columns.values)  # stock names, one per row

    def __precompute(self):
        """Fit one clustering per cluster count 1..sz and record per-cluster statistics.

        Row ``i`` of each table describes the clustering with ``i + 1`` clusters;
        slots for cluster ids that do not exist in that clustering stay 0.
        """
        sz = self.sz
        corr_np = np.array(self.corr)

        self.clusters = np.zeros((sz, sz))    # cluster assignment of each stock
        self.numstocks = np.zeros((sz, sz))   # stocks per cluster
        self.internals = np.zeros((sz, sz))   # mean correlation inside each cluster

        for i in range(sz):
            # Ward linkage only works with Euclidean distances, which is also the
            # estimator default, so the deprecated `affinity` keyword is omitted.
            labels = AgglomerativeClustering(n_clusters=i + 1, linkage='ward').fit_predict(corr_np)
            self.clusters[i, :] = labels
            for j in range(i + 1):
                members = labels == j
                self.numstocks[i, j] = np.sum(members)
                # Mean over the full within-cluster block, diagonal included.
                self.internals[i, j] = np.mean(corr_np[members][:, members])

    def calcthresholds(self, corr_threshold, pct_threshold):
        """Return the smallest cluster count meeting the thresholds, or None.

        For each cluster count, sums the share of stocks that belong to clusters
        with internal correlation below ``corr_threshold``; the first count whose
        share is below ``pct_threshold`` is returned.
        """
        weak_share = np.sum((self.internals < corr_threshold) * (self.numstocks / self.sz), axis=1)
        valid_counts = (np.arange(0, self.sz) + 1)[weak_share < pct_threshold]
        if valid_counts.shape[0] <= 0:
            return None
        return valid_counts[0]

    def get_clusters(self, cluster_number):
        """Return a DataFrame (index: stock name, column ``cluster``) for ``cluster_number`` clusters.

        Returns None (after printing a message) when ``cluster_number`` is None
        or lies outside ``1..sz``; None is what ``calcthresholds`` returns when
        no cluster count satisfies the thresholds.
        """
        if cluster_number is None or cluster_number < 1 or cluster_number > self.sz:
            print("Cluster number should be between 1 and",self.sz)
            return None
        ret = pd.DataFrame(self.names).set_index(0)
        ret['cluster'] = self.clusters[cluster_number - 1, :].astype(int)
        return ret

In [53]:
def get_clusters(corr, cluster_number = 45, Thresh = None):
    """Cluster the rows of ``corr`` with Ward agglomerative clustering.

    Parameters
    ----------
    corr : pandas.DataFrame or array-like
        Matrix whose rows are clustered (a correlation matrix in this notebook).
    cluster_number : int, default 45
        Number of clusters to form; used only when ``Thresh`` is None.
    Thresh : float or None, default None
        When given, clusters are formed by this linkage distance threshold
        instead of a fixed cluster count.

    Returns
    -------
    pandas.DataFrame
        Indexed by stock name, with a single ``cluster`` column of labels.
    """
    # Ward linkage only works with Euclidean distances, which is also the
    # estimator default, so the deprecated `affinity` keyword is omitted.
    if Thresh is None:
        cluster = AgglomerativeClustering(n_clusters=cluster_number, linkage='ward')
    else:
        cluster = AgglomerativeClustering(n_clusters=None, linkage='ward', distance_threshold=Thresh)
    kclusters = cluster.fit_predict(corr)

    # Label rows with the names carried by `corr` itself so the result is right
    # for any sub-universe; fall back to the global price panel only when
    # `corr` has no column labels (e.g. a bare numpy array).
    if hasattr(corr, 'columns'):
        names = corr.columns.values
    else:
        names = SP_Close.columns.values
    stock_clusters = pd.DataFrame(names).set_index(0)
    stock_clusters['cluster'] = kclusters
    return stock_clusters

In [54]:
# First 1265 rows of the price panel (presumably ~5 years x 253 trading days - confirm).
five_year_data = SP_Close.iloc[:1265]

In [55]:
import time

# Time the full precomputation (one clustering per candidate cluster count).
start = time.time()
tmp = constrained_clusterings_new(corr=five_year_data.corr())
print(time.time() - start)
tmp.sz

9.421419382095337


367

In [56]:
# Smallest cluster count for which under half of the stocks sit in clusters
# with internal correlation below 1, followed by the matching assignments.
a = tmp.calcthresholds(corr_threshold=1, pct_threshold=0.5)
print(a)
print(tmp.get_clusters(cluster_number=a))

248
      cluster
0            
A         246
AAPL      233
ABC       137
ABMD      236
ABT       146
...       ...
XOM        25
XRAY        6
YUM        11
ZBRA       31
ZION       65

[367 rows x 1 columns]


In [57]:
# Cross-check: a direct fit with the same cluster count should give the same labels.
print(get_clusters(five_year_data.corr(), cluster_number=a))

      cluster
0            
A         246
AAPL      233
ABC       137
ABMD      236
ABT       146
...       ...
XOM        25
XRAY        6
YUM        11
ZBRA       31
ZION       65

[367 rows x 1 columns]


In [10]:
# pct_threshold=0 cannot be met (the weak-cluster share is never negative), so this yields None.
a = tmp.calcthresholds(corr_threshold=1, pct_threshold=0)
print(a)

Valid numbers of clusters:
[]
None


In [58]:
from sklearn.cluster import AgglomerativeClustering

class constrained_clusterings_new_fast:
    """Ward clusterings of a correlation matrix, precomputed for every cluster count.

    Initialize with  ``AAA = constrained_clusterings_new_fast(correlation_matrix)``
    Call with        ``optimal_num_clusters = AAA.calcthresholds(corr_threshold, pct_threshold)``

    Optimal number of clusters: the minimum number of clusters such that the
    percent of stocks in clusters with internal correlation less than
    ``corr_threshold`` is less than ``pct_threshold``.

    NOTE(review): despite the name, this class refits the estimator once per
    cluster count just like a plain per-k loop; no speed-up is implemented here.
    """

    def __init__(self, corr):
        """Keep ``corr`` (a square DataFrame whose columns are stock names) and precompute."""
        self.corr = corr                     # correlation matrix
        self.sz = corr.shape[0]              # size

        self.__precompute()

        self.names = pd.DataFrame(corr.columns.values) # stock names

    def __precompute(self):
        """Fill the (sz, sz) tables; row ``k - 1`` describes the clustering with ``k`` clusters."""
        sz = self.sz
        corr_np = np.array(self.corr)

        self.clusters = np.zeros((sz, sz))    # cluster assignments of stocks
        self.numstocks = np.zeros((sz, sz))   # stocks per cluster
        self.internals = np.zeros((sz, sz))   # internal correlations of clusters

        for n_clusters in range(1, sz + 1):
            row = n_clusters - 1
            # `affinity='euclidean'` is dropped: it is deprecated in scikit-learn and
            # Euclidean is both the default and the only metric ward linkage accepts.
            model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
            assignment = model.fit_predict(corr_np)
            self.clusters[row, :] = assignment
            for cluster_id in range(n_clusters):
                mask = assignment == cluster_id
                block = corr_np[mask][:, mask]      # within-cluster correlations, diagonal included
                self.numstocks[row, cluster_id] = np.sum(mask)
                self.internals[row, cluster_id] = np.mean(block)

    def calcthresholds(self, corr_threshold, pct_threshold):
        """Smallest cluster count whose weakly-correlated share is below ``pct_threshold``.

        Returns None when no cluster count qualifies.
        """
        weak = self.internals < corr_threshold
        share = np.sum(weak * (self.numstocks / self.sz), axis=1)
        ret = (np.arange(0, self.sz) + 1)[share < pct_threshold]
        if ret.shape[0] <= 0:
            return None
        return ret[0]

    def get_clusters(self, cluster_number):
        """Cluster label per stock for the clustering with ``cluster_number`` clusters.

        Prints a message and returns None when ``cluster_number`` is None (the
        value ``calcthresholds`` returns when nothing qualifies) or is not in ``1..sz``.
        """
        out_of_range = cluster_number is None or cluster_number < 1 or cluster_number > self.sz
        if out_of_range:
            print("Cluster number should be between 1 and",self.sz)
            return None
        ret = pd.DataFrame(self.names)
        ret = ret.set_index(0)
        ret['cluster'] = self.clusters[cluster_number - 1, :].astype(int)
        return ret

In [63]:
from sklearn.cluster import AgglomerativeClustering
# Rule: select the smallest number of clusters such that less than pct_threshold of the stocks
# are in clusters with an internal correlation below corr_threshold.

def constrained_clusters(corr, corr_threshold = .7, pct_threshold = .1,verbose=False, min_clusters = None):
    """Find the smallest Ward clustering that satisfies the correlation constraint.

    Cluster counts 1, 2, ..., n are tried in order. For each clustering the share
    of stocks that belong to clusters whose internal correlation (mean of the
    within-cluster block of ``corr``, diagonal included) is below
    ``corr_threshold`` is computed; the first clustering whose share is below
    ``pct_threshold`` is returned.

    Parameters
    ----------
    corr : pandas.DataFrame
        Square correlation matrix; ``corr.columns`` supplies the stock names.
    corr_threshold : float, default 0.7
        Clusters with internal correlation below this count as weakly correlated.
    pct_threshold : float, default 0.1
        Upper bound (exclusive) on the share of stocks in weakly correlated clusters.
    verbose : bool, default False
        Accepted for interface compatibility; not used by this implementation.
    min_clusters : int or None, default None
        Accepted for interface compatibility; not used by this implementation.

    Returns
    -------
    pandas.DataFrame or None
        Frame indexed by stock name with a ``cluster`` column, or None when no
        cluster count satisfies the thresholds.
    """
    sz = corr.shape[0]
    corr_np = np.array(corr)
    names = pd.DataFrame(corr.columns.values)  # stock names

    for i in range(sz):
        # Ward linkage only works with Euclidean distances, which is also the
        # estimator default, so the deprecated `affinity` keyword is omitted.
        labels = AgglomerativeClustering(n_clusters=i + 1, linkage='ward').fit_predict(corr_np)

        # Per-cluster statistics for this clustering only; slots past i stay 0
        # and therefore contribute nothing to the share below.
        numstocks = np.zeros(sz)
        internals = np.zeros(sz)
        for j in range(i + 1):
            members = labels == j
            numstocks[j] = np.sum(members)
            internals[j] = np.mean(corr_np[members][:, members])

        weak_share = np.sum((internals < corr_threshold) * (numstocks / sz))
        if weak_share < pct_threshold:
            # First (smallest) cluster count that meets the constraint.
            ret = pd.DataFrame(names).set_index(0)
            ret['cluster'] = labels.astype(int)
            return ret

    # Unable to generate clusters with the given corr_/pct_ thresholds.
    return None

In [66]:
# Time the early-stopping search with the same thresholds used for the class-based run.
start = time.time()
print(constrained_clusters(five_year_data.corr(), corr_threshold=1, pct_threshold=0.5))
print(time.time() - start)

      cluster
0            
A         246
AAPL      233
ABC       137
ABMD      236
ABT       146
...       ...
XOM        25
XRAY        6
YUM        11
ZBRA       31
ZION       65

[367 rows x 1 columns]
6.059909105300903
