In [4]:
class constrained_clusterings_new:
    """Choose a number of clusters for a correlation matrix under a quality constraint.

    Usage:
        AAA = constrained_clusterings_new(correlation_matrix)
        optimal_num_clusters = AAA.calcthresholds(corr_threshold, pct_threshold)

    The optimal number of clusters is the minimum number of clusters such that
    the fraction of stocks sitting in clusters whose internal correlation is
    below ``corr_threshold`` is less than ``pct_threshold``.

    Attributes (all filled in by the constructor):
        corr:      the correlation matrix as passed in.
        sz:        number of rows of ``corr``.
        clusters:  (sz, sz) array; row i holds the label of every stock when
                   clustering into i+1 clusters.
        numstocks: (sz, sz) array; entry [i, j] is the number of stocks in
                   cluster j of the (i+1)-cluster solution (0 for unused j).
        internals: (sz, sz) array; entry [i, j] is the mean of the correlation
                   sub-matrix of cluster j (diagonal included; 0 for unused j).
        names:     one-column DataFrame of ``corr.columns.values``.
    """

    def __init__(self, corr):
        """Store ``corr`` and precompute every clustering from 1 to ``sz`` clusters.

        ``corr`` must expose ``.shape`` and ``.columns`` (e.g. a pandas DataFrame).
        """
        self.corr = corr                     # correlation matrix
        self.sz = corr.shape[0]              # size

        self.__precompute()

        self.names = pd.DataFrame(corr.columns.values) # stock names

    def __precompute(self):
        """Run Ward agglomerative clustering for every cluster count 1..sz.

        Reads only instance state (``self.corr``, ``self.sz``) so the result
        never depends on same-named variables in the notebook namespace.
        """
        corr_np = np.array(self.corr)
        sz = self.sz

        self.clusters = np.zeros((sz, sz))    # cluster assignments of stocks
        self.numstocks = np.zeros((sz, sz))   # stocks per cluster
        self.internals = np.zeros((sz, sz))   # internal correlations of clusters

        for i in range(sz):
            # Each row of the correlation matrix is used as that stock's
            # feature vector. Ward linkage only supports the Euclidean metric,
            # which is also the default, so the metric is not passed: the
            # `affinity` keyword is not accepted by recent scikit-learn.
            labels = AgglomerativeClustering(n_clusters=i + 1, linkage='ward').fit_predict(corr_np)
            self.clusters[i, :] = labels
            for j in range(i + 1):
                members = labels == j
                self.numstocks[i, j] = np.sum(members)
                # Sub-matrix restricted to the cluster's rows and columns.
                block = corr_np[members][:, members]
                self.internals[i, j] = np.mean(block)

    def calcthresholds(self, corr_threshold, pct_threshold):
        """Return the smallest cluster count that satisfies the constraint.

        Parameters:
            corr_threshold: clusters with internal correlation strictly below
                this value count as weakly correlated.
            pct_threshold: strict upper bound on the fraction of stocks allowed
                to sit in weakly correlated clusters.

        Prints every valid cluster count and returns the first (smallest) one.

        Raises:
            IndexError: if no cluster count satisfies the constraint.
        """
        # Unused cluster slots hold 0 stocks, so they contribute nothing to the
        # sum even though their placeholder internal correlation is 0.
        weak_share = np.sum((self.internals < corr_threshold) * (self.numstocks / self.sz), axis=1)
        valid = weak_share < pct_threshold
        ret = (np.arange(0, self.sz) + 1)[valid]
        print("Valid numbers of clusters:")
        print(ret)
        if ret.size == 0:
            raise IndexError("no number of clusters satisfies the given thresholds")
        return ret[0]

## Get Universe - Current S&P 500 stocks that existed in 2002

In [5]:
import os

# Only hit the network when there is no cached price table on disk.
if not os.path.exists('SP_Close.pkl'):
    # A timeout keeps a stalled connection from hanging the kernel, and
    # raise_for_status() surfaces HTTP errors here instead of as a confusing
    # parsing failure further down.
    resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies', timeout=30)
    resp.raise_for_status()
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        raise RuntimeError("S&P 500 constituents table not found on the Wikipedia page")

    # The first <td> of every row after the header row is taken as the ticker;
    # embedded newlines are stripped from the cell text.
    tickers = [
        row.findAll('td')[0].text.replace('\n', '')
        for row in table.findAll('tr')[1:]
    ]

    start = datetime(2000,1,1)
    end = datetime(2022,1,1)
    # auto_adjust=False is passed explicitly so the result keeps the separate
    # 'Adj Close' column used downstream, regardless of the installed
    # yfinance version's default.
    data = yf.download(tickers, start=start, end=end, auto_adjust=False)

In [6]:
# Reuse the cached price table when it exists; otherwise derive it from the
# freshly downloaded `data`.
if os.path.exists('SP_Close.pkl'):
    SP_Close = pd.read_pickle('SP_Close.pkl')
else:
    data.index = pd.to_datetime(data.index)
    data = data.sort_index()
    SP_Close = (
        data['Adj Close']
        .dropna(axis=0, how='all')   # drop dates on which every ticker is missing
        .dropna(axis=1)              # drop tickers with any remaining missing value
    )

# Cap the number of rows rendered, then show the table as the cell output.
pd.set_option('display.max_rows', 20)
SP_Close

In [7]:
# Persist the price table so later runs can load it from disk; an existing
# cache file is never overwritten.
if not os.path.exists('SP_Close.pkl'):
    SP_Close.to_pickle('SP_Close.pkl')

In [8]:
# Sanity check: per-column dtypes followed by the shape of the price table.
print(SP_Close.dtypes, SP_Close.shape, sep='\n')

A       float64
AAPL    float64
ABC     float64
ABMD    float64
ABT     float64
         ...   
XOM     float64
XRAY    float64
YUM     float64
ZBRA    float64
ZION    float64
Length: 367, dtype: object
(5536, 367)
