In [6]:
#pip install yahoo_fin
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import yfinance as yf
import yahoo_fin.stock_info as si

# Calculate Centrality
def retrieve_sp500_list():
    data = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    table = data[0]
    return table

def download_sector_data(symbols, start_date, end_date):
    df_sectors = yf.download(symbols, start=start_date, end=end_date)['Adj Close']
    df_sectors = df_sectors.pct_change().dropna()
    return df_sectors

def preprocess_data(df_sectors):
    scaler = StandardScaler()
    df_sectors_scaled = scaler.fit_transform(df_sectors)
    return df_sectors_scaled

def calculate_weight(market_cap):
    weights = [cap / np.sum(market_cap) for cap in market_cap]
    return weights

def calculate_weighted_df(df_sectors, market_cap):
    weights = calculate_weight(market_cap)
    sector_weights = dict(zip(df_sectors.columns, [weight * 100 for weight in weights]))

    weighted_df = df_sectors.copy()

    for sector in df_sectors.columns:
        weighted_df.loc[:, sector] *= np.sqrt(sector_weights[sector])

    return weighted_df

def calculate_centrality_score(X, n=2):
    pca_model = PCA(n_components=n)
    pca_model.fit(X)
    EV = pca_model.components_
    AR = pca_model.explained_variance_ratio_

    C_list = []
    for i in range(X.shape[1]):
        C_num = sum(AR[j] * abs(EV[j][i]) / sum(abs(EV[j][k]) for k in range(X.shape[1])) for j in range(n))
        C_denom = sum(AR[j] for j in range(n))
        C_list.append(C_num / C_denom)

    return C_list

# Calculate Relative Value

def fetch_sector_data(symbols, start_date, end_date):
    df_sectors_prices = yf.download(symbols, start=start_date, end=end_date)['Adj Close']
    df_sectors = df_sectors_prices.pct_change().dropna()
    return df_sectors

def calculate_market_cap_weights(market_cap):
    weights = market_cap / np.sum(market_cap)
    return weights

def fetch_sector_current_price(symbols):
    sector_current_price = [si.get_live_price(sector) for sector in symbols]
    return sector_current_price

def fetch_sector_200DaySMA(symbols):
    sector_200DaySMA = [si.get_data(sector, interval='1d')['close'][-200:].mean() for sector in symbols]
    return sector_200DaySMA

def normalize_current_price(current_price, SMA):
    normalize = current_price / SMA
    return normalize

def calculate_relative_measure(normalized_prices, weights):
    normalized_prices = np.nan_to_num(normalized_prices, nan=0.0)
    weighted_average = np.dot(weights, normalized_prices)
    relative_measure = np.zeros_like(normalized_prices)
    nonzero_indices = (weighted_average != 0)
    relative_measure[nonzero_indices] = normalized_prices[nonzero_indices] / weighted_average[nonzero_indices]
    return relative_measure

def create_relative_scores_df(symbols, normalized_prices, weights):
    relative_measure = calculate_relative_measure(normalized_prices, weights)
    relative_scores = dict(zip(symbols, [np.around(r, 2) for r in relative_measure]))
    for asset in relative_scores:
        relative_scores[asset] = [relative_scores[asset]]
    scores_df = pd.DataFrame(data=relative_scores, index=['Relative_score'])
    scores_df = scores_df.transpose()
    return scores_df

def combine_centrality_relative_value(weighted_df, symbols, normalized_prices, weights):
    centrality_scores = calculate_centrality_score(weighted_df)
    scores_df = create_relative_scores_df(symbols, normalized_prices, weights)

    centrality_df = pd.DataFrame(data=centrality_scores, columns=['Centrality'], index=symbols)
    relative_df = create_relative_scores_df(symbols, normalized_prices, weights)

    combined_scores_df = pd.concat([centrality_df, relative_df], axis=1)
    print(combined_scores_df)

    combined_scores_df.index.name = 'Ticker'
    return combined_scores_df

def construct_portfolios(df_sectors, symbols, normalized_prices, weights):
    combined_scores_df = combine_centrality_relative_value(weighted_df, symbols, normalized_prices, weights)

    try:
        top_crowded = combined_scores_df.sort_values(by='Centrality', ascending=False).head(4).index
        top_overvalued = combined_scores_df.sort_values(by='Relative_score', ascending=False).head(4).index
    except KeyError as e:
        print(f"Error: {e}")
        return None, None, None

    if top_crowded.empty or top_overvalued.empty:
        print("Error: Top crowded or top overvalued is empty.")
        return None, None, None

    print("Top Four Crowded Sectors:", top_crowded)
    print("Top Four Overvalued Sectors:", top_overvalued)

    next_day_returns = df_sectors.shift(-1)

    no_bubble_portfolio = combined_scores_df[~combined_scores_df.index.isin(top_crowded.union(top_overvalued))]
    bubble_run_up_portfolio = combined_scores_df[combined_scores_df.index.isin(top_crowded) & ~combined_scores_df.index.isin(top_overvalued)]
    bubble_sell_off_portfolio = combined_scores_df[combined_scores_df.index.isin(top_crowded) & combined_scores_df.index.isin(top_overvalued)]

    return no_bubble_portfolio, bubble_run_up_portfolio, bubble_sell_off_portfolio, next_day_returns

# Impute Values below:
if __name__ == "__main__":
    # Input Selected Sectors
    symbols = ['XLB', 'XLI', 'XLY', 'XLP', 'XLE', 'XLV', 'XLF', 'XLK', 'XTL', 'XLU', 'XLRE']
    # Input Market Capitalization respectively
    market_cap = [5150, 37211, 29320, 14101, 52475, 15373, 4368, 13622, 35850, 16512, 48]
    # Input the timeframe
    start_date = "2018-01-01"
    end_date = "2023-01-01"

    sp500_list = retrieve_sp500_list()
    sector_data = download_sector_data(symbols, start_date, end_date)
    scaled_data = preprocess_data(sector_data)

    df_sectors = download_sector_data(symbols, start_date, end_date)

    weighted_df = calculate_weighted_df(df_sectors, market_cap)
    weighted_scores = calculate_centrality_score(weighted_df)

    weights = calculate_market_cap_weights(market_cap)

    df_sectors = fetch_sector_data(symbols, start_date, end_date)

    sector_current_price = fetch_sector_current_price(symbols)
    sector_200DaySMA = fetch_sector_200DaySMA(symbols)

    normalized_prices = [normalize_current_price(current_price, sma) for current_price, sma in zip(sector_current_price, sector_200DaySMA)]

    relative_measure = calculate_relative_measure(normalized_prices, weights)

    scores_df = create_relative_scores_df(symbols, normalized_prices, weights)

    df_sectors.index.names = ['Date']

    no_bubble_portfolio, bubble_run_up_portfolio, bubble_sell_off_portfolio, next_day_returns = construct_portfolios(df_sectors, symbols, normalized_prices, weights)

    print("***No Bubble Portfolio***:")
    print(no_bubble_portfolio)
    print("\n***Bubble Run-up Portfolio***:")
    print(bubble_run_up_portfolio)
    print("\n***Bubble Sell-off Portfolio***:")
    print(bubble_sell_off_portfolio)

[*********************100%%**********************]  11 of 11 completed
[*********************100%%**********************]  11 of 11 completed
[*********************100%%**********************]  11 of 11 completed
      Centrality  Relative_score
XLB     0.043417            1.01
XLI     0.225386            1.03
XLY     0.117514            1.05
XLP     0.072179            0.96
XLE     0.199902            0.99
XLV     0.052685            0.99
XLF     0.037765            1.05
XLK     0.052414            1.12
XTL     0.101304            0.94
XLU     0.093062            0.96
XLRE    0.004372            1.01
Top Four Crowded Sectors: Index(['XLI', 'XLE', 'XLY', 'XTL'], dtype='object', name='Ticker')
Top Four Overvalued Sectors: Index(['XLK', 'XLY', 'XLF', 'XLI'], dtype='object', name='Ticker')
***No Bubble Portfolio***:
        Centrality  Relative_score
Ticker                            
XLB       0.043417            1.01
XLP       0.072179            0.96
XLV       0.052685            0.99
