In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the HTML content from the URL
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#Selected_changes_to_the_list_of_S&P_500_components"
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook from parsing an HTTP error page
# (4xx/5xx) as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Function to get the list of current S&P 500 companies
def get_current_companies(html):
    """Return the first-cell text of each data row in the 'constituents' table.

    The notebook uses the returned values as the current S&P 500 tickers.

    Args:
        html: HTML source of the page, as a string.

    Returns:
        list[str]: Stripped text of the first <td> of every row after the
        first one, in document order. Rows that contain no <td> cells are
        skipped.

    Raises:
        ValueError: If ``html`` contains no <table id="constituents">.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', id='constituents')
    if table is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ValueError("No table with id='constituents' found in the supplied HTML")
    companies = []
    for row in table.find_all('tr')[1:]:  # Skip header row
        cols = row.find_all('td')
        if not cols:
            # Rows without <td> cells (e.g. header-only rows) have no value to read.
            continue
        companies.append(cols[0].text.strip())
    return companies

# Extract the current S&P 500 tickers from the downloaded page
current_tickers = get_current_companies(html_content)
ticker_total = len(current_tickers)

# Report how many tickers were found, followed by the full list
print("Current S&P 500 tickers:")
print('Number of companies:', ticker_total)
print(current_tickers)

Current S&P 500 tickers:
Number of companies: 503
['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A', 'APD', 'ABNB', 'AKAM', 'ALB', 'ARE', 'ALGN', 'ALLE', 'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMCR', 'AEE', 'AAL', 'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'AME', 'AMGN', 'APH', 'ADI', 'ANSS', 'AON', 'APA', 'AAPL', 'AMAT', 'APTV', 'ACGL', 'ADM', 'ANET', 'AJG', 'AIZ', 'T', 'ATO', 'ADSK', 'ADP', 'AZO', 'AVB', 'AVY', 'AXON', 'BKR', 'BALL', 'BAC', 'BK', 'BBWI', 'BAX', 'BDX', 'BRK.B', 'BBY', 'BIO', 'TECH', 'BIIB', 'BLK', 'BX', 'BA', 'BKNG', 'BWA', 'BXP', 'BSX', 'BMY', 'AVGO', 'BR', 'BRO', 'BF.B', 'BLDR', 'BG', 'CDNS', 'CZR', 'CPT', 'CPB', 'COF', 'CAH', 'KMX', 'CCL', 'CARR', 'CTLT', 'CAT', 'CBOE', 'CBRE', 'CDW', 'CE', 'COR', 'CNC', 'CNP', 'CF', 'CHRW', 'CRL', 'SCHW', 'CHTR', 'CVX', 'CMG', 'CB', 'CHD', 'CI', 'CINF', 'CTAS', 'CSCO', 'C', 'CFG', 'CLX', 'CME', 'CMS', 'KO', 'CTSH', 'CL', 'CMCSA', 'CMA', 'CAG', 'COP', 'ED', 'STZ', 'CEG', 'COO', 'CPRT', 'GLW', 'CPAY', 'CTVA

In [64]:
# Function to get the selected changes
def get_selected_changes(html):
    """Extract rows from the first 'wikitable' in the selected-changes section.

    A table qualifies when the nearest preceding <h2> contains a <span> whose
    id is 'Selected_changes_to_the_list_of_S&P_500_components'. Only the first
    qualifying table is read.

    Args:
        html: HTML source of the page, as a string.

    Returns:
        list[dict]: One dict per row that has at least four <td> cells, with
        keys 'date' (cell 0), 'ticker_added' (cell 1) and 'ticker_removed'
        (cell 3), each holding the stripped cell text. Empty if no table
        qualifies.
    """
    soup = BeautifulSoup(html, 'html.parser')
    changes = []
    for table in soup.find_all('table', class_='wikitable'):
        heading = table.find_previous('h2')
        # find_previous() returns None for a table with no <h2> before it;
        # such a table cannot belong to the section, so skip it instead of
        # failing on None.find(...).
        if heading is None:
            continue
        if not heading.find('span', id='Selected_changes_to_the_list_of_S&P_500_components'):
            continue
        for row in table.find_all('tr')[1:]:  # Skip header row
            cols = row.find_all('td')
            # Rows with fewer than four <td> cells are ignored.
            if len(cols) >= 4:
                changes.append({
                    'date': cols[0].text.strip(),
                    'ticker_added': cols[1].text.strip(),
                    'ticker_removed': cols[3].text.strip()
                })
        # Only the first table in the section is wanted.
        break
    return changes

# Get the selected changes
selected_changes = get_selected_changes(html_content)

# Convert the selected changes to a DataFrame
selected_changes_df = pd.DataFrame(selected_changes)

# Parse the scraped date text once and derive every column from the parsed
# values, instead of formatting to text and re-parsing it for each column.
change_dates = pd.to_datetime(selected_changes_df['date'])
change_quarters = change_dates.dt.to_period('Q')

# Store the date as 'dd/mm/yyyy' text
selected_changes_df['date'] = change_dates.dt.strftime('%d/%m/%Y')
# First day of the quarter containing the change, as 'dd/mm/yyyy' text
selected_changes_df['quarter'] = change_quarters.dt.start_time.dt.strftime('%d/%m/%Y')
# Quarter label such as '2024 Q2'. The format has no leading space so that the
# labels match the '%Y Q%q' quarter labels used for the indicator frames in
# this notebook.
selected_changes_df['quarter name'] = change_quarters.dt.strftime('%Y Q%q')

# Display the DataFrame
print("\nSelected Changes DataFrame:")
selected_changes_df


Selected Changes DataFrame:


Unnamed: 0,date,ticker_added,ticker_removed,quarter,quarter name
0,08/05/2024,VST,PXD,01/04/2024,2024 Q2
1,03/04/2024,,XRAY,01/04/2024,2024 Q2
2,03/04/2024,,VFC,01/04/2024,2024 Q2
3,02/04/2024,GEV,,01/04/2024,2024 Q2
4,01/04/2024,SOLV,,01/04/2024,2024 Q2
...,...,...,...,...,...
340,09/06/1999,WLP,HPH,01/04/1999,1999 Q2
341,11/12/1998,FSR,LDW,01/10/1998,1998 Q4
342,11/12/1998,CCL,GRN,01/10/1998,1998 Q4
343,11/12/1998,CPWR,SUN,01/10/1998,1998 Q4


In [146]:
# Test multicollinearity between the indicators
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np

# Function to calculate the VIF for each variable
def calculate_vif(data):
    """Tabulate the variance inflation factor of every column in ``data``.

    Args:
        data: DataFrame whose ``.values`` matrix is handed to
            ``variance_inflation_factor`` once per column position.

    Returns:
        DataFrame with a 'feature' column (the column labels of ``data``) and
        a 'VIF' column (the factor computed for that column), in column order.
    """
    design_matrix = data.values
    vif_scores = []
    for column_position in range(data.shape[1]):
        vif_scores.append(variance_inflation_factor(design_matrix, column_position))
    return pd.DataFrame({"feature": data.columns, "VIF": vif_scores})

def drop_high_vif(data, threshold=10):
    """Repeatedly drop the highest-VIF column until no VIF exceeds ``threshold``.

    Rows containing NaN or +/-inf in any column are removed first. Then, on
    each pass, the VIFs are recomputed and the single column with the largest
    VIF is dropped if that VIF is above ``threshold``. Each drop and the final
    tally are printed.

    Args:
        data: DataFrame of features, passed to ``calculate_vif``.
        threshold: VIF value above which a column is dropped. Defaults to 10,
            the cutoff this function previously hard-coded.

    Returns:
        A new DataFrame with the cleaned rows and the surviving columns. The
        caller's frame is not modified.
    """
    # replace() and dropna() each return a new object, so no extra copy is
    # needed to protect the caller's frame.
    data = data.replace([np.inf, -np.inf], np.nan).dropna()
    total_columns = len(data.columns)
    count = 0
    while True:
        vif = calculate_vif(data)
        max_vif = vif['VIF'].max()
        # A NaN maximum compares False here as well, which ends the loop.
        if not max_vif > threshold:
            break
        max_vif_column = vif.loc[vif['VIF'].idxmax(), 'feature']
        print(f"Dropping column '{max_vif_column}' with VIF of {max_vif}")
        data = data.drop(columns=max_vif_column)
        count += 1
    print(f"\nTotal columns dropped: {count} out of {total_columns}")
    return data

In [143]:
import pandas as pd
import pandas_datareader.data as web
from datetime import datetime

def fetch_quarterly_indicators(start_date='2000-01-01', end_date=None):
    """Download a fixed set of FRED series and average each one per quarter.

    Args:
        start_date: First date to request, as 'YYYY-MM-DD'.
        end_date: Last date to request, as 'YYYY-MM-DD'. ``None`` (the
            default) means today's date at call time.

    Returns:
        DataFrame with one column per indicator (named by the keys of the
        mapping below), holding the quarterly mean of each series.
    """
    # Resolve the default at call time. A default written as
    # datetime.today() in the signature is evaluated only once, when the
    # function is defined, and would go stale in a long-lived kernel.
    if end_date is None:
        end_date = datetime.today().strftime('%Y-%m-%d')

    # Define the indicators and their corresponding FRED codes
    indicators = {
        'GDP': 'GDP',
        'Unemployment Rate': 'UNRATE',
        'CPI': 'CPIAUCSL',
        'PPI': 'PPIACO',
        'Federal Funds Rate': 'FEDFUNDS',
        '10-Year Treasury Yield': 'GS10',
        'Consumer Confidence Index': 'UMCSENT',
        'Retail Sales': 'RSAFS',
        'Housing Starts': 'HOUST',
        'Durable Goods Orders': 'DGORDER',
        'Industrial Production': 'INDPRO',
        'Trade Balance': 'BOPGSTB',
        'Personal Income': 'PI',
        'Business Inventories': 'BUSINV'
    }

    # Collect the per-indicator frames and join them once at the end instead
    # of re-concatenating a growing frame on every iteration.
    frames = []
    for indicator, fred_code in indicators.items():
        df = web.DataReader(fred_code, 'fred', start_date, end_date)
        df = df.resample('Q').mean()  # Resample to quarterly frequency and take the mean
        frames.append(df.rename(columns={fred_code: indicator}))

    return pd.concat(frames, axis=1)

# Download the quarterly indicator levels (default range: 2000 to today)
quarterly_levels = fetch_quarterly_indicators()

# Label each row with its quarter in the form '2024 Q1'
quarter_labels = quarterly_levels.index.to_period('Q').strftime('%Y Q%q')

# Index by quarter label, then take quarter-over-quarter changes in percent
economical_indicators = (
    quarterly_levels
    .assign(quarter=quarter_labels)
    .set_index('quarter')
    .pct_change()
    * 100
)

In [147]:
# Remove collinear indicators: columns are dropped one at a time while any VIF is too high
economical_indicators = drop_high_vif(data=economical_indicators)

# Show the indicators that survived
print("Economical Indicators DataFrame:")
economical_indicators

Dropping column 'Industrial Production' with VIF of 11.973764726642306

Total columns dropped: 1 out of 14
Economical Indicators DataFrame:


Unnamed: 0_level_0,GDP,Unemployment Rate,CPI,PPI,Federal Funds Rate,10-Year Treasury Yield,Consumer Confidence Index,Retail Sales,Housing Starts,Durable Goods Orders,Trade Balance,Personal Income,Business Inventories
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2000 Q2,2.454875,-2.479339,0.783853,1.851376,10.510863,-4.681070,-1.240920,0.074332,-4.360056,4.501897,0.784161,1.680865,1.706805
2000 Q3,0.687421,1.694915,0.913864,1.312800,3.931987,-4.587156,-1.195219,0.886670,-5.231092,-4.218298,7.199891,1.895193,1.324908
2000 Q4,1.139534,-2.500000,0.712909,1.320708,-0.715746,-5.542986,-3.287841,0.686554,2.682332,-2.376190,5.295400,1.189201,1.150361
2001 Q1,0.330470,8.547009,0.956572,1.647811,-13.594233,-9.281437,-11.225144,0.640726,3.950777,-4.372139,-2.216618,1.815397,-0.202046
2001 Q2,1.229858,3.937008,0.701156,-1.112993,-22.646007,4.356436,-1.372832,1.036860,1.557632,-2.519178,-10.839028,0.412239,-1.003856
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023 Q2,0.930166,1.904762,0.751542,-1.484616,10.479705,-1.462523,-3.560372,-0.116640,6.306306,3.951902,1.000255,0.989966,-0.271092
2023 Q3,2.021638,3.738318,0.846112,0.786329,5.410822,15.491651,11.663991,1.470399,-5.153459,-0.792028,-7.968348,0.951838,0.195822
2023 Q4,1.256314,0.900901,0.674652,-1.626973,1.330798,6.987952,-6.660278,0.803480,7.292924,0.583508,3.326751,0.700867,0.242440
2024 Q1,1.069249,1.785714,0.938227,0.417725,0.000000,-6.306306,20.790554,-0.129899,-5.266712,-3.285712,7.493751,1.739829,0.359459


In [152]:
import pandas as pd
from fredapi import Fred

# Read the FRED API key from the FRED_API_KEY environment variable rather than
# hard-coding it: a key written into the notebook is exposed to everyone the
# notebook is shared with.
# NOTE: the key that used to be written on this line has been published along
# with the notebook and should be revoked and replaced.
import os

fred_api_key = os.environ.get('FRED_API_KEY')
if not fred_api_key:
    raise RuntimeError("Set the FRED_API_KEY environment variable to your FRED API key before running this cell")
fred = Fred(api_key=fred_api_key)

# Function to search for indicators
def search_indicators(keyword, limit=100):
    """Search FRED for series matching ``keyword``, ordered by popularity.

    Args:
        keyword: Search text forwarded to ``fred.search``.
        limit: Maximum number of results to request (default 100).

    Returns:
        Whatever ``fred.search`` returns for the query, unchanged.
    """
    return fred.search(keyword, order_by='popularity', limit=limit)

# Look up the FRED series that match the keyword 'social'
keyword = 'social'
indicators = search_indicators(keyword)

# Keep only the monthly series, reduced to their title and series id
is_monthly = indicators['frequency'] == 'Monthly'
all_monthly_indicators = indicators.loc[is_monthly, ['title', 'id']]
# Map each title to its series id (a repeated title keeps the last id seen)
indicators_dict = dict(zip(all_monthly_indicators['title'], all_monthly_indicators['id']))

# Holds the fetched quarterly series for each indicator, keyed by title
data_frames = dict()

# Function to fetch data for a given indicator
def fetch_data(series_id):
    """Fetch one FRED series and average it per quarter.

    Args:
        series_id: FRED series identifier passed to ``fred.get_series``.

    Returns:
        The series resampled to quarterly means, or ``None`` if the fetch
        raised an exception (the failure is printed so it is not silent).
    """
    try:
        data = fred.get_series(series_id)
    except Exception as exc:
        # Best-effort: skip series that cannot be fetched, but say which one
        # and why. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit still stop the loop.
        print(f"Skipping series '{series_id}': {exc}")
        return None
    return data.resample('Q').mean()  # Resample to quarterly data

# Fetch data for each indicator and store in the dictionary
for indicator, series_id in indicators_dict.items():
    data = fetch_data(series_id)
    if data is not None:
        data_frames[indicator] = data

# Combine the per-indicator series into a single dataframe, one column per indicator
combined_df = pd.concat(data_frames, axis=1)

# Calculate quarter-to-quarter change, in percent
quarterly_change = combined_df.pct_change() * 100

# Keep data from 2000 onwards and index it by quarter label ('2024 Q1' style).
# .loc/.assign/.set_index each return a new frame; adding a column in place to
# a row slice is what pandas flags with SettingWithCopyWarning.
recent_change = quarterly_change.loc['2000':]
social_indicators = (
    recent_change
    .assign(quarter=recent_change.index.to_period('Q').strftime('%Y Q%q'))
    .set_index('quarter')
)

In [153]:
# Remove collinear indicators: columns are dropped one at a time while any VIF is too high
social_indicators = drop_high_vif(data=social_indicators)

# Preview the first rows of what is left
print("Social Indicators DataFrame:")
social_indicators.head(n=5)

Dropping column 'Indexes of Aggregate Weekly Payrolls of Production and Nonsupervisory Employees, Total Private' with VIF of 27733.941217416217
Dropping column 'Average Weekly Earnings of Production and Nonsupervisory Employees, Manufacturing' with VIF of 397.3455702958243
Dropping column 'Average Weekly Earnings of Production and Nonsupervisory Employees, Total Private' with VIF of 268.26128186526233
Dropping column 'Average Hourly Earnings of Production and Nonsupervisory Employees, Goods-Producing' with VIF of 123.5553144586158
Dropping column 'Personal current transfer receipts: Government social benefits to persons' with VIF of 95.79181091547315
Dropping column 'Average Hourly Earnings of Production and Nonsupervisory Employees, Manufacturing' with VIF of 50.250329234750126
Dropping column 'All Employees, Health Care and Social Assistance' with VIF of 39.217110203687895
Dropping column 'Production and Nonsupervisory Employees, Total Private' with VIF of 28.278422915345445
Dropping

Unnamed: 0_level_0,Personal Income,"Average Weekly Hours of Production and Nonsupervisory Employees, Manufacturing",Household Estimates,Business Applications: Retail Trade in the United States,Business Applications: Total for All NAICS in the United States,"Average Weekly Hours of Production and Nonsupervisory Employees, Total Private",Personal current transfer receipts: Government social benefits to persons: Social security,"Average Hourly Earnings of Production and Nonsupervisory Employees, Durable Goods",Job Openings: Health Care and Social Assistance,"Average Hourly Earnings of Production and Nonsupervisory Employees, Construction",...,Quits: Health Care and Social Assistance,"Average Hourly Earnings of Production and Nonsupervisory Employees, Leisure and Hospitality",Personal current transfer receipts: Government social benefits to persons: Medicaid,Harmonized Index of Consumer Prices: Actual Rentals for Housing for Euro Area (18 countries),Business Applications: Transportation and Warehousing in the United States,"Average Hourly Earnings of Production and Nonsupervisory Employees, Retail Trade","Average Hourly Earnings of Production and Nonsupervisory Employees, Transportation and Warehousing",M1 for Turkey,Hires: Health Care and Social Assistance,Layoffs and Discharges: Health Care and Social Assistance
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004 Q4,2.084264,-0.490597,0.632232,9.592754,14.836085,0.197824,1.076448,1.485443,0.845921,0.294832,...,-0.268817,1.055697,2.16235,0.548141,5.168964,0.826446,0.344269,3.604737,1.11465,-22.506394
2005 Q1,0.084274,0.082169,0.194956,0.061821,16.637336,-0.098717,3.072853,0.468384,7.130018,0.121044,...,6.06469,0.396254,2.331636,0.466145,5.032512,0.901639,0.161453,0.370556,5.354331,24.422442
2005 Q2,1.608286,-0.492611,0.087544,1.284014,-2.991537,0.0,1.256992,0.407925,4.753915,0.466321,...,3.684879,0.574094,3.130185,0.46005,3.945072,0.406174,0.120895,13.419358,2.167414,3.183024
2005 Q3,1.566255,0.330033,0.53964,-5.019228,-14.926648,0.0,0.233979,0.69646,1.494928,0.567303,...,-1.22549,0.749197,-3.432003,0.53231,-1.051798,0.080906,0.804991,9.869985,-0.950988,-4.627249
2005 Q4,1.78939,0.904605,0.420849,14.751832,11.440775,0.197628,0.667877,1.229587,-7.101526,0.358974,...,-2.233251,0.566572,1.754776,0.408799,12.192233,0.0,0.53903,30.58759,-2.880355,-18.598383
