In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import yfinance as yf
import pickle
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

#Test

pd.set_option('display.max_rows', None)  # None means unlimited
pd.set_option('display.max_columns', None)

In [4]:
def get_tickers():

    # Scrape the Wikipedia page
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the table with id 'constituents'
    table = soup.find('table', {'id': 'constituents'})

    # Convert the table to a DataFrame
    currentconstituents = pd.read_html(str(table))[0]

    # Extract the table with id 'changes'
    changes_table = soup.find('table', {'id': 'changes'})

    # Convert the table to a DataFrame
    spxchanges = pd.read_html(str(changes_table))[0]

    # Rename columns based on the provided R code and remove the first two rows
    spxchanges.columns = ['Date', 'AddTicker', 'AddName', 'RemovedTicker', 'RemovedName', 'Reason']
    spxchanges = spxchanges.iloc[2:].reset_index(drop=True)

    # Convert the 'Date' column to a datetime format
    spxchanges['Date'] = pd.to_datetime(spxchanges['Date'], format='%B %d, %Y')

    # Extract year and month columns
    spxchanges['year'] = spxchanges['Date'].dt.year
    spxchanges['month'] = spxchanges['Date'].dt.month

    # Create the month sequence
    currentmonth = pd.Timestamp(datetime.datetime.now().replace(day=1))
    monthseq = pd.date_range(start='1990-01-01', end=currentmonth, freq='MS')[::-1]

    # Initialize spxstocks DataFrame
    spxstocks = currentconstituents[['Symbol', 'Security']].copy()
    spxstocks.columns = ['Ticker', 'Name']
    spxstocks['Date'] = currentmonth
    lastrunstocks = spxstocks

    # Iterate through months, working backward in time
    for d in monthseq[1:]:
        y, m = d.year, d.month
        changes = spxchanges[(spxchanges['year'] == y) & (spxchanges['month'] == m)]

        # Remove added tickers
        tickerstokeep = lastrunstocks[~lastrunstocks['Ticker'].isin(changes['AddTicker'])].copy()
        tickerstokeep['Date'] = d

        # Add back the removed tickers
        tickerstoadd = changes[changes['RemovedTicker'].notnull()][['Date', 'RemovedTicker', 'RemovedName']]
        tickerstoadd.columns = ['Date', 'Ticker', 'Name']

        # Combine the data for this month
        thismonth = pd.concat([tickerstokeep, tickerstoadd], ignore_index=True)
        spxstocks = pd.concat([spxstocks, thismonth], ignore_index=True)

        lastrunstocks = thismonth

        grouped_spx = spxstocks.groupby('Ticker').first().reset_index()

        return currentconstituents, spxchanges, spxstocks, grouped_spx



In [5]:
currentconstituents, spxchanges, spxstocks, grouped_spx = get_tickers()

with open('spxstocks.pkl', 'wb') as file:
    pickle.dump([currentconstituents, spxchanges, spxstocks, grouped_spx], file)

  currentconstituents = pd.read_html(str(table))[0]
  spxchanges = pd.read_html(str(changes_table))[0]


In [2]:
with open('spxstocks.pkl', 'rb') as file:
    currentconstituents, spxchanges, spxstocks, grouped_spx = pickle.load(file)

In [3]:
currentconstituents.head(10)

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
5,ADBE,Adobe Inc.,Information Technology,Application Software,"San Jose, California",1997-05-05,796343,1982
6,AMD,Advanced Micro Devices,Information Technology,Semiconductors,"Santa Clara, California",2017-03-20,2488,1969
7,AES,AES Corporation,Utilities,Independent Power Producers & Energy Traders,"Arlington, Virginia",1998-10-02,874761,1981
8,AFL,Aflac,Financials,Life & Health Insurance,"Columbus, Georgia",1999-05-28,4977,1955
9,A,Agilent Technologies,Health Care,Life Sciences Tools & Services,"Santa Clara, California",2000-06-05,1090872,1999


SessionNotCreatedException: Message: session not created: probably user data directory is already in use, please specify a unique value for --user-data-dir argument, or don't use --user-data-dir
Stacktrace:
#0 0x59260adf6d1a <unknown>
#1 0x59260a8b3040 <unknown>
#2 0x59260a8ecb94 <unknown>
#3 0x59260a8e88cf <unknown>
#4 0x59260a939439 <unknown>
#5 0x59260a938966 <unknown>
#6 0x59260a92a8a3 <unknown>
#7 0x59260a8f6a88 <unknown>
#8 0x59260a8f7bf1 <unknown>
#9 0x59260adbd87b <unknown>
#10 0x59260adc1761 <unknown>
#11 0x59260ada6012 <unknown>
#12 0x59260adc22d4 <unknown>
#13 0x59260ad8a2ef <unknown>
#14 0x59260ade5628 <unknown>
#15 0x59260ade5806 <unknown>
#16 0x59260adf5b96 <unknown>
#17 0x76e877a94ac3 <unknown>


In [None]:
# Let's assume spxstocks['Ticker'] contains the tickers of the S&P 500 companies
stock_prices = {}
for ticker in grouped_spx['Ticker']:
    stock_prices[ticker] = get_stock_price_data(ticker)

# Save the all_data dictionary to a pickle file
with open('stock_prices.pkl', 'wb') as file:
    pickle.dump(stock_prices, file)