In [1]:
pip install iexfinance




In [2]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [56]:
import csv
import os
import time
from datetime import date, timedelta, datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
from iexfinance.stocks import get_historical_data, get_historical_intraday, Stock

In [280]:
#IEX Finance API no longer offers a free version, so the free-trial key expires weekly.
#Supply a fresh key through the IEX_TOKEN environment variable instead of editing this cell.
#The literal below is the original public (pk_) key, kept only as a fallback so the notebook
#still runs when the variable is unset; never place a secret key in the notebook itself.
api_key = os.environ.get("IEX_TOKEN") or "pk_f85211ff18244b6cb07ec6c9e37723f3"

In [275]:
def get_tickers():
    '''Returns a list of unique, upper-cased link texts scraped from the S&P 500 table at
    wikipedia.org/wiki/List_of_S%26P_500_companies

    The text of every "external text" link inside the first "wikitable sortable"
    table is collected in page order. Each distinct value is kept once.

    Raises:
        requests.HTTPError: if Wikipedia answers with an error status.
        ValueError: if the expected table is not present in the page.
    '''
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(
        "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies",
        timeout=30,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    table = soup.find("table", class_ = "wikitable sortable")
    if table is None:
        raise ValueError("Could not find the 'wikitable sortable' table on the S&P 500 page")
    company_elements = table.find_all("a", class_ = "external text")

    tickers = []
    seen = set()

    for company_element in company_elements:
        # An anchor without children has no text to read.
        if not company_element.contents:
            continue
        ticker = str(company_element.contents[0]).strip().upper()
        # Compare in the same (upper) case that is stored; the previous check
        # compared a lower-cased name against upper-cased entries and never matched.
        if ticker and ticker not in seen:
            seen.add(ticker)
            tickers.append(ticker)

    return tickers

In [46]:
# Scrape the ticker list once and keep it at module level; later cells pass
# `tickers` to the data-set functions instead of scraping again.
tickers = get_tickers()

In [268]:
def create_sorted_historical_csv(
    tickers,
    start=datetime(2018, 6, 4),
    end=datetime(2023, 6, 4),
    output_path="SP500DataSetDateSorted.csv",
):
    '''Creates a date-sorted csv of historical stock data for the given list of stock tickers

    Args:
        tickers: iterable of ticker symbols to download.
        start: first day of the window requested from the API (default 2018-06-04).
        end: last day of the window requested from the API (default 2023-06-04).
        output_path: where the csv is written (default "SP500DataSetDateSorted.csv").

    The per-ticker frames are concatenated and sorted by "priceDate". A running
    "Helpers" counter (0..n-1) is added as the first column, "priceDate" becomes
    the index, and the result is written to `output_path` and printed.

    Raises:
        ValueError: if `tickers` is empty, since there would be nothing to write.
    '''
    unwanted_columns = ["id", "subkey", "updated", "label", "change", "high", "fHigh", "uHigh", "low", "uLow", "fLow"]

    frames = []
    for ticker in tickers:
        history = get_historical_data(ticker, start, end, output_format='pandas', token=api_key)
        frames.append(history.drop(unwanted_columns, axis=1))

    if not frames:
        raise ValueError("tickers is empty; there is no historical data to write")

    combined = pd.concat(frames)
    # A stable sort keeps rows that share a date in ticker order, so the csv is
    # reproducible from run to run.
    combined = combined.sort_values(by=["priceDate"], kind="mergesort")
    combined.insert(0, 'Helpers', range(0, len(combined)))
    combined = combined.set_index("priceDate")
    combined.to_csv(output_path)
    print(combined)

In [9]:
def create_key_stats_csv(tickers):
    '''Writes the key stats of every given ticker to "SP500DataSetKeyStats.csv"

    For each ticker, `Stock(ticker, token=api_key).get_key_stats()` is called and
    the results are concatenated in ticker order before being saved.
    NOTE(review): the results are assumed to be pandas objects because they are
    passed straight to pd.concat. Which reporting period the stats cover is
    decided by the API, not by this function.
    '''
    per_ticker_stats = [
        Stock(symbol, token=api_key).get_key_stats()
        for symbol in tickers
    ]
    pd.concat(per_ticker_stats).to_csv("SP500DataSetKeyStats.csv")

In [273]:
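#One-time bootstrap: uncomment to build SP500DataSetDateSorted.csv, the file that update_data_set is given below.
#create_sorted_historical_csv(tickers)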

In [279]:
def update_data_set(to_be_updated, tickers, start_date=None, today_date=None):
    '''Extends a saved data set with newly fetched rows and trims it to a rolling 5-year window.

    Args:
        to_be_updated: path of the csv to extend. It must contain "priceDate"
            and "Helpers" columns, as written by create_sorted_historical_csv.
        tickers: iterable of ticker symbols to download.
        start_date: first day to fetch. Defaults to the day after the last
            "priceDate" stored in `to_be_updated`, formatted as 'YYYY-MM-DD'.
        today_date: last day to fetch. Defaults to today's date.

    Both defaults are resolved when the function is called. They used to be
    evaluated once when the function was defined, which read a hard-coded csv at
    definition time and left stale dates behind in a long-running kernel.

    The result is written to "SP500DataSetDateSortedUpdated<today>.csv"; the
    input file is left untouched.

    Author's note: due to technical limitations of the IEX Finance API, I was
    unable to find a way to use the get_historical_data function to pull stock
    data after the 4pm close as the API tracks after hours trading.

    Raises:
        ValueError: if `tickers` is empty, since there would be nothing to add.
    '''
    # Read the existing data first so a missing file fails before any API call is made.
    previous = pd.read_csv(to_be_updated)

    if today_date is None:
        today_date = date.today()
    if start_date is None:
        last_saved = datetime.strptime(str(previous["priceDate"].iloc[-1])[:10], '%Y-%m-%d')
        start_date = (last_saved + timedelta(days=1)).strftime('%Y-%m-%d')

    unwanted_columns = ["id", "subkey", "updated", "label", "change", "high", "fHigh", "uHigh", "low", "uLow", "fLow"]

    frames = []
    for ticker in tickers:
        history = get_historical_data(ticker, start_date, today_date, output_format='pandas', token=api_key)
        frames.append(history.drop(unwanted_columns, axis=1))

    if not frames:
        raise ValueError("tickers is empty; there is no new data to add")

    # A stable sort keeps rows that share a date in ticker order.
    fetched = pd.concat(frames).sort_values(by=["priceDate"], kind="mergesort")

    # ignore_index gives every row a unique positional label, so trimming the
    # old rows below can never remove freshly fetched rows that happen to
    # carry the same index label.
    merged = pd.concat([previous, fetched], ignore_index=True)

    # Drop everything older than five years before `today_date`. If that exact
    # day has no rows, step back one day at a time, at most 5 days. When no
    # candidate day is present nothing is trimmed.
    window_start = (pd.Timestamp(today_date) - pd.DateOffset(years=5)).date()
    price_dates = merged["priceDate"].astype(str).str[:10]
    max_days_back = 5
    for days_back in range(max_days_back + 1):
        candidate = str(window_start - timedelta(days=days_back))
        matches = (price_dates == candidate).to_numpy()
        if matches.any():
            # argmax on a boolean array is the position of the first match.
            merged = merged.iloc[matches.argmax():]
            break

    # Renumber the helper column so it runs 0..n-1 again after the trim.
    merged = merged.drop(["Helpers"], axis=1)
    merged.insert(0, "Helpers", range(0, len(merged)))
    merged = merged.set_index("priceDate")
    merged.to_csv("SP500DataSetDateSortedUpdated" + str(date.today()) + ".csv")

In [278]:
# Refresh the saved S&P 500 data set, leaving start_date and today_date at their defaults.
# The result is written to a new "SP500DataSetDateSortedUpdated<today>.csv"; the input csv is not overwritten.
update_data_set("SP500DataSetDateSorted.csv", tickers)

In [44]:
# Load the NYSE listing from a local csv and pull its "ACT Symbol" column into a plain list.
# NOTE(review): "NYSE Listed.csv" is not created by any cell visible here, so it presumably
# has to exist next to the notebook already — confirm where it comes from.
read_file = pd.read_csv("NYSE Listed.csv")
nyse_tickers = read_file["ACT Symbol"].to_list()
#create_sorted_historical_csv(nyse_tickers)

### Markowitz Efficient Frontier

The Markowitz efficient frontier represents the boundary of the set of feasible portfolios (in this case portfolios consisting exclusively of equities) that have the maximum return for a given level of risk. Any portfolios above the frontier cannot be achieved. 

*Markowitz efficient frontier definition.* Nasdaq. (n.d.). https://www.nasdaq.com/glossary/m/markowitz-efficient-frontier 

![Markowitz efficient frontier](mpt-image-2.jpg "Markowitz efficient frontier")

### Sharpe Ratio

The Sharpe Ratio is a measurement of the risk-adjusted return of an asset compared to an asset with zero risk. While assets with zero risk only exist theoretically, nearly risk-free assets do exist and are used to calculate the risk-free rate. The most common method is to take the return of United States Treasury bonds over the same duration as the other asset and subtract the current rate of inflation. 

Sharpe, W. F. (n.d.). *The Sharpe Ratio.* The Sharpe Ratio. http://web.stanford.edu/~wfsharpe/art/sr/sr.htm 


$\text{Sharpe Ratio} = \frac{R_p - R_f}{\sigma_p}$

$R_p$ = return of portfolio \
$R_f$ = risk free rate \
$\sigma_p$ = standard deviation of the portfolio's excess return 

### Long Short-Term Memory Networks (LSTM)

LSTM stands for long short-term memory. It is a type of recurrent neural network (RNN) that is capable of processing both individual data points as well as sequences of data. "The central role of an LSTM model is held by a memory cell known as a ‘cell state’ that maintains its state over time. It can be visualized as a conveyor belt through which information just flows, unchanged. Information can be added to or removed from the cell state in LSTM and is regulated by gates. These gates optionally let the information flow in and out of the cell. It contains a pointwise multiplication operation and a sigmoid neural net layer that assist the mechanism. The sigmoid layer gives out numbers between zero and one, where zero means ‘nothing should be let through’, and one means ‘everything should be let through.'"  

What is LSTM - introduction to long short term memory. Intellipaat Blog. (2023, June 13). https://intellipaat.com/blog/what-is-lstm/?US#:~:text=The%20central%20role%20of%20an,which%20information%20just%20flows%2C%20unchanged. 