In [2]:
import sys
import plotly
import cvxpy as cvx
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import yfinance as yf
import requests
from bs4 import BeautifulSoup
from sklearn.decomposition import PCA
from scipy import stats

%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14, 8)

In [3]:
# Function to fetch the list of S&P 500 companies from Wikipedia
def get_sp500_tickers():
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'id': 'constituents'})
    tickers = []
    for row in table.find_all('tr')[1:]:
        ticker = row.find_all('td')[0].text.strip().replace('.', '-')
        tickers.append(ticker)
    return tickers

# Fetch the list of S&P 500 tickers
#sp500_tickers = get_sp500_tickers()
#print(sp500_tickers)

#AI and Utilities Stocks
sp500_tickers = ['NVDA', 'SOUN', 'PRCT', 'AVAV', 'ISRG', 'PEGA', 'HLX', 'CLW', 'SLVM', 'MERC']
print(sp500_tickers)

['NVDA', 'SOUN', 'PRCT', 'AVAV', 'ISRG', 'PEGA', 'HLX', 'CLW', 'SLVM', 'MERC']


In [4]:
# Function to fetch stock data for a list of tickers
def fetch_stock_data(tickers, start_date, end_date):
    all_data = []
    problematic_tickers = []
    for ticker in tickers:
        try:
            stock = yf.Ticker(ticker)
            hist = stock.history(start=start_date, end=end_date)
            if not hist.empty:
                hist.reset_index(inplace=True)
                hist['Ticker'] = ticker
                all_data.append(hist)
            else:
                print(f"No data found for {ticker}")
                problematic_tickers.append(ticker)
        except Exception as e:
            print(f"Could not download data for {ticker}: {e}")
            problematic_tickers.append(ticker)
    if all_data:
        return pd.concat(all_data, ignore_index=True), problematic_tickers
    else:
        return pd.DataFrame(), problematic_tickers

In [5]:
# Fetch data for the desired date range
start_date = '2018-01-01'
end_date = '2023-12-31'
stock_data, problematic_tickers = fetch_stock_data(sp500_tickers, start_date, end_date)

if not stock_data.empty:
    # Format the DataFrame
    stock_data = stock_data[['Ticker', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume']]
    stock_data['Adj Close'] = stock_data['Close']  # Assuming 'Adj Close' is the same as 'Close' for simplicity
    stock_data['Adj Volume'] = stock_data['Volume']  # Assuming 'Adj Volume' is the same as 'Volume' for simplicity
    stock_data['Adj High'] = stock_data['High']  # Assuming 'Adj Close' is the same as 'Close' for simplicity
    stock_data['Adj Low'] = stock_data['Low']  # Assuming 'Adj Volume' is the same as 'Volume' for simplicity

    # Save to CSV
    file_path = 'sp500_stock_prices.csv'
    stock_data.to_csv(file_path, index=False)

    print("CSV file saved at:", file_path)
else:
    print("No stock data available to save.")

if problematic_tickers:
    print("Problematic tickers:", problematic_tickers)

CSV file saved at: sp500_stock_prices.csv


In [10]:
stock_data

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume,Adj Close,Adj Volume,Adj High,Adj Low
0,NVDA,2018-01-02 00:00:00-05:00,4.842343,4.934352,4.810685,4.930642,355616000,4.930642,355616000,4.934352,4.810685
1,NVDA,2018-01-03 00:00:00-05:00,5.048127,5.285570,5.039470,5.255147,914704000,5.255147,914704000,5.285570,5.039470
2,NVDA,2018-01-04 00:00:00-05:00,5.336522,5.393162,5.260590,5.282850,583268000,5.282850,583268000,5.393162,5.260590
3,NVDA,2018-01-05 00:00:00-05:00,5.297688,5.364964,5.220767,5.327616,580124000,5.327616,580124000,5.364964,5.220767
4,NVDA,2018-01-08 00:00:00-05:00,5.451286,5.565060,5.406270,5.490860,881216000,5.490860,881216000,5.565060,5.406270
...,...,...,...,...,...,...,...,...,...,...,...
12127,MERC,2023-12-22 00:00:00-05:00,9.345470,9.522543,9.315959,9.345470,162500,9.345470,162500,9.522543,9.315959
12128,MERC,2023-12-26 00:00:00-05:00,9.384819,9.561892,9.365145,9.493030,173700,9.493030,173700,9.561892,9.365145
12129,MERC,2023-12-27 00:00:00-05:00,9.552054,9.620915,9.384820,9.463518,125800,9.463518,125800,9.620915,9.384820
12130,MERC,2023-12-28 00:00:00-05:00,9.473356,9.493030,9.325795,9.374982,130800,9.374982,130800,9.493030,9.325795


In [12]:
#function to get mean for each stock
stock_data.groupby('Ticker').median()

Unnamed: 0_level_0,Date,Open,High,Low,Close,Volume,Adj Close,Adj Volume,Adj High,Adj Low
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AVAV,2020-12-30 00:00:00-05:00,82.120003,83.43,80.709999,81.860001,193800.0,81.860001,193800.0,83.43,80.709999
CLW,2020-12-30 00:00:00-05:00,32.830002,33.34,32.25,32.810001,118500.0,32.810001,118500.0,33.34,32.25
HLX,2020-12-30 00:00:00-05:00,6.6,6.71,6.42,6.6,1526100.0,6.6,1526100.0,6.71,6.42
ISRG,2020-12-30 00:00:00-05:00,226.873337,229.289993,223.389999,227.113327,1748300.0,227.113327,1748300.0,229.289993,223.389999
MERC,2020-12-30 00:00:00-05:00,10.826749,10.997821,10.606943,10.794888,266000.0,10.794888,266000.0,10.997821,10.606943
NVDA,2020-12-30 00:00:00-05:00,13.107381,13.347691,12.913652,13.120126,438688000.0,13.120126,438688000.0,13.347691,12.913652
PEGA,2020-12-30 00:00:00-05:00,68.135104,69.452156,67.402251,68.313385,305400.0,68.313385,305400.0,69.452156,67.402251
PRCT,2022-11-04 00:00:00-04:00,35.529999,36.82,34.369999,35.59,301500.0,35.59,301500.0,36.82,34.369999
SLVM,2022-11-09 00:00:00-05:00,40.885809,41.634446,40.327081,40.928265,364800.0,40.928265,364800.0,41.634446,40.327081
SOUN,2023-03-01 00:00:00-05:00,2.65,2.77,2.49,2.61,5446800.0,2.61,5446800.0,2.77,2.49


In [27]:
open_prices = stock_data.pivot(index='Date', columns='Ticker', values='Open')
high_prices = stock_data.pivot(index='Date', columns='Ticker', values='High')
low_prices = stock_data.pivot(index='Date', columns='Ticker', values='Low')
close_prices = stock_data.pivot(index='Date', columns='Ticker', values='Close')
volume = stock_data.pivot(index='Date', columns='Ticker', values='Volume')
adj_close_prices = stock_data.pivot(index='Date', columns='Ticker', values='Adj Close')
adj_volume = stock_data.pivot(index='Date', columns='Ticker', values='Adj Volume')
close = stock_data.reset_index().pivot(index='Date', columns='Ticker', values='Adj Close')
high = stock_data.reset_index().pivot(index='Date', columns='Ticker', values='Adj High')
low = stock_data.reset_index().pivot(index='Date', columns='Ticker', values='Adj Low')

## Resample Adjusted Prices

The trading signal you'll develop in this project does not need to be based on daily prices, for instance, you can use month-end prices to perform trading once a month. To do this, you must first resample the daily adjusted closing prices into monthly buckets, and select the last observation of each month.

Implement the `resample_prices` to resample `close_prices` at the sampling frequency of `freq`.

In [34]:
def resample_prices(close_prices, freq='ME'):
    """
    Resample close prices for each ticker at specified frequency.
    
    Parameters
    ----------
    close_prices : DataFrame
        Close prices for each ticker and date
    freq : str
        What frequency to sample at
        For valid freq choices, see http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    
    Returns
    -------
    prices_resampled : DataFrame
        Resampled prices for each ticker and date
    """
    # TODO: Implement Function
    
    prices = close_prices.resample(freq).last()
    
    return prices

In [40]:
monthly_close = resample_prices(close)
monthly_close

Ticker,AVAV,CLW,HLX,ISRG,MERC,NVDA,PEGA,PRCT,SLVM,SOUN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-31 00:00:00-05:00,51.360001,47.049999,7.53,143.889999,11.832952,6.079519,50.226219,,,
2018-02-28 00:00:00-05:00,49.720001,37.599998,5.96,142.149994,10.625505,5.989251,57.288502,,,
2018-03-31 00:00:00-04:00,45.509998,39.099998,5.79,137.610001,10.122608,5.731614,59.936779,,,
2018-04-30 00:00:00-04:00,54.500000,23.650000,7.72,146.926666,10.895013,5.566042,60.332073,,,
2018-05-31 00:00:00-04:00,57.860001,24.049999,7.60,153.223328,13.049625,6.245301,61.122662,,,
...,...,...,...,...,...,...,...,...,...,...
2023-08-31 00:00:00-04:00,97.029999,38.290001,10.14,312.679993,8.791004,49.340225,49.547981,34.110001,40.472855,2.52
2023-09-30 00:00:00-04:00,111.529999,36.250000,11.17,292.290009,8.372000,43.489563,43.342178,32.810001,42.575466,2.01
2023-10-31 00:00:00-04:00,114.660004,33.810001,9.80,262.220001,7.747514,40.771152,42.673225,26.790001,43.518532,1.59
2023-11-30 00:00:00-05:00,137.610001,35.070000,9.32,310.839996,9.347756,46.759853,51.898788,37.070000,49.393044,2.14


## Compute Log Returns

Compute log returns ($R_t$) from prices ($P_t$) as your primary momentum indicator:

$$R_t = log_e(P_t) - log_e(P_{t-1})$$

Implement the `compute_log_returns` function below, such that it accepts a dataframe (like one returned by `resample_prices`), and produces a similar dataframe of log returns. Use Numpy's [log function](https://docs.scipy.org/doc/numpy/reference/generated/numpy.log.html) to help you calculate the log returns.

In [45]:
def compute_log_returns(prices):
    """
    Compute log returns for each ticker.
    
    Parameters
    ----------
    prices : DataFrame
        Prices for each ticker and date
    
    Returns
    -------
    log_returns : DataFrame
        Log returns for each ticker and date
    """
    # TODO: Implement Function
    
    return np.log(prices) - np.log(prices.shift(1))

In [49]:
monthly_close_returns = compute_log_returns(monthly_close)
monthly_close_returns

Ticker,AVAV,CLW,HLX,ISRG,MERC,NVDA,PEGA,PRCT,SLVM,SOUN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-31 00:00:00-05:00,,,,,,,,,,
2018-02-28 00:00:00-05:00,-0.032452,-0.224207,-0.233825,-0.012166,-0.107631,-0.014959,0.131563,,,
2018-03-31 00:00:00-04:00,-0.088475,0.039118,-0.028938,-0.032459,-0.048486,-0.043969,0.045190,,,
2018-04-30 00:00:00-04:00,0.180269,-0.502759,0.287682,0.065510,0.073534,-0.029313,0.006574,,,
2018-05-31 00:00:00-04:00,0.059826,0.016772,-0.015666,0.041963,0.180454,0.115145,0.013019,,,
...,...,...,...,...,...,...,...,...,...,...
2023-08-31 00:00:00-04:00,0.018410,0.172291,0.054725,-0.036797,0.021124,0.054674,-0.060364,-0.009628,-0.161069,0.078391
2023-09-30 00:00:00-04:00,0.139273,-0.054749,0.096744,-0.067434,-0.048836,-0.126219,-0.133815,-0.038857,0.050647,-0.226124
2023-10-31 00:00:00-04:00,0.027678,-0.069683,-0.130849,-0.108563,-0.077521,-0.064546,-0.015555,-0.202705,0.021909,-0.234401
2023-11-30 00:00:00-05:00,0.182452,0.036589,-0.050220,0.170094,0.187764,0.137050,0.195724,0.324779,0.126623,0.297072


In [159]:
monthly_close = resample_prices(close)
monthly_close_returns = compute_log_returns(monthly_close)

## Shift Returns
Implement the `shift_returns` function to shift the log returns to the previous or future returns in the time series. For example, the parameter `shift_n` is 2 and `returns` is the following:
​
```
                           Returns
               A         B         C         D
2013-07-08     0.015     0.082     0.096     0.020     ...
2013-07-09     0.037     0.095     0.027     0.063     ...
2013-07-10     0.094     0.001     0.093     0.019     ...
2013-07-11     0.092     0.057     0.069     0.087     ...
...            ...       ...       ...       ...
```
​
the output of the `shift_returns` function would be:
```
                        Shift Returns
               A         B         C         D
2013-07-08     NaN       NaN       NaN       NaN       ...
2013-07-09     NaN       NaN       NaN       NaN       ...
2013-07-10     0.015     0.082     0.096     0.020     ...
2013-07-11     0.037     0.095     0.027     0.063     ...
...            ...       ...       ...       ...
```
Using the same `returns` data as above, the `shift_returns` function should generate the following with `shift_n` as -2:
```
                        Shift Returns
               A         B         C         D
2013-07-08     0.094     0.001     0.093     0.019     ...
2013-07-09     0.092     0.057     0.069     0.087     ...
...            ...       ...       ...       ...       ...
...            ...       ...       ...       ...       ...
...            NaN       NaN       NaN       NaN       ...
...            NaN       NaN       NaN       NaN       ...
```
_Note: The "..." represents data points we're not showing._

In [55]:
def shift_returns(returns, shift_n):
    """
    Generate shifted returns
    
    Parameters
    ----------
    returns : DataFrame
        Returns for each ticker and date
    shift_n : int
        Number of periods to move, can be positive or negative
    
    Returns
    -------
    shifted_returns : DataFrame
        Shifted returns for each ticker and date
    """
    # TODO: Implement Function
    
    return returns.shift(shift_n)

In [59]:
prev_returns = shift_returns(monthly_close_returns, 1)
prev_returns
lookahead_returns = shift_returns(monthly_close_returns, -1)
lookahead_returns

Ticker,AVAV,CLW,HLX,ISRG,MERC,NVDA,PEGA,PRCT,SLVM,SOUN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-31 00:00:00-05:00,-0.032452,-0.224207,-0.233825,-0.012166,-0.107631,-0.014959,0.131563,,,
2018-02-28 00:00:00-05:00,-0.088475,0.039118,-0.028938,-0.032459,-0.048486,-0.043969,0.045190,,,
2018-03-31 00:00:00-04:00,0.180269,-0.502759,0.287682,0.065510,0.073534,-0.029313,0.006574,,,
2018-04-30 00:00:00-04:00,0.059826,0.016772,-0.015666,0.041963,0.180454,0.115145,0.013019,,,
2018-05-31 00:00:00-04:00,0.210692,-0.040302,0.091715,0.040106,0.093894,-0.062544,-0.120474,,,
...,...,...,...,...,...,...,...,...,...,...
2023-08-31 00:00:00-04:00,0.139273,-0.054749,0.096744,-0.067434,-0.048836,-0.126219,-0.133815,-0.038857,0.050647,-0.226124
2023-09-30 00:00:00-04:00,0.027678,-0.069683,-0.130849,-0.108563,-0.077521,-0.064546,-0.015555,-0.202705,0.021909,-0.234401
2023-10-31 00:00:00-04:00,0.182452,0.036589,-0.050220,0.170094,0.187764,0.137050,0.195724,0.324779,0.126623,0.297072
2023-11-30 00:00:00-05:00,-0.087824,0.029501,0.098038,0.081872,-0.002352,0.057263,-0.061293,0.122716,-0.023545,-0.009390


## Generate Trading Signal

A trading signal is a sequence of trading actions, or results that can be used to take trading actions. A common form is to produce a "long" and "short" portfolio of stocks on each date (e.g. end of each month, or whatever frequency you desire to trade at). This signal can be interpreted as rebalancing your portfolio on each of those dates, entering long ("buy") and short ("sell") positions as indicated.

Here's a strategy that we will try:
> For each month-end observation period, rank the stocks by _previous_ returns, from the highest to the lowest. Select the top performing stocks for the long portfolio, and the bottom performing stocks for the short portfolio.

Implement the `get_top_n` function to get the top performing stock for each month. Get the top performing stocks from `prev_returns` by assigning them a value of 1. For all other stocks, give them a value of 0. For example, using the following `prev_returns`:

```
                                     Previous Returns
               A         B         C         D         E         F         G
2013-07-08     0.015     0.082     0.096     0.020     0.075     0.043     0.074
2013-07-09     0.037     0.095     0.027     0.063     0.024     0.086     0.025
...            ...       ...       ...       ...       ...       ...       ...
```

The function `get_top_n` with `top_n` set to 3 should return the following:
```
                                     Previous Returns
               A         B         C         D         E         F         G
2013-07-08     0         1         1         0         1         0         0
2013-07-09     0         1         0         1         0         1         0
...            ...       ...       ...       ...       ...       ...       ...
```
*Note: You may have to use Panda's [`DataFrame.iterrows`](https://pandas.pydata.org/pandas-docs/version/0.21/generated/pandas.DataFrame.iterrows.html) with [`Series.nlargest`](https://pandas.pydata.org/pandas-docs/version/0.21/generated/pandas.Series.nlargest.html) in order to implement the function. This is one of those cases where creating a vecorization solution is too difficult.*

In [62]:
def get_top_n(prev_returns, top_n):
    """
    Select the top performing stocks
    
    Parameters
    ----------
    prev_returns : DataFrame
        Previous shifted returns for each ticker and date
    top_n : int
        The number of top performing stocks to get
    
    Returns
    -------
    top_stocks : DataFrame
        Top stocks for each ticker and date marked with a 1
    """
    # TODO: Implement Function

    stocks = pd.DataFrame(0, index=prev_returns.index, columns=prev_returns.columns)

    for date, returns in prev_returns.iterrows():
            top_tickers = returns.nlargest(top_n).index
            stocks.loc[date, top_tickers] = 1
    return stocks

### View Data
We want to get the best performing and worst performing stocks. To get the best performing stocks, we'll use the `get_top_n` function. To get the worst performing stocks, we'll also use the `get_top_n` function. However, we pass in `-1*prev_returns` instead of just `prev_returns`. Multiplying by negative one will flip all the positive returns to negative and negative returns to positive. Thus, it will return the worst performing stocks.

In [65]:
top_bottom_n = 3
df_long = get_top_n(prev_returns, top_bottom_n)
df_short = get_top_n(-1*prev_returns, top_bottom_n)
df_long
df_short

Ticker,AVAV,CLW,HLX,ISRG,MERC,NVDA,PEGA,PRCT,SLVM,SOUN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-31 00:00:00-05:00,1,1,1,0,0,0,0,0,0,0
2018-02-28 00:00:00-05:00,1,1,1,0,0,0,0,0,0,0
2018-03-31 00:00:00-04:00,0,1,1,0,1,0,0,0,0,0
2018-04-30 00:00:00-04:00,1,0,0,0,1,1,0,0,0,0
2018-05-31 00:00:00-04:00,0,1,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2023-08-31 00:00:00-04:00,1,0,0,1,0,0,0,0,0,1
2023-09-30 00:00:00-04:00,0,0,0,1,0,0,1,0,1,0
2023-10-31 00:00:00-04:00,0,0,0,0,0,1,1,0,0,1
2023-11-30 00:00:00-05:00,0,0,1,0,0,0,0,1,0,1


## Projected Returns
It's now time to check if your trading signal has the potential to become profitable!

We'll start by computing the net returns this portfolio would return. For simplicity, we'll assume every stock gets an equal dollar amount of investment. This makes it easier to compute a portfolio's returns as the simple arithmetic average of the individual stock returns.

Implement the `portfolio_returns` function to compute the expected portfolio returns. Using `df_long` to indicate which stocks to long and `df_short` to indicate which stocks to short, calculate the returns using `lookahead_returns`. To help with calculation, we've provided you with `n_stocks` as the number of stocks we're investing in a single period.

In [68]:
def portfolio_returns(df_long, df_short, lookahead_returns, n_stocks):
    """
    Compute expected returns for the portfolio, assuming equal investment in each long/short stock.
    
    Parameters
    ----------
    df_long : DataFrame
        Top stocks for each ticker and date marked with a 1
    df_short : DataFrame
        Bottom stocks for each ticker and date marked with a 1
    lookahead_returns : DataFrame
        Lookahead returns for each ticker and date
    n_stocks: int
        The number number of stocks chosen for each month
    
    Returns
    -------
    portfolio_returns : DataFrame
        Expected portfolio returns for each ticker and date
    """
    
    df_long_aligned, lookahead_returns_aligned = df_long.align(lookahead_returns, join='inner', axis=0)
    df_short_aligned, _ = df_short.align(lookahead_returns, join='inner', axis=0)
    
    long_returns = (df_long_aligned * lookahead_returns_aligned).sum(axis=1) / n_stocks
    short_returns = (df_short_aligned * lookahead_returns_aligned).sum(axis=1) / n_stocks
    
    short_returns = df_short * lookahead_returns
    short_returns = short_returns.sum(axis=1) / n_stocks
    
    net_returns = (df_long_aligned - df_short_aligned) * lookahead_returns_aligned / n_stocks   
    
    return net_returns

In [72]:
expected_portfolio_returns = portfolio_returns(df_long, df_short, lookahead_returns, 2*top_bottom_n)
expected_portfolio_returns

Ticker,AVAV,CLW,HLX,ISRG,MERC,NVDA,PEGA,PRCT,SLVM,SOUN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-31 00:00:00-05:00,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,0.000000,,,
2018-02-28 00:00:00-05:00,-0.000000,0.000000,-0.000000,-0.000000,-0.000000,-0.000000,0.000000,,,
2018-03-31 00:00:00-04:00,0.000000,0.083793,-0.047947,0.010918,-0.012256,-0.004885,0.001096,,,
2018-04-30 00:00:00-04:00,-0.009971,0.002795,-0.002611,0.000000,-0.030076,-0.019191,0.002170,,,
2018-05-31 00:00:00-04:00,0.035115,0.006717,0.015286,0.000000,0.015649,0.010424,0.020079,,,
...,...,...,...,...,...,...,...,...,...,...
2023-08-31 00:00:00-04:00,-0.023212,-0.000000,0.016124,0.011239,-0.000000,-0.021036,-0.000000,-0.000000,0.008441,0.037687
2023-09-30 00:00:00-04:00,0.000000,-0.011614,-0.021808,0.018094,-0.000000,-0.000000,0.002592,-0.000000,-0.003651,-0.039067
2023-10-31 00:00:00-04:00,0.030409,0.000000,-0.008370,0.000000,0.000000,-0.022842,-0.032621,0.000000,0.021104,-0.049512
2023-11-30 00:00:00-05:00,-0.014637,0.000000,-0.016340,0.000000,-0.000000,0.000000,-0.010215,-0.020453,-0.003924,0.001565


In [74]:
expected_portfolio_returns_by_date = expected_portfolio_returns.T.sum().dropna()
portfolio_ret_mean = expected_portfolio_returns_by_date.mean()
portfolio_ret_ste = expected_portfolio_returns_by_date.sem()
portfolio_ret_annual_rate = (np.exp(portfolio_ret_mean * 12) - 1) * 100

print(expected_portfolio_returns_by_date)
print("""
Mean:                       {:.6f}
Standard Error:             {:.6f}
Annualized Rate of Return:  {:.2f}%
""".format(portfolio_ret_mean, portfolio_ret_ste, portfolio_ret_annual_rate))

Date
2018-01-31 00:00:00-05:00    0.000000
2018-02-28 00:00:00-05:00    0.000000
2018-03-31 00:00:00-04:00    0.030719
2018-04-30 00:00:00-04:00   -0.056883
2018-05-31 00:00:00-04:00    0.103270
                               ...   
2023-08-31 00:00:00-04:00    0.029243
2023-09-30 00:00:00-04:00   -0.055454
2023-10-31 00:00:00-04:00   -0.061832
2023-11-30 00:00:00-05:00   -0.064004
2023-12-31 00:00:00-05:00    0.000000
Freq: ME, Length: 72, dtype: float64

Mean:                       0.000126
Standard Error:             0.007745
Annualized Rate of Return:  0.15%



The annualized rate of return allows you to compare the rate of return from this strategy to other quoted rates of return, which are usually quoted on an annual basis. 

### T-Test
Our null hypothesis ($H_0$) is that the actual mean return from the signal is zero. We'll perform a one-sample, one-sided t-test on the observed mean return, to see if we can reject $H_0$.

We'll need to first compute the t-statistic, and then find its corresponding p-value. The p-value will indicate the probability of observing a t-statistic equally or more extreme than the one we observed if the null hypothesis were true. A small p-value means that the chance of observing the t-statistic we observed under the null hypothesis is small, and thus casts doubt on the null hypothesis. It's good practice to set a desired level of significance or alpha ($\alpha$) _before_ computing the p-value, and then reject the null hypothesis if $p < \alpha$.

For this project, we'll use $\alpha = 0.05$, since it's a common value to use.

Implement the `analyze_alpha` function to perform a t-test on the sample of portfolio returns. We've imported the `scipy.stats` module for you to perform the t-test.

Note: [`scipy.stats.ttest_1samp`](https://docs.scipy.org/doc/scipy-1.0.0/reference/generated/scipy.stats.ttest_1samp.html) performs a two-sided test, so divide the p-value by 2 to get 1-sided p-value

In [77]:
def analyze_alpha(expected_portfolio_returns_by_date):
    """
    Perform a t-test with the null hypothesis being that the expected mean return is zero.
    
    Parameters
    ----------
    expected_portfolio_returns_by_date : Pandas Series
        Expected portfolio returns for each date
    
    Returns
    -------
    t_value
        T-statistic from t-test
    p_value
        Corresponding p-value
    """
    # TODO: Implement Function
    
    t_stat, p_value_two_sided = stats.ttest_1samp(expected_portfolio_returns_by_date, 0)
    
    p_value_one_sided = p_value_two_sided / 2
    
    return t_stat, p_value_one_sided

In [79]:
t_value, p_value = analyze_alpha(expected_portfolio_returns_by_date)
print("""
Alpha analysis:
 t-value:        {:.3f}
 p-value:        {:.6f}
""".format(t_value, p_value))


Alpha analysis:
 t-value:        0.016
 p-value:        0.493538



## The Alpha Research Process

In this project you will code and evaluate a "breakout" signal. It is important to understand where these steps fit in the alpha research workflow. The signal-to-noise ratio in trading signals is very low and, as such, it is very easy to fall into the trap of _overfitting_ to noise. It is therefore inadvisable to jump right into signal coding. To help mitigate overfitting, it is best to start with a general observation and hypothesis; i.e., you should be able to answer the following question _before_ you touch any data:

> What feature of markets or investor behaviour would lead to a persistent anomaly that my signal will try to use?

Ideally the assumptions behind the hypothesis will be testable _before_ you actually code and evaluate the signal itself. The workflow therefore is as follows:

![image](images/alpha_steps.png)

In this project, we assume that the first three steps area done ("observe & research", "form hypothesis", "validate hypothesis"). The hypothesis you'll be using for this project is the following:
- In the absence of news or significant investor trading interest, stocks oscillate in a range.
- Traders seek to capitalize on this range-bound behaviour periodically by selling/shorting at the top of the range and buying/covering at the bottom of the range. This behaviour reinforces the existence of the range.
- When stocks break out of the range, due to, e.g., a significant news release or from market pressure from a large investor:
    - the liquidity traders who have been providing liquidity at the bounds of the range seek to cover their positions to mitigate losses, thus magnifying the move out of the range, _and_
    - the move out of the range attracts other investor interest; these investors, due to the behavioural bias of _herding_ (e.g., [Herd Behavior](https://www.investopedia.com/university/behavioral_finance/behavioral8.asp)) build positions which favor continuation of the trend.


Using this hypothesis, let start coding..
## Compute the Highs and Lows in a Window
You'll use the price highs and lows as an indicator for the breakout strategy. In this section, implement `get_high_lows_lookback` to get the maximum high price and minimum low price over a window of days. The variable `lookback_days` contains the number of days to look in the past. Make sure this doesn't include the current day.

In [82]:
def get_high_lows_lookback(high, low, lookback_days):
    """
    Get the highs and lows in a lookback window.
    
    Parameters
    ----------
    high : DataFrame
        High price for each ticker and date
    low : DataFrame
        Low price for each ticker and date
    lookback_days : int
        The number of days to look back
    
    Returns
    -------
    lookback_high : DataFrame
        Lookback high price for each ticker and date
    lookback_low : DataFrame
        Lookback low price for each ticker and date
    """
    #TODO: Implement function

    return high.shift(1).rolling(window=lookback_days).max(), low.shift(1).rolling(window=lookback_days).min()

In [84]:
lookback_days = 50
lookback_high, lookback_low = get_high_lows_lookback(high, low, lookback_days)
lookback_high
lookback_low

Ticker,AVAV,CLW,HLX,ISRG,MERC,NVDA,PEGA,PRCT,SLVM,SOUN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-02 00:00:00-05:00,,,,,,,,,,
2018-01-03 00:00:00-05:00,,,,,,,,,,
2018-01-04 00:00:00-05:00,,,,,,,,,,
2018-01-05 00:00:00-05:00,,,,,,,,,,
2018-01-08 00:00:00-05:00,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
2023-12-22 00:00:00-05:00,104.769997,31.379999,8.82,254.850006,7.454787,39.221489,37.601163,24.83,40.070455,1.49
2023-12-26 00:00:00-05:00,109.680000,31.379999,8.82,254.850006,7.454787,39.221489,37.601163,24.83,40.739441,1.49
2023-12-27 00:00:00-05:00,111.400002,31.379999,8.82,254.850006,7.454787,39.221489,37.601163,24.83,40.739441,1.49
2023-12-28 00:00:00-05:00,112.279999,31.379999,8.82,254.850006,7.454787,39.221489,37.601163,24.83,40.739441,1.49


## Compute Long and Short Signals
Using the generated indicator of highs and lows, create long and short signals using a breakout strategy. Implement `get_long_short` to generate the following signals:

| Signal | Condition |
|----|------|
| -1 | Low > Close Price |
| 1  | High < Close Price |
| 0  | Otherwise |

In this chart, **Close Price** is the `close` parameter. **Low** and **High** are the values generated from `get_high_lows_lookback`, the `lookback_high` and `lookback_low` parameters.

In [87]:
def get_long_short(close, lookback_high, lookback_low):
    """
    Generate the signals long, short, and do nothing.
    
    Parameters
    ----------
    close : DataFrame
        Close price for each ticker and date
    lookback_high : DataFrame
        Lookback high price for each ticker and date
    lookback_low : DataFrame
        Lookback low price for each ticker and date
    
    Returns
    -------
    long_short : DataFrame
        The long, short, and do nothing signals for each ticker and date
    """
    #TODO: Implement function
    
    long_short = pd.DataFrame(0, index=close.index, columns=close.columns)
    
    long_signals = close >= lookback_high
    
    short_signals = close <= lookback_low
    
    long_short[long_signals] = 1
    long_short[short_signals] = -1
    
    return long_short

In [89]:
signal = get_long_short(close, lookback_high, lookback_low)
print(signal)

Ticker                     AVAV  CLW  HLX  ISRG  MERC  NVDA  PEGA  PRCT  SLVM  \
Date                                                                            
2018-01-02 00:00:00-05:00     0    0    0     0     0     0     0     0     0   
2018-01-03 00:00:00-05:00     0    0    0     0     0     0     0     0     0   
2018-01-04 00:00:00-05:00     0    0    0     0     0     0     0     0     0   
2018-01-05 00:00:00-05:00     0    0    0     0     0     0     0     0     0   
2018-01-08 00:00:00-05:00     0    0    0     0     0     0     0     0     0   
...                         ...  ...  ...   ...   ...   ...   ...   ...   ...   
2023-12-22 00:00:00-05:00     0    0    0     0     0     0     0     0     0   
2023-12-26 00:00:00-05:00     0    0    0     1     0     0     0     0     0   
2023-12-27 00:00:00-05:00     0    0    0     0     0     0     0     0     0   
2023-12-28 00:00:00-05:00     0    0    0     1     0     0     0     0     0   
2023-12-29 00:00:00-05:00   

## Filter Signals
That was a lot of repeated signals! If we're already shorting a stock, having an additional signal to short a stock isn't helpful for this strategy. This also applies to additional long signals when the last signal was long.

Implement `filter_signals` to filter out repeated long or short signals within the `lookahead_days`. If the previous signal was the same, change the signal to `0` (do nothing signal). For example, say you have a single stock time series that is

`[1, 0, 1, 0, 1, 0, -1, -1]`

Running `filter_signals` with a lookahead of 3 days should turn those signals into

`[1, 0, 0, 0, 1, 0, -1, 0]`

To help you implement the function, we have provided you with the `clear_signals` function. This will remove all signals within a window after the last signal. For example, say you're using a windows size of 3 with `clear_signals`. It would turn the Series of long signals

`[0, 1, 0, 0, 1, 1, 0, 1, 0]`

into

`[0, 1, 0, 0, 0, 1, 0, 0, 0]`

`clear_signals` only takes a Series of the same type of signals, where `1` is the signal and `0` is no signal. It can't take a mix of long and short signals. Using this function, implement `filter_signals`. 

For implementing `filter_signals`, we don't reccommend you try to find a vectorized solution. Instead, you should use the [`iterrows`](https://pandas.pydata.org/pandas-docs/version/0.21/generated/pandas.DataFrame.iterrows.html) over each column.

In [110]:

def clear_signals(signals, window_size):
    """
    Clear out signals in a Series of just long or short signals.
    
    Remove the number of signals down to 1 within the window size time period.
    
    Parameters
    ----------
    signals : Pandas Series
        The long, short, or do nothing signals
    window_size : int
        The number of days to have a single signal       
    
    Returns
    -------
    signals : Pandas Series
        Signals with the signals removed from the window size
    """
   # Start with buffer of window size
    # This handles the edge case of calculating past_signal in the beginning
    clean_signals = [0]*window_size
    
    for signal_i, current_signal in enumerate(signals):
        # Check if there was a signal in the past window_size of days
        has_past_signal = bool(sum(clean_signals[signal_i:signal_i+window_size]))
        # Use the current signal if there's no past signal, else 0/False
        clean_signals.append(not has_past_signal and current_signal)
        
    # Remove buffer
    clean_signals = clean_signals[window_size:]

    # Return the signals as a Series of Ints
    return pd.Series(np.array(clean_signals).astype(int), signals.index)

In [112]:
def filter_signals(signal, lookahead_days):
    """
    Filter out signals in a DataFrame.
    
    Parameters
    ----------
    signal : DataFrame
        The long, short, and do nothing signals for each ticker and date
    lookahead_days : int
        The number of days to look ahead
    
    Returns
    -------
    filtered_signal : DataFrame
        The filtered long, short, and do nothing signals for each ticker and date
    """
    #TODO: Implement function
    
    filtered_signal = pd.DataFrame(index=signal.index, columns=signal.columns)
    
    for ticker in signal.columns:
        ticker_signals = signal[ticker]
        
        long_signals = ticker_signals.copy()
        short_signals = ticker_signals.copy()
        
        long_signals[long_signals != 1] = 0
        short_signals[short_signals != -1] = 0
        
        cleared_long_signals = clear_signals(long_signals, lookahead_days)
        cleared_short_signals = clear_signals(short_signals, lookahead_days)
        
        combined_signals = cleared_long_signals + cleared_short_signals
        filtered_signal[ticker] = combined_signals

    return filtered_signal

In [116]:
signal_5 = filter_signals(signal, 5)
signal_10 = filter_signals(signal, 10)
signal_20 = filter_signals(signal, 20)
signal_5
signal_10
signal_20

Ticker,AVAV,CLW,HLX,ISRG,MERC,NVDA,PEGA,PRCT,SLVM,SOUN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-02 00:00:00-05:00,0,0,0,0,0,0,0,0,0,0
2018-01-03 00:00:00-05:00,0,0,0,0,0,0,0,0,0,0
2018-01-04 00:00:00-05:00,0,0,0,0,0,0,0,0,0,0
2018-01-05 00:00:00-05:00,0,0,0,0,0,0,0,0,0,0
2018-01-08 00:00:00-05:00,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2023-12-22 00:00:00-05:00,0,0,0,0,0,0,0,0,0,0
2023-12-26 00:00:00-05:00,0,0,0,1,0,0,0,0,0,0
2023-12-27 00:00:00-05:00,0,0,0,0,0,0,0,0,0,0
2023-12-28 00:00:00-05:00,0,0,0,0,0,0,0,0,0,0


In [233]:
def get_lookahead_prices(close, lookahead_days):
    """
    Get the lookahead prices for `lookahead_days` number of days.
    
    Parameters
    ----------
    close : DataFrame
        Close price for each ticker and date
    lookahead_days : int
        The number of days to look ahead
    
    Returns
    -------
    lookahead_prices : DataFrame
        The lookahead prices for each ticker and date
    """
    #TODO: Implement function
    
    return close.shift(-lookahead_days)

In [235]:
lookahead_5 = get_lookahead_prices(close, 5)
lookahead_10 = get_lookahead_prices(close, 10)
lookahead_20 = get_lookahead_prices(close, 20)

In [237]:
def get_return_lookahead(close, lookahead_prices):
    """
    Calculate the log returns from the lookahead days to the signal day.
    
    Parameters
    ----------
    close : DataFrame
        Close price for each ticker and date
    lookahead_prices : DataFrame
        The lookahead prices for each ticker and date
    
    Returns
    -------
    lookahead_returns : DataFrame
        The lookahead log returns for each ticker and date
    """
    #TODO: Implement function
    
    return np.log(lookahead_prices / close)

In [239]:
price_return_5 = get_return_lookahead(close, lookahead_5)
price_return_10 = get_return_lookahead(close, lookahead_10)
price_return_20 = get_return_lookahead(close, lookahead_20)

In [241]:
def get_signal_return(signal, lookahead_returns):
    """
    Compute the signal returns.
    
    Parameters
    ----------
    signal : DataFrame
        The long, short, and do nothing signals for each ticker and date
    lookahead_returns : DataFrame
        The lookahead log returns for each ticker and date
    
    Returns
    -------
    signal_return : DataFrame
        Signal returns for each ticker and date
    """
    #TODO: Implement function
    
    return signal * lookahead_returns

In [243]:
title_string = '{} day LookaheadSignal Returns for {} Stock'
signal_return_5 = get_signal_return(signal_5, price_return_5)
signal_return_10 = get_signal_return(signal_10, price_return_10)
signal_return_20 = get_signal_return(signal_20, price_return_20)

NameError: name 'signal_5' is not defined