# Imports

In [1]:
# Provides ways to work with large multidimensional arrays
import numpy as np 
# Allows for further data manipulation and analysis
import pandas as pd
from pandas_datareader import data as web # Reads stock data 
import matplotlib.pyplot as plt # Plotting
import matplotlib.dates as mdates # Styling dates
%matplotlib inline

import datetime as dt # For defining dates
import mplfinance as mpf # Matplotlib finance

import time

# Used to get data from a directory
import os
from os import listdir
from os.path import isfile, join

#Statsmodels is a great library we can use to run regressions.
import statsmodels.api as sm
# Seaborn extends the capabilities of Matplotlib
import seaborn as sns
# Used for calculating regressions
from statsmodels.tsa.ar_model import AutoReg, ar_select_order

# Dates & Other Constants

In [2]:
PATH = "./Stocks/"

# Start date defaults
S_YEAR = 2017
S_MONTH = 1
S_DAY = 3
S_DATE_STR = "2017-01-03"
S_DATE_DATETIME = dt.datetime(S_YEAR, S_MONTH, S_DAY)

# End date defaults
E_YEAR = 2021
E_MONTH = 8
E_DAY = 19
E_DATE_STR = "2021-10-29"
E_DATE_DATETIME = dt.datetime(E_YEAR, E_MONTH, E_DAY)

# Get Stock File Names in a List

In [3]:
files = [x for x in listdir(PATH) if isfile(join(PATH, x))]
tickers = [os.path.splitext(x)[0] for x in files]
len(tickers) # 2886 total stocks

3029

# Function that Returns a Dataframe from a CSV

In [4]:
def get_df_from_csv(folder, ticker):
    try:
        df = pd.read_csv(folder + ticker + '.csv', 
                         index_col='Date', 
                         parse_dates=True)
    except FileNotFoundError:
        pass
        # print("File Doesn't Exist")
    else:
        return df

# Save Dataframe to CSV

In [5]:
def save_dataframe_to_csv(df, folder, ticker):
    df.to_csv(folder + ticker + '.csv')

# Add Daily Return to Dataframe

In [6]:
# We calculate a percentage rate of return for each day to compare investments.
# Simple Rate of Return = (End Price - Beginning Price) / Beginning Price OR (EP / BP) - 1

# Shift provides the value from the previous day
# NaN is displayed because there was no previous day price for the 1st calculation
def add_daily_return_to_df(df, ticker):
    df['daily_return'] = (df['Adj Close'] / df['Adj Close'].shift(1)) - 1
    # Save data to a CSV file
    df.to_csv(PATH + ticker + '.csv')
    return df

# Regression Time Series Example with AutoReg

In [7]:
def calc_projected_roi(ticker):
    a_df = get_df_from_csv(PATH, ticker) 

    a_df = a_df.asfreq('d') # Change frequency to day
    a_df.index # Check frequency
    a_df = a_df.fillna(method='ffill') # Fill missing values

    # Delete unnamed column
    a_df.drop(a_df.columns[a_df.columns.str.contains('unnamed',case = False)],
          axis = 1, inplace = True)

    # Delete daily return column
    a_df = a_df.drop(['daily_return'], axis=1)
    
    # Figure out optimum lags which will be 1 or 2 for this data set
    lags = ar_select_order(a_df, maxlag=30)

    # Create our model using whole data set
    model = AutoReg(a_df['Adj Close'], lags.ar_lags)
    model_fit = model.fit()

    # Define training and testing area
    train_df = a_df.iloc[50:1369] # 80% minus 1st 50
    test_df = a_df.iloc[1369:] # Last 20%

    # Define training model for 500 days (Play with Number & Test)
    # and White's covariance estimator
    train_model = AutoReg(a_df['Adj Close'], 500).fit(cov_type="HC0")

    # Define start and end for prediction 
    start = len(train_df)
    end = len(train_df) + len(test_df) - 1

    prediction = train_model.predict(start=start, end=end, dynamic=True)

    # Predict 160 days into the future
    forecast = train_model.predict(start=end, end=end+60, dynamic=True)

    # Get starting price of prediction
    s_price = forecast.head(1).iloc[0]

    # Get the last price of prediction
    e_price = forecast.iloc[-1]

    # Get return over prediction
    return (e_price - s_price) / s_price

In [8]:
print("Projected ROI in Next 160 days :", calc_projected_roi("AAPL")) # For Apple Stock Data

Projected ROI in Next 160 days : 0.03655617012954882


In [9]:
def get_proj_rois():
    # Will hold all tickers & stock rois
    ticker = []
    roi = []
    
    for x in tickers:
        try:
            the_roi = calc_projected_roi(x)
        except Exception as ex:
            print(f"{x} : Stock Data Corrupted")
        else:
            ticker.append(x)
            roi.append(the_roi)
        
    return pd.DataFrame({'Ticker':ticker, 'ROI':roi})

In [10]:
'''proj_roi_df = get_proj_rois()
proj_roi_df'''
# The Reason for Corrupted Data is that these Stocks are Newer and Do Not have Stock Data starting from 2017
# As we defined our starting data as 2017-01-03 it gives Data Corrupted Error.

proj_roi_df = pd.read_csv("projected_stock_roi.csv")

In [11]:
# As Calsulating ROI is a very CPU intensive and a Time Consuming Process 
# Saving the ROI Data in csv file
'''proj_roi_df.to_csv("./projected_stock_roi.csv", index=None)'''

# Top 20 Stoks that would have hightest ROI as Per Our Prediction
proj_roi_df.sort_values(by="ROI", ascending=False).reset_index(drop=True)[:20]

Unnamed: 0,Ticker,ROI
0,MIC,21.955019
1,CPSH,6.198566
2,GME,4.787613
3,CEI,4.627012
4,CYTR,3.153769
5,CCXI,1.531457
6,MSTR,1.508713
7,SPWR,1.443123
8,AEHR,1.386954
9,APPS,1.241436
