In [1]:
from pandas_datareader import data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from numpy.linalg import inv
import math

In [2]:
# Two telecommunication companies, two tech companies, and Walmart
expected_return = []
tickers = ['CMCSA','T', 'AAPL', 'MSFT', 'WMT']


start_date = '2014-01-01'
end_date = '2014-12-31'
for ticker in tickers:
    unit_cost = data.DataReader(ticker, 'yahoo', start_date)['Open'][0]
    adj_close = data.DataReader(ticker, 'yahoo', end_date)['Adj Close'][0]
    ticker_return = ((adj_close - unit_cost) / unit_cost)*100
    expected_return.append(ticker_return)
    
start_date = '2015-01-01'
end_date = '2015-12-31'
for ticker in tickers:
    unit_cost = data.DataReader(ticker, 'yahoo', start_date)['Open'][0]
    adj_close = data.DataReader(ticker, 'yahoo', end_date)['Adj Close'][0]
    ticker_return =  ((adj_close - unit_cost) / unit_cost)*100
    expected_return.append(ticker_return)
    
start_date = '2016-01-01'
end_date = '2016-12-30'
for ticker in tickers:
    unit_cost = data.DataReader(ticker, 'yahoo', start_date)['Open'][0]
    adj_close = data.DataReader(ticker, 'yahoo', end_date)['Adj Close'][0]
    ticker_return =  ((adj_close - unit_cost) / unit_cost)*100
    expected_return.append(ticker_return)
    
    
start_date = '2017-01-01'
end_date = '2017-12-29'
for ticker in tickers:
    unit_cost = data.DataReader(ticker, 'yahoo', start_date)['Open'][0]
    adj_close = data.DataReader(ticker, 'yahoo', end_date)['Adj Close'][0]
    ticker_return =  ((adj_close - unit_cost) / unit_cost)*100
    expected_return.append(ticker_return)
    
start_date = '2018-01-01'
end_date = '2018-12-31'
for ticker in tickers:
    unit_cost = data.DataReader(ticker, 'yahoo', start_date)['Open'][0]
    adj_close = data.DataReader(ticker, 'yahoo', end_date)['Adj Close'][0]
    ticker_return =  ((adj_close - unit_cost) / unit_cost)*100
    expected_return.append(ticker_return)

Here expected return for each stock is calculated throughout years 2014 to 2018. To see results, the last line in the following cell may be uncommented.

In [3]:
expected_return = np.array(expected_return)
years = [];
for i in range(2014, 2019):
    years.append(i)

returns_df = pd.DataFrame(expected_return.reshape(5,5), columns = tickers)
returns_df['year'] = years
#display(returns_df)

Unnamed: 0,CMCSA,T,AAPL,MSFT,WMT,year
0,1.705207,-28.38881,27.456902,11.344808,-3.731348,2014
1,-10.146865,-18.620589,-11.904417,9.353252,-35.554956,2015
2,18.108101,4.171407,7.569124,8.095585,6.627371,2016
3,9.813918,-20.169875,41.511218,31.603032,36.547809,2017
4,-18.278692,-32.165617,-8.887455,15.891587,-8.055473,2018


In the following cell we create a list of stocks and compare each one to their data from 2018 to 2014. The data files are read in and stored in a list which is used in the first outer loop to get the significant features for comparison. The outer loop iterates through each year. The inner loop takes each stock from the selected year and adds it to its own list. It then takes the appropriate Expected Return values (calculated above) and appends them to each stock as a new column. Lastly, we convert these lists to dataframes and store the dataframes in another list titled datatables. To see results, the last two lines in the following cell may be uncommented.

In [23]:
# Read in all datasets
df2018 = pd.read_csv("2018_Financial_Data.csv")
df2017 = pd.read_csv("2017_Financial_Data.csv")
df2016 = pd.read_csv("2016_Financial_Data.csv")
df2015 = pd.read_csv("2015_Financial_Data.csv")
df2014 = pd.read_csv("2014_Financial_Data.csv")

# Insert data into list for iteration and initialize other lists
datatables = []
listtotal = []
list_stocks = []
list_years = [df2018, df2017, df2016, df2015, df2014]
for i in range(len(list_years)):
    
    # Clear list_stocks for the upcoming year
    if len(list_stocks)!=0:
        list_stocks.clear()
        
    # Modify data to get desired amount of features
    financial_data = pd.DataFrame(list_years[i], columns = ['Symbol', 'Revenue', 'Revenue Growth', 'Gross Profit', 
                                                  'Operating Income', 'Earnings before Tax', 'Free Cash Flow', 
                                                  'Net Income', 'Total current assets',
                                                  'Operating Expenses',  'Net Debt', 'Short-term debt', 'Long-term debt', 
                                                  'Total shareholders equity', 'Weighted Average Shs Out', 
                                                  'Total current liabilities', 'Total debt', 'Total liabilities']) 
    
    for j in range(len(tickers)):
        list_stocks.append(pd.DataFrame(financial_data.loc[financial_data['Symbol'] == tickers[j], :]))
        
        if i == 0:
            # Add stocks to their own list
            listtotal.append(list_stocks[j])
            
        elif i == (len(list_years)-1):
            listtotal[j] = listtotal[j].append(list_stocks[j]) # Append stock data together throughout 5 years
            
            # Add on Expected Return column to each stock
            listtotal[j]['Expected Return'] = [expected_return[j],expected_return[j+len(tickers)], 
                                            expected_return[j+(2*len(tickers))], expected_return[j+(3*len(tickers))],
                                            expected_return[j+(4*len(tickers))]]
            
            datatables.append(pd.DataFrame(listtotal[j]))
            
        else:
            listtotal[j] = listtotal[j].append(list_stocks[j])
            
#for k in range(0, len(datatables)):
    #display(datatables[k])

In this cell we use multiple regression on our results. First we find the X and Y matrices for the normal equation, then we calculate the parameters based of of that. Finally, we use the training data and targets to find the Root-Mean-Square Error (RMSE) for the model we have. This tells us how far from the regression line the data points are. Basically, having the RMSE for each stock shows us the prediction errors.

In [24]:
# Create an array of the tables above
#dataTables = [s1_total, s2_total, s3_total, s4_total, s5_total]

# Loop through each of the tables and calculate the pararmeters for multiple linear regression 
# After calculating the parameters, test the regression and calcualte RMSE
for i in range(0, len(datatables)):
    
    # Find what are the X and Y matrices for normal equation
    x = datatables[i].drop('Symbol',1)
    x = x.drop('Expected Return', 1).to_numpy()
    y = datatables[i]['Expected Return'].to_numpy()
    
    # Calculate the parameters
    XTX = x.transpose().dot(x)
    XTX_inverse = inv(np.matrix(XTX))
    XTX_inverse_X = XTX_inverse.dot(x.transpose())
    param_theta = XTX_inverse_X.dot(y)
    

    # Find RMSE for the model using the training data and targets we have
    error_sum = 0
    for j in range (0,5):
        y_pred = param_theta.dot(x[j])
        #print(y_pred)
        pred_difference = y_pred - returns_df[tickers[i]][j]
        error_sum = error_sum + ((pred_difference**2) / 5)
    RMSE = math.sqrt(error_sum)
    print("RMSE for ", tickers[i], " is ", RMSE) #print the RMSE

RMSE for  CMCSA  is  2.3027827207312885
RMSE for  T  is  105.51891347566765
RMSE for  AAPL  is  129.46772053770098
RMSE for  MSFT  is  437.5444310631503
RMSE for  WMT  is  15.728002433485424
