In [2]:
from ucimlrepo import fetch_ucirepo

import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.mode.chained_assignment = None
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor

In [10]:
# Fetch dataset id 312 through the ucimlrepo client.
dow_jones_index = fetch_ucirepo(id=312) 
  
# Feature table of the fetched dataset, as a pandas DataFrame.
df = dow_jones_index.data.features

In [11]:
# Display the raw feature table.
df

Unnamed: 0,quarter,stock,date,open,high,low,close,volume,percent_change_price,percent_change_volume_over_last_wk,previous_weeks_volume,next_weeks_open,next_weeks_close,days_to_next_dividend,percent_return_next_dividend
0,1,AA,1/7/2011,$15.82,$16.72,$15.78,$16.42,239655616,3.79267,,,$16.71,$15.97,26,0.182704
1,1,AA,1/14/2011,$16.71,$16.71,$15.64,$15.97,242963398,-4.42849,1.380223,239655616.0,$16.19,$15.79,19,0.187852
2,1,AA,1/21/2011,$16.19,$16.38,$15.60,$15.79,138428495,-2.47066,-43.024959,242963398.0,$15.87,$16.13,12,0.189994
3,1,AA,1/28/2011,$15.87,$16.63,$15.82,$16.13,151379173,1.63831,9.355500,138428495.0,$16.18,$17.14,5,0.185989
4,1,AA,2/4/2011,$16.18,$17.39,$16.18,$17.14,154387761,5.93325,1.987452,151379173.0,$17.33,$17.37,97,0.175029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,2,XOM,5/27/2011,$80.22,$82.63,$80.07,$82.63,68230855,3.00424,-21.355713,86758820.0,$83.28,$81.18,75,0.568801
746,2,XOM,6/3/2011,$83.28,$83.75,$80.18,$81.18,78616295,-2.52161,15.221032,68230855.0,$80.93,$79.78,68,0.578960
747,2,XOM,6/10/2011,$80.93,$81.87,$79.72,$79.78,92380844,-1.42098,17.508519,78616295.0,$80.00,$79.02,61,0.589120
748,2,XOM,6/17/2011,$80.00,$80.82,$78.33,$79.02,100521400,-1.22500,8.811952,92380844.0,$78.65,$76.78,54,0.594786


In [18]:
# Columns kept for modelling; `percent_return_next_dividend` is the regression target.
attributes_of_interest = ['quarter', 'stock', 'open', 'high', 'low', 'close', 'volume', 'percent_return_next_dividend']
# Take an explicit copy: later cells overwrite columns of `df_filter`, and writing into
# a selection of `df` is what triggers pandas' chained-assignment warning.
df_filter = df[attributes_of_interest].copy()
df_filter.head()

Unnamed: 0,quarter,stock,open,high,low,close,volume,percent_return_next_dividend
0,1,AA,$15.82,$16.72,$15.78,$16.42,239655616,0.182704
1,1,AA,$16.71,$16.71,$15.64,$15.97,242963398,0.187852
2,1,AA,$16.19,$16.38,$15.60,$15.79,138428495,0.189994
3,1,AA,$15.87,$16.63,$15.82,$16.13,151379173,0.185989
4,1,AA,$16.18,$17.39,$16.18,$17.14,154387761,0.175029


In [19]:
# Check dtypes and non-null counts of the selected columns.
df_filter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   quarter                       750 non-null    int64  
 1   stock                         750 non-null    object 
 2   open                          750 non-null    object 
 3   high                          750 non-null    object 
 4   low                           750 non-null    object 
 5   close                         750 non-null    object 
 6   volume                        750 non-null    int64  
 7   percent_return_next_dividend  750 non-null    float64
dtypes: float64(1), int64(2), object(5)
memory usage: 47.0+ KB


In [20]:
# Price columns are stored as strings such as "$15.82": strip the currency symbol and
# convert them to floats.
for attribute in ['open', 'high', 'low', 'close']:
  # regex=False matches '$' literally; as a regular expression '$' is the
  # end-of-string anchor, so the result would depend on the pandas version's default.
  df_filter[attribute] = df_filter[attribute].str.replace('$', '', regex=False).astype(float)

  


In [21]:
# Replace each ticker symbol by an integer code. The encoder is fitted on the
# distinct symbols and kept in `le` so the codes can be mapped back later.
stocks = df_filter['stock'].unique()
le = LabelEncoder().fit(stocks)
df_filter['stock'] = le.transform(df_filter['stock'])

In [22]:
# Preview the cleaned table: prices are numeric and tickers are integer-coded.
df_filter.head()

Unnamed: 0,quarter,stock,open,high,low,close,volume,percent_return_next_dividend
0,1,0,15.82,16.72,15.78,16.42,239655616,0.182704
1,1,0,16.71,16.71,15.64,15.97,242963398,0.187852
2,1,0,16.19,16.38,15.6,15.79,138428495,0.189994
3,1,0,15.87,16.63,15.82,16.13,151379173,0.185989
4,1,0,16.18,17.39,16.18,17.14,154387761,0.175029


In [23]:
# Re-check dtypes after cleaning; no object columns should remain.
df_filter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   quarter                       750 non-null    int64  
 1   stock                         750 non-null    int32  
 2   open                          750 non-null    float64
 3   high                          750 non-null    float64
 4   low                           750 non-null    float64
 5   close                         750 non-null    float64
 6   volume                        750 non-null    int64  
 7   percent_return_next_dividend  750 non-null    float64
dtypes: float64(5), int32(1), int64(2)
memory usage: 44.1 KB


In [24]:
# Hold-out split by quarter: quarter 1 rows are used for training, quarter 2 rows for testing.
is_first_quarter = df_filter['quarter'] == 1
is_second_quarter = df_filter['quarter'] == 2
df_train = df_filter.loc[is_first_quarter]
df_test = df_filter.loc[is_second_quarter]

In [25]:
# Separate the regression target from the predictors in each split.
target_column = 'percent_return_next_dividend'

# Rebind instead of dropping with inplace=True: df_train / df_test are filtered
# selections of df_filter, and mutating a selection in place is what pandas'
# chained-assignment warning guards against.
labels_train = df_train[target_column].copy()
df_train = df_train.drop(columns=[target_column])

labels_test = df_test[target_column].copy()
df_test = df_test.drop(columns=[target_column])

print(df_train.shape, labels_train.shape)
print(df_test.shape, labels_test.shape)

(360, 7) (360,)
(390, 7) (390,)


In [26]:
# Standardise the predictors. The scaler learns its mean and standard deviation from
# the training split only; the test split is transformed with those same statistics.
# Re-fitting on the test split would scale the two splits differently and leak
# test-set information into preprocessing.
scaler = StandardScaler()
x_train = scaler.fit_transform(df_train)
x_test = scaler.transform(df_test)

In [27]:
# Fixed seed so the stochastic estimators give the same results on every run.
RANDOM_STATE = 42

# Candidate models. All use default hyper-parameters except the MLP, which has a single
# hidden layer of 1000 units. LinearRegression takes no random_state argument.
regressors = [LinearRegression(),
              DecisionTreeRegressor(random_state=RANDOM_STATE),
              RandomForestRegressor(random_state=RANDOM_STATE),
              GradientBoostingRegressor(random_state=RANDOM_STATE),
              ExtraTreesRegressor(random_state=RANDOM_STATE),
              MLPRegressor(hidden_layer_sizes=(1000,), random_state=RANDOM_STATE)]

# Display names, in the same order as `regressors`.
regressors_names = ['LinearReg',
              'DecisionTreeReg',
              'RandomForestReg',
              'GradientBoostingReg',
              'ExtraTreesReg',
              'MLPReg']

$$MSE = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2$$

In [28]:
# Mean Squared Error
def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared differences between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``mean((y_true - y_pred) ** 2)``.
    """
    # Coerce to plain arrays so lists are accepted and pandas objects are compared
    # by position instead of being aligned on their index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) ** 2)

$$MAE = \frac{1}{n} \sum_{i=1}^{n} |y_i - \hat{y}_i|$$

In [29]:
# Mean Absolute Error
def mean_absolute_error(y_true, y_pred):
    """Return the mean of the absolute differences between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred|)``.
    """
    # Coerce to plain arrays so lists are accepted and pandas objects are compared
    # by position instead of being aligned on their index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))

$$RAE = \frac{\sum_{i=1}^{n} |y_i - \hat{y}_i|}{\sum_{i=1}^{n} |y_i - \bar{y}|}$$

In [30]:
# Relative Absolute Error
def relative_absolute_error(y_true, y_pred):
    """Return the total absolute error relative to that of predicting the mean.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``sum(|y_true - y_pred|) / sum(|y_true - mean(y_true)|)``. The result is
        not finite when ``y_true`` is constant, because the denominator is zero.
    """
    # Coerce to plain arrays so lists are accepted and pandas objects are compared
    # by position instead of being aligned on their index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.sum(np.abs(y_true - y_pred))
    denominator = np.sum(np.abs(y_true - np.mean(y_true)))
    return numerator / denominator

$$RSE = \sqrt{\frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{n - 2}}$$

In [31]:
# Residual Standard Error
def residual_standard_error(y_true, y_pred, n_params=2):
    """Return the residual standard error ``sqrt(RSS / (n - n_params))``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.
    n_params : int, default 2
        Number of fitted parameters subtracted from the sample size to obtain
        the residual degrees of freedom. The default keeps the original
        ``n - 2`` behaviour.

    Returns
    -------
    float
        Square root of the residual sum of squares divided by the degrees of
        freedom. The result is not finite when ``len(y_true) <= n_params``.
    """
    # Coerce to plain arrays so lists are accepted and pandas objects are compared
    # by position instead of being aligned on their index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    residual_sum_of_squares = np.sum((y_true - y_pred) ** 2)
    degrees_of_freedom = len(y_true) - n_params
    return np.sqrt(residual_sum_of_squares / degrees_of_freedom)

$$MAPE = \frac{1}{n} \sum_{i=1}^{n} \left|\frac{y_i - \hat{y}_i}{y_i}\right| \times 100\%$$

In [32]:
# Mean absolute percentage error
def mean_absolute_percentage_error(y, y_hat):
    """Return the mean absolute error relative to the targets, as a percentage.

    Parameters
    ----------
    y : array-like
        Observed target values. Any zero entry makes the result non-finite,
        because each error is divided by its target.
    y_hat : array-like
        Predicted values, in the same order as ``y``.

    Returns
    -------
    float
        ``mean(|(y - y_hat) / y|) * 100``.
    """
    # Coerce to plain arrays so lists are accepted and pandas objects are compared
    # by position instead of being aligned on their index labels.
    y = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_hat, dtype=float)
    mape = np.mean(np.abs((y - y_hat) / y)) * 100
    return mape

$$R = \frac{\sum_{i=1}^{n} (y_i - \bar{y})(\hat{y}_i - \bar{\hat{y}})}{\sqrt{\sum_{i=1}^{n} (y_i - \bar{y})^2 \sum_{i=1}^{n} (\hat{y}_i - \bar{\hat{y}})^2}}$$

In [33]:
# correlation coefficient
def correlation_coefficient(y, y_hat):
    """Return the Pearson correlation between targets and predictions.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_hat : array-like
        Predicted values, in the same order as ``y``.

    Returns
    -------
    float
        Sum of the products of the centred values divided by the square root
        of the product of their sums of squares. The result is not finite when
        either input is constant, because the denominator is zero.
    """
    # Coerce to plain arrays so lists are accepted and pandas objects are compared
    # by position instead of being aligned on their index labels.
    y = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_hat, dtype=float)
    y_centered = y - np.mean(y)
    y_hat_centered = y_hat - np.mean(y_hat)
    numerator = np.sum(y_centered * y_hat_centered)
    denominator = np.sqrt(np.sum(y_centered ** 2) * np.sum(y_hat_centered ** 2))
    r = numerator / denominator
    return r

$$R^2 = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2}$$

In [34]:
# R-squared
def r_squared(y_true, y_pred):
    """Return the coefficient of determination of the predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``1 - SS_res / SS_tot``, where ``SS_res`` is the residual sum of squares
        and ``SS_tot`` the sum of squared deviations of ``y_true`` from its mean.
        The result is not finite when ``y_true`` is constant (``SS_tot == 0``).
    """
    # Coerce to plain arrays so lists are accepted and pandas objects are compared
    # by position instead of being aligned on their index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - (ss_res / ss_tot)