In [18]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [24]:
# Define the tickers for the companies in the Dow Jones Industrial Average
tickers = ['AAPL', 'AXP', 'BA', 'CAT', 'CSCO', 'CVX', 'DD', 'DIS', 'GE', 'GS', 'HD', 'IBM', 'INTC', 'JNJ', 'JPM', 'KO', 'MCD', 'MMM', 'MRK', 'MSFT', 'NKE', 'PFE', 'PG', 'TRV', 'UNH', 'V', 'VZ', 'WMT', 'XOM']

# Download stock data from Yahoo Finance
stocks = yf.download(tickers, start="2010-01-01", end="2022-12-31")

returns = stocks['Adj Close'].pct_change().dropna()

# Extract volume data
volume = stocks['Volume']

# Use the returns data and volume data to create the DataFrame for the analysis
data_returns = returns.stack().reset_index().rename(columns={'level_0': 'Date', 'level_1': 'Ticker', 0: 'Return'})
data_volume = volume.stack().reset_index().rename(columns={'level_0': 'Date', 'level_1': 'Ticker', 0: 'Volume'})

# Merge returns and volume data
data = pd.merge(data_returns, data_volume, on=['Date', 'Ticker'])


[*********************100%***********************]  29 of 29 completed


In [20]:
data.head()

Unnamed: 0,Date,Ticker,Return,Volume
0,2010-01-05,AAPL,0.001729,601904800
1,2010-01-05,AXP,-0.002199,10641200
2,2010-01-05,BA,0.032752,8867800
3,2010-01-05,CAT,0.011956,5697200
4,2010-01-05,CSCO,-0.004455,45124500


In [21]:
# Preprocess the data
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values(['Ticker', 'Date'])

# Create lagged features for stock returns and trading volume
for lag in range(1, 11):
    data[f'Return_Lag_{lag}'] = data.groupby('Ticker')['Return'].shift(lag)
    data[f'Volume_Lag_{lag}'] = data.groupby('Ticker')['Volume'].shift(lag)

# Drop rows with missing values
data = data.dropna()

# Split the data into training and test sets (2010-2021)
train_data = data[data['Date'].dt.year < 2022]
X_train = train_data.drop(['Ticker', 'Date', 'Return'], axis=1)
y_train = train_data['Return']

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the holdout set (2022)
holdout_data = data[data['Date'].dt.year == 2022]
X_holdout = holdout_data.drop(['Ticker', 'Date', 'Return'], axis=1)
y_holdout = holdout_data['Return']
X_holdout_scaled = scaler.transform(X_holdout)

In [22]:

# Linear regression
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
mse_lr = mean_squared_error(y_test, y_pred_lr)

# Ridge regression
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_test_scaled)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

# Lasso regression
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_test_scaled)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

# xgb
xgb = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
xgb.fit(X_train_scaled, y_train)
y_pred_xgb = xgb.predict(X_test_scaled)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)


print("MSE for Linear Regression:", mse_lr)
print("MSE for Ridge Regression:", mse_ridge)
print("MSE for Lasso Regression:", mse_lasso)
print("Mean Squared Error:", mse_xgb)



MSE for Linear Regression: 0.00024101244847487714
MSE for Ridge Regression: 0.00024101244306282715
MSE for Lasso Regression: 0.0002460429973577232
Mean Squared Error: 0.00022587775668707295
