In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

pd.set_option('display.width', 200)

raw_qqq_data = pd.read_csv('QQQ-10Yr_History.csv')


In [24]:
print(raw_qqq_data.head(1))
print(raw_qqq_data.tail(1))


print("columns with missing values")
print(raw_qqq_data.columns[raw_qqq_data.isnull().any()])
print("rows with missing values")
print(raw_qqq_data[raw_qqq_data.isnull().any(axis=1)])

         Date    Open    High     Low   Close  Adj. Close Change    Volume
0  2015-04-21  108.22  108.41  107.93  108.06      99.925  0.43%  22759772
            Date    Open    High     Low  Close  Adj. Close  Change    Volume
2514  2025-04-17  447.17  447.75  441.36  444.1       444.1  -0.02%  44837047
columns with missing values
Index(['Change'], dtype='object')
rows with missing values
            Date     Open     High      Low   Close  Adj. Close Change    Volume
18    2015-05-15  109.860  110.010  109.310  109.58     101.330    NaN  23937442
255   2016-04-25  108.620  108.990  108.450  108.98     101.874    NaN  14351890
290   2016-06-14  107.780  108.415  107.240  108.03     100.986    NaN  24588506
1381  2020-10-13  296.340  297.050  293.110  294.52     286.334    NaN  64240031
1388  2020-10-22  285.050  285.980  280.820  284.18     276.281    NaN  32196162
1410  2020-11-23  291.410  292.750  288.070  290.39     282.318    NaN  26875588
1435  2020-12-30  314.160  314.490  312.

In [None]:

def prepare_data(df):
    X = df[['Open', 'High', 'Low', 'Close', 'Volume']].values[:-1]
    
    y = df[['Open', 'High', 'Low', 'Close']].values[1:]
    
    return X, y

# Prepare the data
X, y = prepare_data(raw_qqq_data)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Performance Metrics:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.4f}")

feature_names = ['Open', 'High', 'Low', 'Close', 'Volume']
target_names = ['Next Open', 'Next High', 'Next Low', 'Next Close']

coef_df = pd.DataFrame(
    model.coef_,
    columns=feature_names,
    index=target_names
)

print("\nModel Coefficients:")
print(coef_df)

print("\nModel Intercepts:")
for target, intercept in zip(target_names, model.intercept_):
    print(f"{target}: {intercept:.4f}")


random_indices = np.random.choice(len(X_test), 5, replace=False)
print("\nRandom Sample Predictions vs Actual:")
for idx in random_indices:
    test_input = X_test[idx]
    actual = y_test[idx]
    pred = model.predict(test_input.reshape(1, -1))[0]
    
    print(f"\nInput Data:")
    for name, value in zip(feature_names, test_input):
        print(f"{name}: {value:.2f}")
    
    print("\nPredictions vs Actual:")
    for name, p, a in zip(target_names, pred, actual):
        print(f"{name}: Predicted={p:.2f}, Actual={a:.2f}, Error={((p-a)/a)*100:.2f}%")


mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f"\nMean Absolute Percentage Error: {mape:.2f}%")


Model Performance Metrics:
Mean Squared Error: 8.68
R-squared Score: 0.9994

Model Coefficients:
                Open      High       Low     Close        Volume
Next Open  -0.221568  0.181322  0.284984  0.756728 -6.116027e-11
Next High  -0.140487  0.530240  0.018930  0.595034 -1.943179e-10
Next Low   -0.138404 -0.141587  0.530808  0.746315 -8.904775e-11
Next Close -0.171032  0.286166  0.310594  0.574455 -1.139337e-10

Model Intercepts:
Next Open: -0.0309
Next High: 0.0945
Next Low: 0.0974
Next Close: 0.1883

Random Sample Predictions vs Actual:

Input Data:
Open: 107.49
High: 107.72
Low: 106.74
Close: 107.07
Volume: 34809920.00

Predictions vs Actual:
Next Open: Predicted=107.12, Actual=108.13, Error=-0.93%
Next High: Predicted=107.84, Actual=108.35, Error=-0.47%
Next Low: Predicted=106.53, Actual=107.42, Error=-0.83%
Next Close: Predicted=107.29, Actual=107.92, Error=-0.59%

Input Data:
Open: 143.11
High: 144.89
Low: 143.00
Close: 144.65
Volume: 33642716.00

Predictions vs Actual:
N