In [1]:
!pip install yfinance[nospam,repair]



# Linear Regression Model on 1 Month of SPY Data

## Data Set-Up

In [17]:
# importing dependencies
import yfinance as yf
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression
import datetime as dt

In [18]:
# Download daily SPY price history for the month leading up to the run date (1-18-2024)
data = yf.download(
    "SPY",
    start="2023-12-18",
    end="2024-1-18",
)

[*********************100%%**********************]  1 of 1 completed


In [19]:
# Keep only the closing price; the Date index comes along with it
close_data = data.loc[:, ["Close"]]

In [20]:
# Verifying data within dataframe
# (a bare trailing expression makes the notebook render the frame as the cell output)
close_data

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2023-12-18,471.970001
2023-12-19,474.839996
2023-12-20,468.26001
2023-12-21,472.700012
2023-12-22,473.649994
2023-12-26,475.649994
2023-12-27,476.51001
2023-12-28,476.690002
2023-12-29,475.309998
2024-01-02,472.649994


In [21]:
# Preview the first five rows of the closing-price frame
close_data.head(n=5)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2023-12-18,471.970001
2023-12-19,474.839996
2023-12-20,468.26001
2023-12-21,472.700012
2023-12-22,473.649994


In [22]:
# Resetting index to manipulate date column.
# Built from `data` rather than from `close_data` itself so that re-running this
# cell always yields the same Date/Close columns instead of stacking an extra
# "index" column on every run.
close_data = data[["Close"]].reset_index()

In [23]:
# Verifying datatype of date
# (info() prints each column's dtype and non-null count; Date should be datetime64)
close_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    20 non-null     datetime64[ns]
 1   Close   20 non-null     float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 452.0 bytes


In [24]:
# Saving earliest date for calculations.
# .iloc[0] takes the first row by position, so it does not depend on the index
# labels happening to start at 0 (assumes the rows are in chronological order,
# as downloaded).
start_date_1m = close_data["Date"].iloc[0]
print(start_date_1m)

2023-12-18 00:00:00


In [25]:
# Add dayspost121823: the time elapsed between each row's date and start_date_1m
# (a timedelta at this point)
close_data["dayspost121823"] = close_data["Date"].sub(start_date_1m)
close_data

Unnamed: 0,Date,Close,dayspost121823
0,2023-12-18,471.970001,0 days
1,2023-12-19,474.839996,1 days
2,2023-12-20,468.26001,2 days
3,2023-12-21,472.700012,3 days
4,2023-12-22,473.649994,4 days
5,2023-12-26,475.649994,8 days
6,2023-12-27,476.51001,9 days
7,2023-12-28,476.690002,10 days
8,2023-12-29,475.309998,11 days
9,2024-01-02,472.649994,15 days


In [26]:
# Verifying datatype for new column
# (dayspost121823 is still a timedelta here, before it is converted to whole days)
close_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype          
---  ------          --------------  -----          
 0   Date            20 non-null     datetime64[ns] 
 1   Close           20 non-null     float64        
 2   dayspost121823  20 non-null     timedelta64[ns]
dtypes: datetime64[ns](1), float64(1), timedelta64[ns](1)
memory usage: 612.0 bytes


In [27]:
# Converting new column datatype into integer (whole days since start_date_1m).
# Recomputed from the Date column instead of from the column's previous value, so
# the cell can be re-run safely: calling .dt on the already-converted integer
# column would raise an error.
close_data['dayspost121823'] = (close_data["Date"] - start_date_1m).dt.days
close_data

Unnamed: 0,Date,Close,dayspost121823
0,2023-12-18,471.970001,0
1,2023-12-19,474.839996,1
2,2023-12-20,468.26001,2
3,2023-12-21,472.700012,3
4,2023-12-22,473.649994,4
5,2023-12-26,475.649994,8
6,2023-12-27,476.51001,9
7,2023-12-28,476.690002,10
8,2023-12-29,475.309998,11
9,2024-01-02,472.649994,15


In [28]:
# Put the columns in reading order: date, day offset, closing price
close_data = close_data.loc[:, ["Date", "dayspost121823", "Close"]]
close_data

Unnamed: 0,Date,dayspost121823,Close
0,2023-12-18,0,471.970001
1,2023-12-19,1,474.839996
2,2023-12-20,2,468.26001
3,2023-12-21,3,472.700012
4,2023-12-22,4,473.649994
5,2023-12-26,8,475.649994
6,2023-12-27,9,476.51001
7,2023-12-28,10,476.690002
8,2023-12-29,11,475.309998
9,2024-01-02,15,472.649994


## Linear Regression Model Set-Up

In [29]:
# Feature matrix X: days since the start date, shaped as one column with a row per sample
X = close_data["dayspost121823"].to_numpy().reshape(-1, 1)

# Show the first five rows
X[:5]

array([[0],
       [1],
       [2],
       [3],
       [4]], dtype=int64)

In [30]:
# The shape of X should be 20 samples, with a single feature (column)
# (20 matches the row count reported by close_data.info() for this one-month window)
X.shape

(20, 1)

In [31]:
# Dependent variable y: the closing prices, kept as a pandas Series
y = close_data.loc[:, "Close"]

In [32]:
# Create a model with scikit-learn
# (an unfitted estimator with default settings; it is fitted in the next cell)
model = LinearRegression()

In [33]:
# Fit the data into the model
# (X = days since the start date, y = closing price; fitting sets coef_ and intercept_)
model.fit(X, y)

## Linear Regression Model Predictions

In [34]:
# Display the slope
# (coef_ is printed as an array; with a single feature it holds one value)
print(f"Model's slope: {model.coef_}")

Model's slope: [0.04694958]


In [35]:
# Display the y-intercept
# (the fitted line's value at x = 0, i.e. on the start date)
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 472.699427997597


In [36]:
# Display the model's best fit line formula
# (coef_[0] extracts the single slope value so it prints as a plain number)
print(f"Model's formula: y = {model.coef_[0]}X + {model.intercept_}")

Model's formula: y = 0.04694958448355189X + 472.699427997597


In [37]:
# Evaluate the fitted line 1 week (7 days) after the start date
y_7 = model.coef_[0] * 7 + model.intercept_

# Show the formula with x = 7 substituted in, then the resulting price
print(f"Model's formula: y = {model.coef_[0]} * 7 + {model.intercept_}")
print(f"Predicted closing price of SPY after 7 days: ${y_7:.2f}")

Model's formula: y = 0.04694958448355189 * 7 + 472.699427997597
Predicted closing price of SPY after 7 days: $473.03


In [38]:
# Fitted closing price for every day offset in X
predicted_y_values = model.predict(X)

# New frame = original columns plus the predicted closing prices
# (assign returns a copy, so close_data itself is left untouched)
df_close_predicted = close_data.assign(close_predicted=predicted_y_values)

# Display sample data
df_close_predicted.head()

Unnamed: 0,Date,dayspost121823,Close,close_predicted
0,2023-12-18,0,471.970001,472.699428
1,2023-12-19,1,474.839996,472.746378
2,2023-12-20,2,468.26001,472.793327
3,2023-12-21,3,472.700012,472.840277
4,2023-12-22,4,473.649994,472.887226


In [39]:
# Create a line plot of the predicted closing prices
import numpy as np
import hvplot.pandas

# Red best-fit line: predicted close against days since the start date
best_fit_line = df_close_predicted.hvplot.line(x="dayspost121823", y="close_predicted", color="red")
best_fit_line

In [40]:
# Scatter plot of the actual closing prices against days since the start date
salary_plot = df_close_predicted.hvplot.scatter(x="dayspost121823", y="Close", title="SPY Daily Closing Price")
salary_plot

In [41]:
# Superimpose the scatter plot of actual closing prices onto the red best-fit line
salary_plot * best_fit_line

In [42]:
# Metric helpers from scikit-learn (score itself is a method on the fitted model)
from sklearn.metrics import mean_squared_error, r2_score

# Goodness of fit: model.score and r2_score are both computed here for comparison
score = model.score(X, y)
r2 = r2_score(y, predicted_y_values)

# Error size: mean squared error and its square root, in price units
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)

# Spread of the actual closing prices (population standard deviation, ddof=0)
std = y.std(ddof=0)

In [43]:
# Report every metric, one per line
print(
    f"The score is {score}.",
    f"The r2 is {r2}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
    sep="\n",
)

The score is 0.02079927739628651.
The r2 is 0.02079927739628651.
The mean squared error is 9.022078885370577.
The root mean squared error is 3.003677560153649.
The standard deviation is 3.0354106069192848.


The r-squared value (coefficient of determination) is a common measure of how well a linear model fits the data. In more detail, it describes how much of the variation in the actual data (i.e., the true closing price of SPY a given x days after 12/18/2023) is accounted for by the linear regression model (represented by the best-fit line in red). For a least-squares fit evaluated on the data it was fitted to, r-squared lies between 0 and 1: the closer it is to 1, the more of the variation in the closing prices is accounted for by the model, and the closer it is to 0, the less.

In this model, the r-squared value is 0.02. This indicates that almost none of the variation in the closing prices is accounted for by the linear regression alone. Another test will be conducted with more data to determine whether the r-squared value improves. It may also be necessary to split the data into training and testing sets if sufficient data is available (which should not be an issue, since we can use up to 10 years of history if time allows).

In [44]:
# Notes on Linear Regression: 
# Linear regression: y = slope * x + y-intercept
# y = price
# x = days after start date
# slope = change in price/difference in days

# Linear Regression Model on 1 Year of SPY Data

## Data Set-Up

In [45]:
# importing dependencies
import yfinance as yf
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression
import datetime as dt

# Obtaining data for SPY from the year 2023
data = yf.download("SPY", start = "2023-1-1", end = "2023-12-31")

# Fail early with a clear message if the download came back empty; otherwise the
# first-row lookup below would raise an opaque indexing error.
if data.empty:
    raise RuntimeError("yfinance returned no SPY rows for 2023-1-1 to 2023-12-31")

# Creating a dataframe with only Date and Closing Price, with the Date index
# moved into a regular column so it can be used in arithmetic
close_df = data[["Close"]].reset_index()

# Verifying datatype of date
close_df.info()

# Saving earliest date for calculations (first row by position; assumes the rows
# are in chronological order, as downloaded)
start_date_1y = close_df["Date"].iloc[0]
print(start_date_1y)

# Creating a new column (dayspost1123) for the time elapsed since start_date_1y
close_df['dayspost1123'] = close_df["Date"] - start_date_1y

# Verifying datatype for new column (still a timedelta at this point)
close_df.info()

# Converting new column datatype into integer (whole days)
close_df['dayspost1123'] = close_df["dayspost1123"].dt.days

# Rearranging the dataframe; only this trailing expression is rendered as the
# cell output, so the intermediate bare displays were dropped as no-ops
close_df = close_df[['Date', 'dayspost1123', 'Close']]
close_df

[*********************100%%**********************]  1 of 1 completed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    250 non-null    datetime64[ns]
 1   Close   250 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 4.0 KB
2023-01-03 00:00:00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   Date          250 non-null    datetime64[ns] 
 1   Close         250 non-null    float64        
 2   dayspost1123  250 non-null    timedelta64[ns]
dtypes: datetime64[ns](1), float64(1), timedelta64[ns](1)
memory usage: 6.0 KB





Unnamed: 0,Date,dayspost1123,Close
0,2023-01-03,0,380.820007
1,2023-01-04,1,383.760010
2,2023-01-05,2,379.380005
3,2023-01-06,3,388.079987
4,2023-01-09,6,387.859985
...,...,...,...
245,2023-12-22,353,473.649994
246,2023-12-26,357,475.649994
247,2023-12-27,358,476.510010
248,2023-12-28,359,476.690002


## Linear Regression Model Set-Up

In [46]:
# Reformat data of the independent variable X as a single-column array
X = close_df["dayspost1123"].values.reshape(-1, 1)

# X must have one row per trading day in close_df and a single feature column.
# Checked with an assertion because a bare `X.shape` in the middle of a cell is
# never displayed.
assert X.shape == (len(close_df), 1), f"unexpected X shape: {X.shape}"

# Dependent variable y: the closing prices (a pandas Series)
y = close_df["Close"]

# Create a model with scikit-learn
model = LinearRegression()

# Fit the data into the model (the fitted estimator is rendered as the cell output)
model.fit(X, y)

## Linear Regression Model Predictions

In [47]:
# Evaluate the fitted line 1 week (7 days) after the start date, and at every day in X
y_7 = model.coef_[0] * 7 + model.intercept_
predicted_y_values = model.predict(X)

# Report the slope, the y-intercept, and the best-fit line formula
print(f"Model's slope: {model.coef_}")
print(f"Model's y-intercept: {model.intercept_}")
print(f"Model's formula: y = {model.coef_[0]}X + {model.intercept_}")

# Report the formula with x = 7 substituted in, then the resulting price
print(f"Model's formula: y = {model.coef_[0]} * 7 + {model.intercept_}")
print(f"Predicted closing price of SPY after 7 days: ${y_7:.2f}")

# New frame = original columns plus the predicted closing prices
# (assign returns a copy, so close_df itself is left untouched)
df_closed_predicted = close_df.assign(close_predicted=predicted_y_values)

# Display sample data
df_closed_predicted.head()

Model's slope: [0.18451151]
Model's y-intercept: 394.08353917248587
Model's formula: y = 0.1845115051391308X + 394.08353917248587
Model's formula: y = 0.1845115051391308 * 7 + 394.08353917248587
Predicted closing price of SPY after 7 days: $395.38


Unnamed: 0,Date,dayspost1123,Close,close_predicted
0,2023-01-03,0,380.820007,394.083539
1,2023-01-04,1,383.76001,394.268051
2,2023-01-05,2,379.380005,394.452562
3,2023-01-06,3,388.079987,394.637074
4,2023-01-09,6,387.859985,395.190608


## Adding Plots and Analyzing Findings

In [48]:
# Create a line plot of the predicted closing prices
import numpy as np
import hvplot.pandas

# Red best-fit line: predicted close against days since the start date
best_fit_line_2 = df_closed_predicted.hvplot.line(x="dayspost1123", y="close_predicted", color="red")

# Scatter plot of the actual closing prices
salary_plot_2 = df_closed_predicted.hvplot.scatter(x="dayspost1123", y="Close", title="SPY Daily Closing Price")

# Metric helpers from scikit-learn (score itself is a method on the fitted model)
from sklearn.metrics import mean_squared_error, r2_score

# Goodness of fit: model.score and r2_score are both computed here for comparison
score = model.score(X, y)
r2 = r2_score(y, predicted_y_values)

# Error size: mean squared error and its square root, in price units
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)

# Spread of the actual closing prices (population standard deviation, ddof=0)
std = y.std(ddof=0)

# Report every metric, one per line
print(
    f"The score is {score}.",
    f"The r2 is {r2}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
    sep="\n",
)

The score is 0.7033901684158216.
The r2 is 0.7033901684158216.
The mean squared error is 156.6321334368707.
The root mean squared error is 12.515276003223848.
The standard deviation is 22.97987492288892.


In [49]:
# Superimpose scatter plot onto best fit line
# (the two plot objects are combined with `*` and rendered as the cell output)
salary_plot_2 * best_fit_line_2

The model based on one year of SPY closing prices recorded an r-squared value of 0.7033, meaning that about 70% of the variation in the closing prices over that year is accounted for by the linear regression model alone. This could lead to the interpretation that this past year has been favorable to the growth of SPY, as the positive slope (about $0.18 per calendar day) indicates a gradual increase in its price.

# Linear Regression Model on 3 Years of SPY Data

## Data Set-Up

In [50]:
# importing dependencies
import yfinance as yf
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression
import datetime as dt

# Obtaining data for SPY from the years 2021-2023
data = yf.download("SPY", start = "2021-1-1", end = "2023-12-31")

# Fail early with a clear message if the download came back empty; otherwise the
# first-row lookup below would raise an opaque indexing error.
if data.empty:
    raise RuntimeError("yfinance returned no SPY rows for 2021-1-1 to 2023-12-31")

# Creating a dataframe with only Date and Closing Price, with the Date index
# moved into a regular column so it can be used in arithmetic
close_df_1 = data[["Close"]].reset_index()

# Verifying datatype of date
close_df_1.info()

# Saving earliest date for calculations (first row by position; assumes the rows
# are in chronological order, as downloaded)
start_date_3y = close_df_1["Date"].iloc[0]
print(start_date_3y)

# Creating a new column (dayspost1121) for the time elapsed since start_date_3y
close_df_1['dayspost1121'] = close_df_1["Date"] - start_date_3y

# Verifying datatype for new column (still a timedelta at this point)
close_df_1.info()

# Converting new column datatype into integer (whole days)
close_df_1['dayspost1121'] = close_df_1["dayspost1121"].dt.days

# Rearranging the dataframe; only this trailing expression is rendered as the
# cell output, so the intermediate bare displays were dropped as no-ops
close_df_1 = close_df_1[['Date', 'dayspost1121', 'Close']]
close_df_1

[*********************100%%**********************]  1 of 1 completed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753 entries, 0 to 752
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    753 non-null    datetime64[ns]
 1   Close   753 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 11.9 KB
2021-01-04 00:00:00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753 entries, 0 to 752
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   Date          753 non-null    datetime64[ns] 
 1   Close         753 non-null    float64        
 2   dayspost1121  753 non-null    timedelta64[ns]
dtypes: datetime64[ns](1), float64(1), timedelta64[ns](1)
memory usage: 17.8 KB





Unnamed: 0,Date,dayspost1121,Close
0,2021-01-04,0,368.790009
1,2021-01-05,1,371.329987
2,2021-01-06,2,373.549988
3,2021-01-07,3,379.100006
4,2021-01-08,4,381.260010
...,...,...,...
748,2023-12-22,1082,473.649994
749,2023-12-26,1086,475.649994
750,2023-12-27,1087,476.510010
751,2023-12-28,1088,476.690002


## Linear Regression Model Set-Up

In [51]:
# Reformat data of the independent variable X as a single-column array
X = close_df_1["dayspost1121"].values.reshape(-1, 1)

# X must have one row per trading day in close_df_1 and a single feature column.
# Checked with an assertion because a bare `X.shape` in the middle of a cell is
# never displayed.
assert X.shape == (len(close_df_1), 1), f"unexpected X shape: {X.shape}"

# Dependent variable y: the closing prices (a pandas Series)
y = close_df_1["Close"]

# Create a model with scikit-learn
model = LinearRegression()

# Fit the data into the model (the fitted estimator is rendered as the cell output)
model.fit(X, y)

## Linear Regression Model Predictions

In [52]:
# Evaluate the fitted line 1 week (7 days) after the start date, and at every day in X
y_7 = model.coef_[0] * 7 + model.intercept_
predicted_y_values = model.predict(X)

# Report the slope, the y-intercept, and the best-fit line formula
print(f"Model's slope: {model.coef_}")
print(f"Model's y-intercept: {model.intercept_}")
print(f"Model's formula: y = {model.coef_[0]}X + {model.intercept_}")

# Report the formula with x = 7 substituted in, then the resulting price
print(f"Model's formula: y = {model.coef_[0]} * 7 + {model.intercept_}")
print(f"Predicted closing price of SPY after 7 days: ${y_7:.2f}")

# New frame = original columns plus the predicted closing prices
# (assign returns a copy, so close_df_1 itself is left untouched)
df_closed_predicted = close_df_1.assign(close_predicted=predicted_y_values)

# Display sample data
df_closed_predicted.head()

Model's slope: [0.00977546]
Model's y-intercept: 415.5414180177732
Model's formula: y = 0.009775461932527625X + 415.5414180177732
Model's formula: y = 0.009775461932527625 * 7 + 415.5414180177732
Predicted closing price of SPY after 7 days: $415.61


Unnamed: 0,Date,dayspost1121,Close,close_predicted
0,2021-01-04,0,368.790009,415.541418
1,2021-01-05,1,371.329987,415.551193
2,2021-01-06,2,373.549988,415.560969
3,2021-01-07,3,379.100006,415.570744
4,2021-01-08,4,381.26001,415.58052


## Creating Plots and Analyzing Findings

In [53]:
# Create a line plot of the predicted closing prices
import numpy as np
import hvplot.pandas

# Red best-fit line: predicted close against days since the start date
best_fit_line_3 = df_closed_predicted.hvplot.line(x="dayspost1121", y="close_predicted", color="red")

# Scatter plot of the actual closing prices
salary_plot_3 = df_closed_predicted.hvplot.scatter(x="dayspost1121", y="Close", title="SPY Daily Closing Price")

# Metric helpers from scikit-learn (score itself is a method on the fitted model)
from sklearn.metrics import mean_squared_error, r2_score

# Goodness of fit: model.score and r2_score are both computed here for comparison
score = model.score(X, y)
r2 = r2_score(y, predicted_y_values)

# Error size: mean squared error and its square root, in price units
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)

# Spread of the actual closing prices (population standard deviation, ddof=0)
std = y.std(ddof=0)

# Report every metric, one per line
print(
    f"The score is {score}.",
    f"The r2 is {r2}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
    sep="\n",
)

The score is 0.01184734906617435.
The r2 is 0.01184734906617435.
The mean squared error is 792.6742237541264.
The root mean squared error is 28.15447075961696.
The standard deviation is 28.322745372098417.


In [54]:
# Superimpose scatter plot onto best fit line
# (the two plot objects are combined with `*` and rendered as the cell output)
salary_plot_3 * best_fit_line_3

In this linear regression model using three years of SPY closing price data, the r-squared value is 0.01. This is even lower than the model using one month of data (0.02), and it is a substantial drop in the variation accounted for compared to the model using one year of data (0.70), even though all three models use the same type of data covering different time periods. Looking closer at the scatterplot, it is observable that the trend in the change of the SPY closing price is not different from most ETFs following the S&P 500 (IVV, VOO are a few examples).

# Linear Regression Model With 1 Year of SPY Data Split

In [55]:
# Reformat data of the independent variable X as a single-column array
X = close_df["dayspost1123"].values.reshape(-1, 1)

# X must have one row per trading day in close_df and a single feature column.
# Checked with an assertion because a bare `X.shape` in the middle of a cell is
# never displayed.
assert X.shape == (len(close_df), 1), f"unexpected X shape: {X.shape}"

# Dependent variable y: the closing prices (a pandas Series)
y = close_df["Close"]

# Splitting Data
from sklearn.model_selection import train_test_split

# random_state pins the split so the reported score is reproducible.
# NOTE(review): train_test_split shuffles rows by default, so the held-out days
# are interleaved with the training days. The score then reflects filling in gaps
# inside the year rather than forecasting later dates; a chronological split
# (shuffle=False) would test forecasting - confirm which is intended.
x_train, x_test,y_train,y_test = train_test_split(X,y, random_state = 10)

# Create a model with scikit-learn
model = LinearRegression()

# Fit the model on the training portion only (the fitted estimator is the cell output)
model.fit(x_train, y_train)

In [56]:
# Using the model to predict output with x_test
# (one predicted closing price per held-out row; the array is the cell output)
model.predict(x_test)

array([409.28282921, 428.66734013, 410.39051555, 435.12884377,
       422.94429405, 426.63658184, 446.39032154, 418.88277747,
       451.92875323, 407.6212997 , 438.26728839, 443.62110569,
       458.39025687, 451.19029567, 401.34441045, 454.14412591,
       411.3135875 , 433.83654304, 450.26722372, 417.77509114,
       410.02128677, 458.02102809, 431.80578475, 433.28269987,
       429.59041208, 409.09821482, 397.28289388, 403.92901191,
       456.54411297, 422.0212221 , 431.06732719, 448.97492299,
       415.74433285, 457.09795614, 444.17494886, 395.99059315,
       438.82113156, 400.97518167, 415.00587529, 422.75967966,
       399.68288095, 424.79043794, 410.20590116, 427.92888257,
       450.6364525 , 394.69829242, 441.59034741, 444.72879203,
       437.52883083, 410.57512994, 442.69803374, 413.15973139,
       445.2826352 , 406.69822775, 420.54430698, 439.74420351,
       435.86730133, 423.49813722, 399.86749534, 407.25207092,
       443.4364913 , 395.80597876, 436.05191572])

In [57]:
# Evaluating the linear regression model on the held-out test split.
# score() printed the same value as r2_score in the earlier metric cells, i.e. it
# is the r-squared of the predictions, not a percent-correct accuracy.
model.score(x_test,y_test)

0.6749517066617914

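The model's r-squared on the held-out test data is about 0.67, i.e., it accounts for roughly 67% of the variation in the test-set closing prices. (`model.score` reports r-squared for a regression model, not a classification-style accuracy.)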

# Using the Model to Predict the SPY Closing Price on Any Date

In [58]:
# Variables before the loop
start_date_2023 = start_date_1y.to_pydatetime()

# Refit the one-year line on the full 2023 data instead of pasting the slope and
# intercept from an earlier printed output: `model` has since been re-fitted by
# later cells, and typed-in numbers silently go stale if the downloaded prices
# ever change.
model_1y = LinearRegression().fit(
    close_df["dayspost1123"].values.reshape(-1, 1), close_df["Close"]
)
y_int = float(model_1y.intercept_)
slope = float(model_1y.coef_[0])

In [59]:
# Keep prompting until the user types a valid YYYY-MM-DD date, then print the
# price the one-year regression line gives for that date.
while True:
    date_entry = input('Predict the closing price of SPY by entering a date in YYYY-MM-DD format: ')
    try:
        # strptime validates both the format and the calendar date; it raises
        # ValueError for anything it cannot parse. Only the parsing sits inside
        # the try so unrelated errors are not mistaken for bad input.
        date = dt.datetime.strptime(date_entry.strip(), "%Y-%m-%d").date()
    except ValueError:
        print("Try again, make sure you type a properly formatted date.")
        continue

    # Days between the requested date and the first 2023 trading day (the model's x)
    days_post_010323 = date - start_date_2023.date()
    predicted_price = y_int + slope * days_post_010323.days

    # :.2f always prints two decimals, so a price such as 465.10 is not shown as 465.1
    print(f"SPY Closing Price on {date}: ${predicted_price:.2f}")
    break

Predict the closing price of SPY by entering a date in YYYY-MM-DD format: 2024-01-23
SPY Closing Price on 2024-01-23: $465.12


In [60]:
## Potential Plans for Logistic Regression:
## Is change in SPY closing price + or negative based on date 
## (maybe add rest of data too)
## May need to calculate differences between current date closing price 
## & closing price of day before 
## (for day 1, may need to manually insert difference)
## Will test 1 year of data
## Variables are x = days after start, y = closing price 

Citations:

For splitting data to test linear regression model accuracy: https://medium.com/the-code-monster/split-a-dataset-into-train-and-test-datasets-using-sk-learn-acc7fd1802e0

For inputting date: https://stackoverflow.com/questions/15226898/python-3-2-input-date-function