In [1]:
!pip install yfinance[nospam,repair]



# Data Set-Up

In [1]:
# importing dependencies
import yfinance as yf
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression
import datetime as dt

In [3]:
# Obtaining data for SPY from the past month (done on 1-18-2024)
data = yf.download("SPY", start = "2023-12-18", end = "2024-1-18")

[*********************100%%**********************]  1 of 1 completed


In [4]:
# Creating a dataframe with only Date and Closing Price
close_data = data[["Close"]]

In [5]:
# Verifying data within dataframe
close_data

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2023-12-18,471.970001
2023-12-19,474.839996
2023-12-20,468.26001
2023-12-21,472.700012
2023-12-22,473.649994
2023-12-26,475.649994
2023-12-27,476.51001
2023-12-28,476.690002
2023-12-29,475.309998
2024-01-02,472.649994


In [6]:
close_data.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2023-12-18,471.970001
2023-12-19,474.839996
2023-12-20,468.26001
2023-12-21,472.700012
2023-12-22,473.649994


In [7]:
# Resetting index to manipulate date column
close_data = close_data.reset_index()

In [8]:
# Verifying datatype of date
close_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    20 non-null     datetime64[ns]
 1   Close   20 non-null     float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 452.0 bytes


In [9]:
# Saving earliest date for calculations
start_date = close_data["Date"][0]
print(start_date)

2023-12-18 00:00:00


In [10]:
# Creating a new column (dayspost121823) for count of days after start_date
close_data['dayspost121823'] = close_data["Date"] - start_date
close_data

Unnamed: 0,Date,Close,dayspost121823
0,2023-12-18,471.970001,0 days
1,2023-12-19,474.839996,1 days
2,2023-12-20,468.26001,2 days
3,2023-12-21,472.700012,3 days
4,2023-12-22,473.649994,4 days
5,2023-12-26,475.649994,8 days
6,2023-12-27,476.51001,9 days
7,2023-12-28,476.690002,10 days
8,2023-12-29,475.309998,11 days
9,2024-01-02,472.649994,15 days


In [11]:
# Verifying datatype for new column
close_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype          
---  ------          --------------  -----          
 0   Date            20 non-null     datetime64[ns] 
 1   Close           20 non-null     float64        
 2   dayspost121823  20 non-null     timedelta64[ns]
dtypes: datetime64[ns](1), float64(1), timedelta64[ns](1)
memory usage: 612.0 bytes


In [12]:
# Converting new column datatype into integer
close_data['dayspost121823'] = close_data["dayspost121823"].dt.days
close_data

Unnamed: 0,Date,Close,dayspost121823
0,2023-12-18,471.970001,0
1,2023-12-19,474.839996,1
2,2023-12-20,468.26001,2
3,2023-12-21,472.700012,3
4,2023-12-22,473.649994,4
5,2023-12-26,475.649994,8
6,2023-12-27,476.51001,9
7,2023-12-28,476.690002,10
8,2023-12-29,475.309998,11
9,2024-01-02,472.649994,15


In [13]:
# Rearranging the dataframe
close_data = close_data[['Date', 'dayspost121823', 'Close']]
close_data

Unnamed: 0,Date,dayspost121823,Close
0,2023-12-18,0,471.970001
1,2023-12-19,1,474.839996
2,2023-12-20,2,468.26001
3,2023-12-21,3,472.700012
4,2023-12-22,4,473.649994
5,2023-12-26,8,475.649994
6,2023-12-27,9,476.51001
7,2023-12-28,10,476.690002
8,2023-12-29,11,475.309998
9,2024-01-02,15,472.649994


# Linear Regression Model Set-Up

In [14]:
# Reformat data of the independent variable X as a single-column array
X = close_data["dayspost121823"].values.reshape(-1, 1)

# Display sample data
X[:5]

array([[0],
       [1],
       [2],
       [3],
       [4]], dtype=int64)

In [15]:
# The shape of X should be 20 samples, with a single feature (column)
X.shape

(20, 1)

In [16]:
# Create an array for the dependent variable y
y = close_data["Close"]

In [17]:
# Create a model with scikit-learn
model = LinearRegression()

In [18]:
# Fit the data into the model
model.fit(X, y)

# Linear Regression Model Predictions

In [19]:
# Display the slope
print(f"Model's slope: {model.coef_}")

Model's slope: [0.04694958]


In [20]:
# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 472.699427997597


In [21]:
# Display the model's best fit line formula
print(f"Model's formula: y = {model.coef_[0]}X + {model.intercept_}")

Model's formula: y = 0.04694958448355189X + 472.699427997597


In [22]:
# Display the formula to predict the closing price of SPY after 1 week (7 days)
print(f"Model's formula: y = {model.coef_[0]} * 7 + {model.intercept_}")

# Predict the closing price of the SPY after 1 week
y_7 = model.intercept_ + model.coef_[0] * 7

# Display the prediction
print(f"Predicted closing price of SPY after 7 days: ${y_7:.2f}")

Model's formula: y = 0.04694958448355189 * 7 + 472.699427997597
Predicted closing price of SPY after 7 days: $473.03


In [23]:
# Make predictions using the X set
predicted_y_values = model.predict(X)
# Create a copy of the original data
df_close_predicted = close_data.copy()

# Add a column with the predicted salary values
df_close_predicted["close_predicted"] = predicted_y_values

# Display sample data
df_close_predicted.head()

Unnamed: 0,Date,dayspost121823,Close,close_predicted
0,2023-12-18,0,471.970001,472.699428
1,2023-12-19,1,474.839996,472.746378
2,2023-12-20,2,468.26001,472.793327
3,2023-12-21,3,472.700012,472.840277
4,2023-12-22,4,473.649994,472.887226


In [24]:
# Create a line plot of the predicted closing prices
import numpy as np
import hvplot.pandas
best_fit_line = df_close_predicted.hvplot.line(
    x = "dayspost121823",
    y = "close_predicted",
    color = "red"
)
best_fit_line

In [25]:
# Create a scatter plot with the closing price information
salary_plot = df_close_predicted.hvplot.scatter(
    x="dayspost121823",
    y="Close",
    title="SPY Daily Closing Price"
)
salary_plot

In [26]:
salary_plot * best_fit_line

In [27]:
# Import relevant metrics - score, r2, mse, rmse, std - from Scikit-learn
from sklearn.metrics import mean_squared_error, r2_score
# Compute the metrics for the linear regression model
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

In [28]:
# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.02079927739628651.
The r2 is 0.02079927739628651.
The mean squared error is 9.022078885370577.
The root mean squared error is 3.003677560153649.
The standard deviation is 3.0354106069192848.


The r-squared value is generally a good measure of how strong a linear association is between two variables. In more detail, it basically describe how much of the variation in the actual data (aka the true closing price of the SPY at a given x days after 12/18/2023) is accounted for by the linear regression model (represented in the best fit line in red). The close r-squared is to either -1 or 1, the more variation of the closing prices can be accounted for by the model, and the inverse is true if closer to 0.

In this model, the r-squared value is 0.02. This indicates that very little to none of the variation in the closing prices can be accounted for by the linear regression alone. Another test will be conducted with more data to determine if the r-squared value can be changed. It may also be necessary to split the data into training and testing data if sufficent data is provided (which should not be an issue since we can utilize up to 10 years of history if time allows).

In [29]:
# Linear regression: y = slope * x + y-intercept
# y = price
# x = days after start date
# slope = change in price/difference in days

In [32]:
# Data Set-Up

# importing dependencies
import yfinance as yf
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression
import datetime as dt

# Obtaining data for SPY from the year 2023
data = yf.download("SPY", start = "2023-1-1", end = "2023-12-31")

# Creating a dataframe with only Date and Closing Price
close_df = data[["Close"]]

# Verifying data within dataframe
close_df.head()

# Resetting index to manipulate date column
close_df = close_df.reset_index()

# Verifying datatype of date
close_df.info()

# Saving earliest date for calculations
start_date = close_df["Date"][0]
print(start_date)

# Creating a new column (dayspost121823) for count of days after start_date
close_df['dayspost1123'] = close_df["Date"] - start_date
close_df

# Verifying datatype for new column
close_df.info()

# Converting new column datatype into integer
close_df['dayspost1123'] = close_df["dayspost1123"].dt.days
close_df

# Rearranging the dataframe
close_df = close_df[['Date', 'dayspost1123', 'Close']]
close_df

# Linear Regression Model Set-Up

# Reformat data of the independent variable X as a single-column array
X = close_df["dayspost1123"].values.reshape(-1, 1)

# Display sample data
X[:5]

# The shape of X should be 20 samples, with a single feature (column)
X.shape

# Create an array for the dependent variable y
y = close_df["Close"]

# Create a model with scikit-learn
model = LinearRegression()

# Fit the data into the model
model.fit(X, y)

# Linear Regression Model Predictions

# Display the slope
print(f"Model's slope: {model.coef_}")

# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

# Display the model's best fit line formula
print(f"Model's formula: y = {model.coef_[0]}X + {model.intercept_}")

# Display the formula to predict the closing price of SPY after 1 week (7 days)
print(f"Model's formula: y = {model.coef_[0]} * 7 + {model.intercept_}")

# Predict the closing price of SPY after 1 week
y_7 = model.intercept_ + model.coef_[0] * 7

# Display the prediction
print(f"Predicted closing price of SPY after 7 days: ${y_7:.2f}")

# Make predictions using the X set
predicted_y_values = model.predict(X)
# Create a copy of the original data
df_closed_predicted = close_df.copy()

# Add a column with the predicted salary values
df_closed_predicted["close_predicted"] = predicted_y_values

# Display sample data
df_closed_predicted.head()

# Create a line plot of the predicted closing prices
import numpy as np
import hvplot.pandas
best_fit_line_2 = df_closed_predicted.hvplot.line(
    x = "dayspost1123",
    y = "close_predicted",
    color = "red"
)


# Create a scatter plot with the closing price information
salary_plot_2 = df_closed_predicted.hvplot.scatter(
    x="dayspost1123",
    y="Close",
    title="SPY Daily Closing Price"
)

# Import relevant metrics - score, r2, mse, rmse, std - from Scikit-learn
from sklearn.metrics import mean_squared_error, r2_score
# Compute the metrics for the linear regression model
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

[*********************100%%**********************]  1 of 1 completed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    250 non-null    datetime64[ns]
 1   Close   250 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 4.0 KB
2023-01-03 00:00:00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   Date          250 non-null    datetime64[ns] 
 1   Close         250 non-null    float64        
 2   dayspost1123  250 non-null    timedelta64[ns]
dtypes: datetime64[ns](1), float64(1), timedelta64[ns](1)
memory usage: 6.0 KB
Model's slope: [0.18451151]
Model's y-intercept: 394.08353917248587
Model's formula: y = 0.1845115051391308X + 394.08353917248587
Model's formula: y = 0.1845115051391308 * 7 + 394.08353917248587
Predicte




In [33]:
#Superimpose scatter plot onto best fit line
salary_plot_2 * best_fit_line_2

In [34]:
# Data Set-Up

# importing dependencies
import yfinance as yf
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression
import datetime as dt

# Obtaining data for SPY from the years 2021-2023
data = yf.download("SPY", start = "2021-1-1", end = "2023-12-31")

# Creating a dataframe with only Date and Closing Price
close_df = data[["Close"]]

# Verifying data within dataframe
close_df.head()

# Resetting index to manipulate date column
close_df = close_df.reset_index()

# Verifying datatype of date
close_df.info()

# Saving earliest date for calculations
start_date = close_df["Date"][0]
print(start_date)

# Creating a new column (dayspost121823) for count of days after start_date
close_df['dayspost1121'] = close_df["Date"] - start_date
close_df

# Verifying datatype for new column
close_df.info()

# Converting new column datatype into integer
close_df['dayspost1121'] = close_df["dayspost1121"].dt.days
close_df

# Rearranging the dataframe
close_df = close_df[['Date', 'dayspost1121', 'Close']]
close_df

# Linear Regression Model Set-Up

# Reformat data of the independent variable X as a single-column array
X = close_df["dayspost1121"].values.reshape(-1, 1)

# Display sample data
X[:5]

# The shape of X should be 20 samples, with a single feature (column)
X.shape

# Create an array for the dependent variable y
y = close_df["Close"]

# Create a model with scikit-learn
model = LinearRegression()

# Fit the data into the model
model.fit(X, y)

# Linear Regression Model Predictions

# Display the slope
print(f"Model's slope: {model.coef_}")

# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

# Display the model's best fit line formula
print(f"Model's formula: y = {model.coef_[0]}X + {model.intercept_}")

# Display the formula to predict the closing price of SPY after 1 week (7 days)
print(f"Model's formula: y = {model.coef_[0]} * 7 + {model.intercept_}")

# Predict the closing price of SPY after 1 week
y_7 = model.intercept_ + model.coef_[0] * 7

# Display the prediction
print(f"Predicted closing price of SPY after 7 days: ${y_7:.2f}")

# Make predictions using the X set
predicted_y_values = model.predict(X)
# Create a copy of the original data
df_closed_predicted = close_df.copy()

# Add a column with the predicted salary values
df_closed_predicted["close_predicted"] = predicted_y_values

# Display sample data
df_closed_predicted.head()

# Create a line plot of the predicted closing prices
import numpy as np
import hvplot.pandas
best_fit_line_3 = df_closed_predicted.hvplot.line(
    x = "dayspost1121",
    y = "close_predicted",
    color = "red"
)


# Create a scatter plot with the closing price information
salary_plot_3 = df_closed_predicted.hvplot.scatter(
    x="dayspost1121",
    y="Close",
    title="SPY Daily Closing Price"
)

# Import relevant metrics - score, r2, mse, rmse, std - from Scikit-learn
from sklearn.metrics import mean_squared_error, r2_score
# Compute the metrics for the linear regression model
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

[*********************100%%**********************]  1 of 1 completed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753 entries, 0 to 752
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    753 non-null    datetime64[ns]
 1   Close   753 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 11.9 KB
2021-01-04 00:00:00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753 entries, 0 to 752
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   Date          753 non-null    datetime64[ns] 
 1   Close         753 non-null    float64        
 2   dayspost1121  753 non-null    timedelta64[ns]
dtypes: datetime64[ns](1), float64(1), timedelta64[ns](1)
memory usage: 17.8 KB
Model's slope: [0.00977546]
Model's y-intercept: 415.5414180177732
Model's formula: y = 0.009775461932527625X + 415.5414180177732
Model's formula: y = 0.009775461932527625 * 7 + 415.5414180177732
Predi




In [35]:
#Superimpose scatter plot onto best fit line
salary_plot_3 * best_fit_line_3