# Notebook for learning from data coursework 2022

In [64]:
# import our libraries that we will be using to investigate the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

## load our csv data into a pandas dataframe so that we can manipulate it

In [65]:
# read the stock price CSV (AT&T / SBC, 2000-2020 according to the file name) into a
# pandas DataFrame; the path is relative, so the file is looked up in the working directory
dataframe = pd.read_csv("AT_T_SBC_Stock_Price_Data_2000_2020.csv")

## set up some variables that we will use to pre-process our data

In [66]:
# look-back window: lagged copies are generated for lags 1 .. time_series_depth - 1
time_series_depth = 5

# source columns that receive a lagged copy for every look-back step
important_cols = ["DATE", "OPENPRC", "VOL", "BIDLO", "ASKHI", "PRC"]

# for every lag k, add a column "<name><k>DAY" holding the value from k rows earlier
# (the first k rows of each new column are NaN because there is no earlier row)
for lag in range(1, time_series_depth):
    for source_col in important_cols:
        dataframe[f"{source_col}{lag}DAY"] = dataframe[source_col].shift(lag)


## take a look at the results when we include all the important data

In [67]:

# model inputs: every lagged copy of every important column, ordered by lag then column
col_selection = [
    col + str(lag) + "DAY"
    for lag in range(1, time_series_depth)
    for col in important_cols
]

# skip the leading rows: their lagged columns are incomplete
cleaned_data = dataframe.iloc[time_series_depth:]

# features are the lagged columns, the target is the same row's PRC value
X = cleaned_data[col_selection]
Y = cleaned_data["PRC"]

# random split: 70% of the rows for training, 30% held back for testing
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

# fit a plain linear regression and a lasso regression on the same training rows
reg = LinearRegression()
lasso = Lasso(max_iter=2_000)
reg.fit(x_train, y_train)
lasso.fit(x_train, y_train)

# predictions of both models for the held-back rows
linear_predictions = reg.predict(x_test)
lasso_predictions = lasso.predict(x_test)

total_error_linear = 0
total_error_lasso = 0

# walk the predictions in order, print each next to the true value and
# accumulate the squared errors of both models
actual_values = y_test.array
for row, (linear_guess, lasso_guess) in enumerate(zip(linear_predictions, lasso_predictions)):
    actual = actual_values[row]
    print("prediction linear:", linear_guess, "actual:", actual)
    print("prediction lasso:", lasso_guess, "actual:", actual)
    total_error_linear += (linear_guess - actual) ** 2
    total_error_lasso += (lasso_guess - actual) ** 2

# summed squared error and mean squared error per model
sample_count = len(y_test)
print("total error squared linear: ", total_error_linear, "MSE:", total_error_linear / sample_count)
print("total error squared lasso: ", total_error_lasso, "MSE:", total_error_lasso / sample_count)

prediction linear: 30.659464557170985 actual: 30.68
prediction lasso: 30.505517819426103 actual: 30.68
prediction linear: 26.256483856290718 actual: 26.27
prediction lasso: 26.455691639311084 actual: 26.27
prediction linear: 34.55686639308669 actual: 34.95
prediction lasso: 34.52456318737459 actual: 34.95
prediction linear: 26.18791436516801 actual: 26.06
prediction lasso: 26.299447332222854 actual: 26.06
prediction linear: 39.986574784586935 actual: 39.96
prediction lasso: 39.922830720411454 actual: 39.96
prediction linear: 33.95961148396798 actual: 34.0
prediction lasso: 34.03787406870259 actual: 34.0
prediction linear: 36.915282534985685 actual: 36.98
prediction lasso: 36.80578547360365 actual: 36.98
prediction linear: 28.55653274123839 actual: 28.63
prediction lasso: 28.62868600825059 actual: 28.63
prediction linear: 26.087287655412474 actual: 25.78
prediction lasso: 26.23408421739223 actual: 25.78
prediction linear: 37.68300098185239 actual: 37.5
prediction lasso: 37.4478259427549

  model = cd_fast.enet_coordinate_descent(


## check if reducing the amount of data will change our prediction quality

In [68]:
# only the lagged copies of these three columns are used as model inputs this time
cols_to_include = ["PRC", "VOL", "BIDLO"]
col_selection = [
    name + str(lag) + "DAY"
    for lag in range(1, time_series_depth)
    for name in cols_to_include
]

# skip the leading rows: their lagged columns are incomplete
cleaned_data = dataframe.iloc[time_series_depth:]

X = cleaned_data[col_selection]
Y = cleaned_data["PRC"]

# new random 70/30 split, then refit both existing model objects on the reduced inputs
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
reg.fit(x_train, y_train)
lasso.fit(x_train, y_train)

# predictions of both models for the held-back rows
linear_predictions = reg.predict(x_test)
lasso_predictions = lasso.predict(x_test)

total_error_linear = 0
total_error_lasso = 0

# print every prediction next to the true value and accumulate the squared errors
actual_values = y_test.array
for row, (linear_guess, lasso_guess) in enumerate(zip(linear_predictions, lasso_predictions)):
    actual = actual_values[row]
    print("prediction linear:", linear_guess, "actual:", actual)
    print("prediction lasso:", lasso_guess, "actual:", actual)
    total_error_linear += (linear_guess - actual) ** 2
    total_error_lasso += (lasso_guess - actual) ** 2

sample_count = len(y_test)
print("number of predictions:", sample_count)

# summed squared error and mean squared error per model
print("total error squared linear: ", total_error_linear, "MSE:", total_error_linear / sample_count)
print("total error squared lasso: ", total_error_lasso, "MSE:", total_error_lasso / sample_count)

prediction linear: 39.226103966247024 actual: 38.53
prediction lasso: 39.03757107884712 actual: 38.53
prediction linear: 24.969377651745084 actual: 25.0
prediction lasso: 25.172728701840953 actual: 25.0
prediction linear: 29.599465832435193 actual: 29.71
prediction lasso: 29.65083795503913 actual: 29.71
prediction linear: 44.197802517533674 actual: 45.25
prediction lasso: 44.09576969825012 actual: 45.25
prediction linear: 39.99974747625037 actual: 40.39
prediction lasso: 39.821068300706294 actual: 40.39
prediction linear: 32.577537014306216 actual: 32.14
prediction lasso: 32.62512026828587 actual: 32.14
prediction linear: 27.907215533717658 actual: 27.72
prediction lasso: 27.97019769963905 actual: 27.72
prediction linear: 35.434309580101335 actual: 36.51
prediction lasso: 35.370910103450214 actual: 36.51
prediction linear: 31.355941491458985 actual: 31.35
prediction lasso: 31.339438498438536 actual: 31.35
prediction linear: 34.88764157125455 actual: 36.25
prediction lasso: 34.870508674

## can we predict more into the future than one day?

In [69]:
# how many rows ahead of the current row the target price is taken from
# NOTE(review): the earlier comment here said "10 days" while the value is 100 — confirm
# which horizon is intended
time_series_future = 100

# target column: the PRC value time_series_future rows later.
# One negative shift is all that is needed; writing the same column once per offset in a
# loop only keeps the final write, which ends one row short of the horizon in the name.
future_col = "PRC" + str(time_series_future) + "DAY"
dataframe[future_col] = dataframe["PRC"].shift(-time_series_future)

# model inputs: lagged copies of the selected columns, ordered by lag then column
cols_to_include = ["PRC", "VOL", "BIDLO"]
col_selection = [
    name + str(lag) + "DAY"
    for lag in range(1, time_series_depth)
    for name in cols_to_include
]

# drop the leading rows (incomplete lagged inputs) and the trailing time_series_future
# rows (no future price available, so the target is NaN there)
cleaned_data = dataframe.iloc[time_series_depth:-time_series_future]

X = cleaned_data[col_selection]
Y = cleaned_data[future_col]

# random 70/30 split, then refit both existing model objects on the future target
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
reg.fit(x_train, y_train)
lasso.fit(x_train, y_train)

# predictions of both models for the held-back rows
linear_predictions = reg.predict(x_test)
lasso_predictions = lasso.predict(x_test)

total_error_linear = 0
total_error_lasso = 0

# print every prediction next to the true value and accumulate the squared errors
actual_values = y_test.array
for row, (linear_guess, lasso_guess) in enumerate(zip(linear_predictions, lasso_predictions)):
    actual = actual_values[row]
    print("prediction linear:", linear_guess, "actual:", actual)
    print("prediction lasso:", lasso_guess, "actual:", actual)
    total_error_linear += (linear_guess - actual) ** 2
    total_error_lasso += (lasso_guess - actual) ** 2

sample_count = len(y_test)
print("number of predictions:", sample_count)

# summed squared error and mean squared error per model
print("total error squared linear: ", total_error_linear, "MSE:", total_error_linear / sample_count)
print("total error squared lasso: ", total_error_lasso, "MSE:", total_error_lasso / sample_count)

prediction linear: 26.41205049769727 actual: 28.54
prediction lasso: 26.512901710626466 actual: 28.54
prediction linear: 37.79764342498511 actual: 37.3
prediction lasso: 38.00238885326914 actual: 37.3
prediction linear: 29.676500298642875 actual: 22.59
prediction lasso: 31.04660066586353 actual: 22.59
prediction linear: 35.47854546191246 actual: 33.91
prediction lasso: 35.050250920752596 actual: 33.91
prediction linear: 35.321965580122196 actual: 30.65
prediction lasso: 35.466533417054386 actual: 30.65
prediction linear: 32.64103508482191 actual: 35.5
prediction lasso: 32.43672665978803 actual: 35.5
prediction linear: 43.48201655836395 actual: 43.09
prediction lasso: 43.36587370325343 actual: 43.09
prediction linear: 41.262739464202 actual: 37.79
prediction lasso: 40.68975383203188 actual: 37.79
prediction linear: 33.617066644494905 actual: 35.65
prediction lasso: 33.55882226372779 actual: 35.65
prediction linear: 32.783281724825684 actual: 36.64
prediction lasso: 32.530813827212015 ac

  model = cd_fast.enet_coordinate_descent(
