# Notebook for learning from data coursework 2022

In [483]:
# import our libraries that we will be using to investigate the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import PolynomialFeatures

## load out csv data into a pandas dataframe so that we can manipulate it

In [484]:
dataframe = pd.read_csv("AT_T_SBC_Stock_Price_Data_2000_2020.csv")

# filter out the columns we dont care about
dataframe["LIQ"] = dataframe["ASKHI"] - dataframe["BIDLO"]
important_cols = ["DATE", "OPENPRC", "VOL", "BIDLO", "ASKHI", "PRC", "LIQ"]
dataframe = dataframe[important_cols]


In [485]:
def gen_model(model, data, cols, time_series_past, time_series_future, normalize=False, polynomial=0, show_plot=False):
    
    data = data.copy()

    #col_selection = cols_to_include.copy()
    col_selection = ["DATE"]
    for i in range(1, time_series_past):
        for col in cols:
            data[col + str(i) + "DAY"] = data[col].shift(periods=i)
            col_selection.append(col + str(i) + "DAY")

    data["PRC" + str(time_series_future) + "DAY"] = data["PRC"].shift(-time_series_future)

    if time_series_future == 0:
        cleaned_data = data.iloc[time_series_past:]
    else:
        cleaned_data = data.iloc[time_series_past:-time_series_future]

    if not polynomial == 0:
        poly = PolynomialFeatures(degree=polynomial, include_bias=False)
        poly.fit_transform(cleaned_data[cols])
        features = poly.transform(cleaned_data[cols])
        headers = poly.get_feature_names(cleaned_data[cols])
        X = features
    else:
        X = cleaned_data[col_selection]
    if time_series_future == 0:
        Y = cleaned_data["PRC"]
    else:
        Y = cleaned_data["PRC" + str(time_series_future) + "DAY"]

    if normalize:
        X = (X - X.mean()) / X.std()

    #x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

    #model.fit(x_train, y_train)

    # predict our testing data
    #predictions = model.predict(x_test)
    predictions = cross_val_predict(model, X, Y)
    print(len(predictions))

    total_error = 0

    # itterate our predictions and calculate the mean square error
    for i in range(len(predictions) - 1):
        total_error += pow(predictions[i] - Y.array[i], 2)

    if show_plot:
        print("number of predictions:", len(Y))

        if polynomial == 0:
            plt.plot(X["DATE"], predictions, "bo", markersize=0.3)
            plt.plot(X["DATE"], Y, "ro", markersize=0.3)
        else:
            plt.plot(X.transpose()[0], predictions, "bo", markersize=0.3)
            plt.plot(X.transpose()[0], Y, "ro", markersize=0.3)
        plt.title(str(model) + " predicting " + str(time_series_future) + " days into the future")
        plt.show()

    # check our mean square error for our models
    MSE = total_error / len(Y)
    print("total error squared: ", total_error, "MSE:", MSE)

    cross_val_linear = cross_val_score(model, X, Y)
    print("cross validation score:", cross_val_linear)

    return MSE

## take a look at the results when we include all the important data

In [486]:
linear_all_props = gen_model(LinearRegression(), dataframe, important_cols, time_series_past=5, time_series_future=0)
lasso_all_props = gen_model(Lasso(max_iter=3000), dataframe, important_cols, time_series_past=5, time_series_future=0)
ridge_all_props = gen_model(Ridge(max_iter=3000), dataframe, important_cols, time_series_past=5, time_series_future=0)

5282
total error squared:  1655.843987872449 MSE: 0.31348807040371995
cross validation score: [0.99209734 0.996042   0.97715208 0.9811265  0.98390535]


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


5282
total error squared:  2109.6915365632067 MSE: 0.39941149878137194


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


cross validation score: [0.98881913 0.99493425 0.97440667 0.97878125 0.98331347]
5282
total error squared:  1656.5301265454602 MSE: 0.3136179717049338
cross validation score: [0.99206128 0.99611783 0.97702995 0.98117353 0.98394013]


## check if reducing the amount of data will change our prediction quality

In [487]:
cols_to_include = ["PRC", "VOL", "LIQ"]
linear_selected_props = gen_model(LinearRegression(), dataframe, cols_to_include, time_series_past=5, time_series_future=0)
lasso_selected_props = gen_model(Lasso(max_iter=3000), dataframe, cols_to_include, time_series_past=5, time_series_future=0)
ridge_selected_props = gen_model(Ridge(max_iter=3000), dataframe, cols_to_include, time_series_past=5, time_series_future=0)

5282
total error squared:  1660.1024169742782 MSE: 0.31429428568236994
cross validation score: [0.99208827 0.99608418 0.97688807 0.98119053 0.98373387]


  model = cd_fast.enet_coordinate_descent(


5282
total error squared:  2166.5458306268947 MSE: 0.4101752803155802


  model = cd_fast.enet_coordinate_descent(


cross validation score: [0.98820077 0.9948172  0.97457345 0.97975279 0.98370015]
5282
total error squared:  1660.1748822391332 MSE: 0.3143080049676511
cross validation score: [0.99208702 0.9960841  0.97689021 0.98119086 0.98373618]


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


## selected columns with regularisation

In [488]:
linear_selected_props_normal = gen_model(LinearRegression(), dataframe, important_cols, time_series_past=5, time_series_future=10, normalize=True)
lasso_selected_props_normal = gen_model(Lasso(max_iter=3000), dataframe, important_cols, time_series_past=5, time_series_future=10, normalize=True)
ridge_selected_props_normal = gen_model(Ridge(max_iter=3000), dataframe, important_cols, time_series_past=5, time_series_future=10, normalize=True)

5272
total error squared:  14019.365393062471 MSE: 2.659211948608208
cross validation score: [0.93436674 0.96403149 0.82684457 0.82719479 0.85465384]
5272
total error squared:  24696.843765643327 MSE: 4.68453030456057
cross validation score: [0.86421236 0.92900256 0.73070787 0.75246994 0.84708448]
5272
total error squared:  14013.17911505408 MSE: 2.6580385271346887
cross validation score: [0.93432055 0.96381    0.8252407  0.82767896 0.85658118]


## can we predict more into the future than one day?

In [489]:
# 10 days into the future
linear_10_days_all = gen_model(LinearRegression(), dataframe, important_cols, time_series_past=5, time_series_future=10)
lasso_10_days_all = gen_model(Lasso(max_iter=3000), dataframe, important_cols, time_series_past=5, time_series_future=10)
ridge_10_days_all = gen_model(Ridge(max_iter=3000), dataframe, important_cols, time_series_past=5, time_series_future=10)

linear_10_days = gen_model(LinearRegression(), dataframe, cols_to_include, time_series_past=5, time_series_future=10)
lasso_10_days = gen_model(Lasso(max_iter=3000), dataframe, cols_to_include, time_series_past=5, time_series_future=10)
ridge_10_days = gen_model(Ridge(max_iter=3000), dataframe, cols_to_include, time_series_past=5, time_series_future=10)

linear_10_days_normal = gen_model(LinearRegression(), dataframe, cols_to_include, time_series_past=5, time_series_future=10, normalize=True)
lasso_10_days_normal = gen_model(Lasso(max_iter=3000), dataframe, cols_to_include, time_series_past=5, time_series_future=10, normalize=True)
ridge_10_days_normal = gen_model(Ridge(max_iter=3000), dataframe, cols_to_include, time_series_past=5, time_series_future=10, normalize=True)

5272
total error squared:  14019.365393140282 MSE: 2.659211948622967
cross validation score: [0.93436674 0.96403149 0.82684457 0.82719479 0.85465384]


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


5272
total error squared:  14647.129559774792 MSE: 2.7782870940392246


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


cross validation score: [0.92993753 0.96190646 0.82166076 0.82201095 0.85620994]
5272
total error squared:  14018.592890553364 MSE: 2.6590654193007137
cross validation score: [0.93436861 0.96403326 0.8268444  0.82718968 0.85468354]
5272
total error squared:  14038.2128671112 MSE: 2.6627869626538696
cross validation score: [0.93414167 0.96340385 0.82326372 0.82702264 0.85863236]
5272
total error squared:  14647.28043364231 MSE: 2.778315711995886
cross validation score: [0.92999321 0.96149008 0.81959307 0.82158497 0.85826133]
5272
total error squared:  14037.93998276176 MSE: 2.6627352015860697
cross validation score: [0.93414436 0.9634034  0.82327052 0.82702196 0.85862906]
5272
total error squared:  14038.212865409034 MSE: 2.6627869623310003


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


cross validation score: [0.93414167 0.96340385 0.82326372 0.82702264 0.85863236]
5272
total error squared:  23744.322338354876 MSE: 4.503854768276722
cross validation score: [0.87196394 0.92972719 0.73151888 0.76155623 0.84832185]
5272
total error squared:  14035.987070263545 MSE: 2.6623647705355737
cross validation score: [0.93417819 0.96337558 0.82339995 0.82700435 0.85854036]


## investigate ridge regression

In [490]:
ridge_10_day = gen_model(Ridge(max_iter=3000), dataframe, cols_to_include, time_series_past=5, time_series_future=10)

5272
total error squared:  14037.93998276176 MSE: 2.6627352015860697
cross validation score: [0.93414436 0.9634034  0.82327052 0.82702196 0.85862906]


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


## using only the price of the previous dates

In [491]:
lasso_only_100_days = gen_model(Lasso(max_iter=3000), dataframe, ["PRC"], time_series_past=5, time_series_future=100)

5182
total error squared:  85303.79333720515 MSE: 16.461557957777913
cross validation score: [ 0.67815323  0.72109681 -0.08860718 -0.34213987 -0.00564069]


## mean squared error summary

In [492]:
print("Mean Squared Errors (Lower is better)")
print("column selection: ", cols_to_include)
print("Linear with all properties: ", linear_all_props)
print("Lasso with all properties: ", lasso_all_props)
print("Ridge with all properties: ", ridge_all_props)
print()
print("Linear with selected properties: ", linear_selected_props)
print("Lasso with selected properties: ", lasso_selected_props)
print("Ridge with selected properties: ", ridge_selected_props)
print()
print("Linear with selected properties normalized: ", linear_selected_props_normal)
print("Lasso with selected properties normalized: ", lasso_selected_props_normal)
print("Ridge with selected properties normalized: ", ridge_selected_props_normal)
print()
print("Linear with all properties 10 days: ", linear_10_days_all)
print("Lasso with all properties 10 days: ", lasso_10_days_all)
print("Ridge with all properties 10 days: ", ridge_10_days_all)
print("Linear with selected properties 10 days: ", linear_10_days)
print("Lasso with selected properties 10 days: ", lasso_10_days)
print("Ridge with selected properties 10 days: ", ridge_10_days)
print("Linear with selected properties 10 days normalized: ", linear_10_days_normal)
print("Lasso with selected properties 10 days normalized: ", lasso_10_days_normal)
print("Ridge with selected properties 10 days normalized: ", ridge_10_days_normal)
print()

Mean Squared Errors (Lower is better)
column selection:  ['PRC', 'VOL', 'LIQ']
Linear with all properties:  0.31348807040371995
Lasso with all properties:  0.39941149878137194
Ridge with all properties:  0.3136179717049338

Linear with selected properties:  0.31429428568236994
Lasso with selected properties:  0.4101752803155802
Ridge with selected properties:  0.3143080049676511

Linear with selected properties normalized:  2.659211948608208
Lasso with selected properties normalized:  4.68453030456057
Ridge with selected properties normalized:  2.6580385271346887

Linear with all properties 10 days:  2.659211948622967
Lasso with all properties 10 days:  2.7782870940392246
Ridge with all properties 10 days:  2.6590654193007137
Linear with selected properties 10 days:  2.6627869626538696
Lasso with selected properties 10 days:  2.778315711995886
Ridge with selected properties 10 days:  2.6627352015860697
Linear with selected properties 10 days normalized:  2.6627869623310003
Lasso with s