# Introduction
This notebook is similar to the others (as of 5.2.2024),
but it adds multiple stocks to the training data in the form of additional rows rather than additional columns.

# Imports

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

# Helper Functions

In [None]:
def getRelativeStockData(ticker, start, period, end="2024-2-2", interval="1d"):
    """Download close prices for ``ticker`` and return them as relative changes.

    Parameters:
        ticker: symbol passed to ``yf.download``.
        start: start date of the download; when ``None`` the ``period``
            argument is used instead of ``start``/``end``.
        period: look-back period, only used when ``start`` is ``None``.
        end: end date, only used together with ``start``.
        interval: sampling interval forwarded to ``yf.download``.

    Returns:
        A DataFrame with the former index as a column (renamed to "Date" if it
        was unnamed) and a "Close" column holding the fractional change to the
        previous data point (1% = 0.01). The first data point is dropped
        because it has no predecessor.
    """
    # Identity check: `== None` can be overridden by custom __eq__ implementations
    if start is None:
        data = yf.download(ticker, period=period, interval=interval)
    else:
        data = yf.download(ticker, start=start, end=end, interval=interval)

    # Only use the close price
    close = data["Close"]

    # Depending on the yfinance version, selecting "Close" yields a DataFrame
    # with one column per ticker instead of a Series. Normalise to a Series so
    # the result always carries a column literally named "Close".
    if isinstance(close, pd.DataFrame):
        if close.shape[1] == 0:
            # Nothing was downloaded: keep an all-NaN series that dropna() empties
            close = pd.Series(dtype="float64", index=close.index)
        else:
            close = close.iloc[:, 0]
    close = close.rename("Close")

    # Make every value relative to the previous data point
    relative = close.pct_change().dropna().reset_index()

    # An unnamed index becomes a column called "index" after reset_index()
    relative = relative.rename(columns={"index": "Date"})

    return relative


def scrollData(data, contextWindow=10, solutionWindow=3):
    """Turn a series of relative changes into sliding-window rows.

    Parameters:
        data: DataFrame containing a "Close" column (other columns such as
            "Date" are carried along unchanged). It is NOT modified.
        contextWindow: number of past values per row; they end up in the
            columns Context0, Context-1, Context-2, ...
        solutionWindow: number of values per row in the columns
            Solution1, Solution2, ...

    Returns:
        A new DataFrame without the "Close" column, without rows that contain
        NaN, and with a fresh 0..n-1 index.
        Column order: original columns, Solution1.., Context0, Context-1, ...

    NOTE(review): Solution1 is ``Close.shift(0)`` and therefore identical to
    Context0, i.e. the first "solution" value is also part of the context.
    This is kept as-is because changing it would alter the training data;
    confirm whether the overlap is intended.
    """
    close = data["Close"]

    # Build all shifted columns first and attach them in one concat. This
    # leaves the caller's frame untouched and avoids inserting hundreds of
    # columns one at a time into the same DataFrame.
    shiftedColumns = [
        close.shift(-i).rename(f"Solution{i+1}") for i in range(solutionWindow)
    ]
    shiftedColumns += [
        close.shift(i).rename(f"Context{-i}") for i in range(contextWindow)
    ]
    windows = pd.concat([data, *shiftedColumns], axis=1)

    # Rows at both ends lack a full window and therefore contain NaN
    windows = windows.dropna()

    # The raw Close column is fully represented by Context0
    windows = windows.drop(columns=["Close"])

    # Fix the index
    windows = windows.reset_index(drop=True)

    return windows


def unrelativizeDataArray(data):
    """Convert per-step relative changes into cumulative relative changes.

    The first element is used as the base and set to 0. Every following
    element becomes the compounded change since that base:
    ``out[i] = (data[i] + 1) * (out[i-1] + 1) - 1``.

    Parameters:
        data: array-like of relative changes (1% = 0.01). A 2-D input with a
            single row (shape ``(1, n)``) is unwrapped to that row first.
            The input is not modified.

    Returns:
        A new numpy array with the cumulative changes. Empty input is
        returned as an empty array.
    """
    processedData = np.array(data, copy=True)

    # Integer input would silently truncate the fractional results
    if not np.issubdtype(processedData.dtype, np.inexact):
        processedData = processedData.astype(float)

    # Unwrap a single row nested in an outer array. Only do this for real
    # 2-D input; a 1-D array with one element must stay an array.
    if processedData.ndim > 1 and len(processedData) == 1:
        processedData = processedData[0]

    # Nothing to compound
    if len(processedData) == 0:
        return processedData

    # Make the base (first value) 0
    processedData[0] = 0

    # Compound every value onto the already unrelativized predecessor
    for i in range(1, len(processedData)):
        processedData[i] = (processedData[i] + 1) * (processedData[i - 1] + 1) - 1

    return processedData

# Prepare Data

In [None]:
# Some Parameters
# Number of past data points per training row (model input width)
contextWindow=1000
# Number of data points per training row the model has to predict (output width)
solutionWindow=100

# Tickers whose histories are stacked as rows of the training data.
# NOTE(review): some symbols may no longer be listed under these names
# (e.g. "FB"); such tickers presumably return no data and are skipped below.
stocksForData = ["AAPL", "MSFT", "GOOGL", "FB", "AMZN", "IBM", "TSLA", "NFLX", 
    "NVDA", "INTC", 'GOLLQ', 'GSK', 'NWG', 'BIRK', 'IHG', 'TAK', 'BAC', 'BK', 'CWK', 'CI', 
    'STT', 'JPM', 'WASH', 'CL', 'WLY', 'HIG', 'C', 'YORW', 'BMO', 'FISI', 
    'BG', 'EBC', 'RBGLY', 'MO', 'KEY', 'WTW', 'CFG', 'M', 'BNS', 'ROG', 
    'MCK', 'IFF', 'CHMG', 'NABZY', 'ONB', 'HIFS', 'TMP', 'PG', 'DE', 
    'BRK.B', 'PFS', 'ADX', 'DNB', 'RYI', 'SWK', 'TRC', 'PSO', 'BC', 'CHD', 
    'BHLB', 'SIEGY', 'FNF', 'AGR', 'CLF', 'PUK', 'UNM', 'LAZ', 'PFE', 
    'ESLOY', 'CMA', 'DCO', 'AXP', 'CCU', 'MATW', 'BHP', 'TRI', 'GLW', 
    'NYT', 'DOLE', 'AROW', 'CMTV', 'WFC', 'PNC', 'THG', 'SWBI', 'BHRB', 
    'TRV', 'OTIS', 'CNA', 'VIVHY', 'LEVI', 'BLCO', 'NVRI', 'WNEB', 'TD', 
    'CR', 'CXT', 'MGEE', 'MTB', 'SNN', 'NBTB', 'SAN', 'BBVA', 'SR', 'B', 
    'ACNB', 'ITOCY', 'MARUY', 'FITB', 'FHB']

# Main Loop for getting more data in form of rows.
# The per-stock frames are collected in a list and concatenated once at the
# end. This also works when the first ticker is skipped, and avoids
# re-copying the growing frame on every iteration.
stockFrames = []
for stockTicker in stocksForData:

    # Get the full available history of the current ticker
    stockData = getRelativeStockData(stockTicker, start=None, period="max", interval="1d")

    # Check if enough Data available for at least one complete window
    if len(stockData) < contextWindow + solutionWindow:
        print(f"Skipping {stockTicker} because there is not enough data")
        continue

    # Scroll the data
    stockData = scrollData(stockData, contextWindow=contextWindow, solutionWindow=solutionWindow)

    # Remember the rows of this stock for the main data
    stockFrames.append(stockData)

    print(f"Getting Data for {stockTicker}")
    print(f"This one has {len(stockData)} rows")

# Fail with a clear message instead of a NameError further down
if not stockFrames:
    raise RuntimeError("No ticker provided enough data to build the training set")

# Stack the rows of all stocks into the main data
data = pd.concat(stockFrames, ignore_index=True)

In [None]:
# Hold out a random 10% of the rows for testing; the rest is used for training
train, test = train_test_split(data, test_size=0.1)

# Names of the input (context) and target (solution) columns created by scrollData
contextColumns = [f"Context{-offset}" for offset in range(contextWindow)]
solutionColumns = [f"Solution{step}" for step in range(1, solutionWindow + 1)]

# Features (X) and targets (y) of the training part
X_train, y_train = train[contextColumns], train[solutionColumns]

# Features (X) and targets (y) of the testing part
X_test, y_test = test[contextColumns], test[solutionColumns]

# Train the Model

In [None]:
# Making the MLP model: one hidden layer with 100 units.
# `(100,)` is a real one-element tuple; the former `(100)` was just the int 100.
# NOTE: according to the scikit-learn documentation, `learning_rate` is only
# used with solver='sgd', so 'adaptive' has no effect with the default solver.
model = MLPRegressor(
    hidden_layer_sizes=(100,),
    max_iter=200,
    verbose=True,
    learning_rate='adaptive',
    learning_rate_init=0.0001,
    early_stopping=True,
)
# Alternative, deeper configuration kept for experiments:
# model = MLPRegressor(hidden_layer_sizes=(100, 100, 100, 100, 100), activation='relu', solver='adam', alpha=0.0001, learning_rate='adaptive', learning_rate_init=0.001, verbose=True)


# Train the model on the context columns to predict the solution columns
model.fit(X_train, y_train)

# Test the Model

In [None]:
# Get data for the stock used for the visual check (ticker "B").
# NOTE(review): "B" is also part of stocksForData, so these rows may have been
# seen during training - confirm whether a held-out ticker should be used.
stockData = getRelativeStockData("B", start=None, period="5y", interval="1d")

# Scroll the data
stockData = scrollData(stockData, contextWindow=contextWindow, solutionWindow=solutionWindow)

# Get the x'th data point for testing
whereToPredict = 1 # points before end
if not 0 <= whereToPredict < len(stockData):
    raise ValueError("whereToPredict must address an existing row of stockData")

# Positional selection keeps a one-row DataFrame and, unlike the slice
# [-1-w:-w], also works for whereToPredict = 0 (the very last row)
rowPosition = len(stockData) - 1 - whereToPredict
contextToPredict = stockData[contextColumns].iloc[[rowPosition]]
solutionToPredict = stockData[solutionColumns].iloc[[rowPosition]]

# Make Prediction on that data point
predicitons = model.predict(contextToPredict)

# Make the data useful for plotting: the context columns run from newest
# (Context0) to oldest, so they are reversed into chronological order
plottedContext = contextToPredict.values[0][::-1].reshape(-1, 1)
plottedSolution = solutionToPredict.values.reshape(-1, 1)
plottedPredictions = predicitons.reshape(-1, 1)

# Plot the context in blue, the actual solution in green, the prediction in red
plt.plot(plottedContext, "blue")

# Solution1 is the same data point as Context0 (see scrollData), so both the
# solution and the prediction of it start at the last context position
solutionStart = contextWindow - 1

x_values = range(solutionStart, solutionStart + len(plottedSolution))
plt.plot(x_values, plottedSolution, "green")

x_values = range(solutionStart, solutionStart + len(plottedPredictions))
plt.plot(x_values, plottedPredictions, "red")

# Set the x-axis range to zoom onto the end of the context and the forecast
zoomMargin = len(plottedPredictions) * 1.1
plt.xlim(len(plottedContext) - zoomMargin, len(plottedContext) + zoomMargin)

plt.grid()

In [None]:
# Plot the cumulative (unrelativized) curves: context in blue, actual
# solution in green, prediction in red.

# Cumulative context, shifted so that its last point lies at 0
contextCurve = unrelativizeDataArray(plottedContext)
contextEnd = contextCurve[-1]
shiftedContext = contextCurve - contextEnd
plt.plot(shiftedContext, "blue")

# Solution1 is the same data point as Context0 (see scrollData), and
# unrelativizeDataArray turns the first value of each curve into the base 0.
# Both forecast curves therefore start at the last context position.
solutionStart = contextWindow - 1

# Scaling by (contextEnd + 1) continues the compounding of the context curve
# in the shifted coordinate system: (1+C)(1+S) - 1 - C = S * (1+C)
x_values = range(solutionStart, solutionStart + len(plottedSolution))
solutionCurve = unrelativizeDataArray(plottedSolution) * (contextEnd + 1)
y_values = solutionCurve
plt.plot(x_values, y_values , "green")

x_values = range(solutionStart, solutionStart + len(plottedPredictions))
predictionCurve = unrelativizeDataArray(plottedPredictions) * (contextEnd + 1)
y_values = predictionCurve
plt.plot(x_values, y_values, "red")

# Set the x-axis range to zoom onto the end of the context and the forecast
zoomMargin = len(predictionCurve) * 1.1
xlim = len(contextCurve) - zoomMargin, len(contextCurve) + zoomMargin
plt.xlim(xlim)

# Collect all values visible in the plot (for the optional y-axis zoom below).
# The slice start is clamped so a negative left limit does not wrap around.
visibleFrom = max(int(xlim[0]), 0)
allValuesInPlot = np.append(predictionCurve, solutionCurve)
allValuesInPlot = np.append(allValuesInPlot, shiftedContext[visibleFrom:int(xlim[1])])

# plt.ylim(min(allValuesInPlot)*1.1, max(allValuesInPlot)*1.1)

plt.grid()

# Calculating some fancy stats

In [None]:
# Average percent error calculation
# First we'll calculate the percentage error between all the predictions and the actual values
# Then we'll calculate the average of those percentage errors

# Average absolute error calculation
# First we'll calculate the absolute error between all the predictions and the actual values
# Then we'll calculate the average of those absolute errors

# Both lists hold one array of shape (1,) per predicted data point
avgPercentErrors = []
avgAbsoluteErrors = []

for predictedValue, actualValue in zip(plottedPredictions, plottedSolution):
    absoluteError = abs(predictedValue - actualValue)
    avgAbsoluteErrors.append(absoluteError)

    # A percentage error is undefined when the actual change is exactly 0;
    # dividing anyway would put inf/NaN into the average, so skip the point
    if np.all(actualValue != 0):
        percentError = absoluteError / abs(actualValue) # 1% = 0.01
        avgPercentErrors.append(percentError)

# Keep the list non-empty so the averaging step reports NaN instead of failing
if not avgPercentErrors:
    avgPercentErrors.append(np.full(1, np.nan))

In [None]:
# Average the collected errors; every list entry is a one-element array,
# so the mean is unpacked into a plain scalar with [0]
percentErrorTotal = sum(avgPercentErrors)
absoluteErrorTotal = sum(avgAbsoluteErrors)

avgPercentError = (percentErrorTotal / len(avgPercentErrors))[0]
avgAbsoluteError = (absoluteErrorTotal / len(avgAbsoluteErrors))[0]

In [None]:
# Report both averages as percentages rounded to two decimals (0.01 -> 1.0%)
percentErrorDisplay = round(avgPercentError*100, 2)
absoluteErrorDisplay = round(avgAbsoluteError*100, 2)

print(f"Average percent error: {percentErrorDisplay}%")
print(f"Average absolute error: {absoluteErrorDisplay}%")

# Some actually professional measures

In [None]:
# Run the trained model on the held-out context rows
predicitons = model.predict(X_test)

# Mean squared error between the actual and the predicted solution windows
mse = mean_squared_error(y_true=y_test, y_pred=predicitons)

# Report the result
print(f"Mean Squared Error: {mse}")