# Discovery

## Extract

In [3]:
import os
from typing import Optional
from datetime import datetime, timedelta
import pandas as pd
import yfinance as yf

class Extract():
    def __init__(self, ticker_list, years_bf: Optional[int] = 0, months_bf: Optional[int] = 0, days_bf: Optional[int] = 0):
        self.ticker_list = ticker_list
        self.end = datetime.now()
        self.start = datetime(self.end.year - years_bf, self.end.month - months_bf, self.end.day - days_bf)
        self.load_path = 'data/'

    def _read(self):
        self.data = {}
        files = os.listdir(self.load_path)
        self.stocks_data = [i[:-26] for i in files]

        for i in list(set(self.stocks_data) & set(self.ticker_list)):
            date1 = files[self.stocks_data.index(i)][-14:-4]
            start_date_file = datetime(int(date1[:4]), int(date1[5:7]), int(date1[8:10]))
            date2 = files[self.stocks_data.index(i)][-25:-15]
            end_date_file = datetime(int(date2[:4]), int(date2[5:7]), int(date2[8:10]))

            start_date_inter = max(start_date_file, self.start)
            end_date_inter = min(end_date_file, self.end)

            if start_date_inter <= end_date_inter:
                if self.start > start_date_file and self.end > end_date_inter:
                    serie = pd.concat([pd.read_csv(os.path.join(self.load_path, files[self.stocks_data.index(i)])).set_index('Date')['Adj Close'],
                                       yf.download(i, end_date_inter + timedelta(days=1), self.end)['Adj Close']])
                    serie.index = pd.to_datetime(serie.index)
                    self.data[i] = serie.loc[self.start:self.end]
                    self.ticker_list.remove(i)

        for file in [i for i in files if '.ipynb_checkpoints' not in i]:
            os.remove(os.path.join(self.load_path, file))

    def _load(self):
        for stock in self.data.keys():
            self.data[stock].to_csv(os.path.join(self.load_path, f"{stock}_{self.data[stock].index.max().date()}_{self.data[stock].index.min().date()}.csv"))

    def run(self):
        self._read()
        for stock in self.ticker_list:
            self.data[stock] = yf.download(stock, self.start, self.end)['Adj Close']
        self._load()
        return self.data


In [4]:
tech_list = ['AAPL', 'GOOG', 'MSFT', 'AMZN']

extract = Extract(tech_list, years_bf=12)

In [5]:
stocks_hist = extract.run()

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


## Transform

In [6]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

class Transform():
  def __init__(self, data, test_size, X_days):
    self.data = data
    self.test_size = test_size
    self.X_days = X_days

  def _split_train_test(self):
    self.train={}
    self.test={}

    for ticker in self.data.keys():
      dataset = self.data[ticker].values
      training_data_len = int(np.ceil( len(dataset) * (1-self.test_size)))
      self.train[ticker] = dataset[:int(training_data_len)]
      self.test[ticker] = dataset[int(training_data_len):]

  def _scaler(self):
    self.scaler={}

    for ticker in self.data.keys():
      self.scaler[ticker] = MinMaxScaler(feature_range=(0,1))
      self.train[ticker] = self.scaler[ticker].fit_transform(self.train[ticker].reshape(-1, 1))
      self.test[ticker] = self.scaler[ticker].transform(self.test[ticker].reshape(-1, 1))

  def _split_X_y(self):

    self.X_train={}
    self.y_train={}
    self.X_test={}
    self.y_test={}

    for ticker in self.data.keys():
      self.X_train[ticker]=[]
      self.y_train[ticker]=[]

      for i in range(self.X_days, len(list(self.train.values())[0])):

        self.X_train[ticker].append(self.train[ticker][i-self.X_days:i, 0])
        self.y_train[ticker].append(self.train[ticker][i, 0])

      self.X_train[ticker], self.y_train[ticker] = np.array(self.X_train[ticker]), np.array(self.y_train[ticker])
      self.X_train[ticker] = np.reshape(self.X_train[ticker], (self.X_train[ticker].shape[0], self.X_train[ticker].shape[1], 1))

      self.X_test[ticker]=[]
      self.y_test[ticker]=[]

      for i in range(self.X_days, len(list(self.test.values())[0])):

        self.X_test[ticker].append(self.test[ticker][i-self.X_days:i, 0])
        self.y_test[ticker].append(self.test[ticker][i, 0])

      self.X_test[ticker], self.y_test[ticker] = np.array(self.X_test[ticker]), np.array(self.y_test[ticker])
      self.X_test[ticker] = np.reshape(self.X_test[ticker], (self.X_test[ticker].shape[0], self.X_test[ticker].shape[1], 1))

  def run(self):
    self._split_train_test()
    self._scaler()
    self._split_X_y()

    return self.X_train, self.y_train, self.X_test, self.y_test, self.scaler

In [7]:
transform = Transform(stocks_hist, test_size=.05, X_days=60)

In [8]:
X_train, y_train, X_test, y_test, scaler = transform.run()

## Model

In [55]:
from keras.models import Sequential
from keras.layers import Dense, LSTM

class Model():
  def __init__(self, X_train, y_train, X_test, y_test, scaler):
    self.X_train, self.y_train, self.X_test, self.y_test = X_train, y_train, X_test, y_test
    self.scaler = scaler

  def fit(self):
    self.lstm={}
    for ticker in self.X_train.keys():
      self.lstm[ticker] = Sequential()
      self.lstm[ticker].add(LSTM(128, return_sequences=True, input_shape= (self.X_train[ticker].shape[1], 1)))
      self.lstm[ticker].add(LSTM(64, return_sequences=False))
      self.lstm[ticker].add(Dense(25))
      self.lstm[ticker].add(Dense(1))

      # Compile the model
      self.lstm[ticker].compile(optimizer='adam', loss='mean_squared_error')

      # Train the model
      self.lstm[ticker].fit(self.X_train[ticker], self.y_train[ticker], batch_size=1, epochs=1)

  def _risk(self):
    self.risk={}
    for ticker in self.X_test.keys():
      self.risk[ticker]=np.std((np.diff(self.X_test[ticker][:,:,0])/self.X_test[ticker][:,0:-1,0]), axis=1).mean()

  def predict(self):
    self.predictions={}
    self.label={}

    for ticker in self.X_test.keys():
      self.predictions[ticker] = self.lstm[ticker].predict(self.X_test[ticker])
      self.predictions[ticker] = self.scaler[ticker].inverse_transform(self.predictions[ticker])

    for ticker in self.y_test.keys():
      self.label[ticker] = self.scaler[ticker].inverse_transform(self.y_test[ticker].reshape(-1, 1))

    self._risk()

    return self.label, self.predictions, self.risk

In [56]:
lstm = Model(X_train, y_train, X_test, y_test, scaler)

In [57]:
lstm.fit()



In [58]:
 label, predictions, risk = lstm.predict()









## Metrics

In [96]:
from sklearn.metrics import r2_score

r2={}

for ticker in predictions.keys():
  r2[ticker] = r2_score(label[ticker], predictions[ticker])

In [106]:
for ticker in X_test.keys():
  data_s = scaler[ticker].transform(stocks_hist[ticker].tail(60).values.reshape(-1, 1))
  pred_s = lstm.lstm[ticker].predict(data_s.reshape(1,-1, 1))
  pred = scaler[ticker].inverse_transform(pred_s.reshape(-1, 1))
  print(ticker)
  print("Today: " + str(round(stocks_hist[ticker].iloc[-1])))
  print("Tomorrow: " + str(round(pred[0][0])))
  print("Price change: " + str(round(((pred[0][0]-stocks_hist[ticker].iloc[-1])/stocks_hist[ticker].iloc[-1])*100, 2)) + "%")
  print("Risk: " + str(round(risk[ticker], 4)))
  print("R2: " + str(round(r2[ticker], 4)))



AAPL
Today: 173
Tomorrow: 179
Price change: 3.62%
Risk: 0.0121
R2: -0.0142
GOOG
Today: 142
Tomorrow: 148
Price change: 3.99%
Risk: 0.0193
R2: -0.5777
MSFT
Today: 416
Tomorrow: 371
Price change: -10.8%
Risk: 0.0129
R2: -4.1765
AMZN
Today: 174
Tomorrow: 187
Price change: 7.3%
Risk: 0.0193
R2: 0.4017
