# Part 1. Install Packages

In [1]:
# Optional Google Colab setup (currently disabled): mount Google Drive and
# change into the notebooks folder. Uncomment only when running on Colab.
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/Colab Notebooks

In [2]:
# Change the working directory to the project's `src` folder. The
# project-local imports in the following cells (`preprocessor`, `config`) are
# presumably resolved relative to it — TODO confirm.
# NOTE(review): this is a machine-specific absolute Windows path; adjust it to
# your own checkout before running.
%cd D:\Learning\University\UIT\AI\Projects\AI4Finance\AI-Project\src

d:\Learning\University\UIT\AI\Projects\AI4Finance\AI-Project\src


In [3]:
import datetime
import itertools
import os

import numpy as np
import pandas as pd
import yfinance as yf

from preprocessor.yahoodownloader import YahooDownloader
from preprocessor.preprocessors import FeatureEngineer, data_split
from config import config_tickers
from config.config import INDICATORS

In [4]:
from config.config import (
    DATA_SAVE_DIR,
    TRAINED_MODEL_DIR,
    TENSORBOARD_LOG_DIR,
    RESULTS_DIR,
    INDICATORS,
    TRAIN_START_DATE,
    TRAIN_END_DATE,
    TEST_START_DATE,
    TEST_END_DATE,
    TRADE_START_DATE,
    TRADE_END_DATE,
    check_and_make_directories
)
from config.config_tickers import DOW_30_TICKER

# Hand the dataset output folder to the project's directory helper
# (presumably creates it when missing — verify against config.config).
directories_to_create = [DATA_SAVE_DIR]
check_and_make_directories(directories_to_create)

In [5]:
# Show the configured training and trading date windows, one per line.
print(
    f"Train date: {TRAIN_START_DATE} - {TRAIN_END_DATE}",
    f"Trade date: {TRADE_START_DATE} - {TRADE_END_DATE}",
    sep="\n",
)

Train date: 2015-01-01 - 2020-01-01
Trade date: 2022-01-01 - 2024-06-01


# Part 2. Fetch data
Yahoo Finance is a website that provides stock data, financial news, financial reports, etc. All the data provided by Yahoo Finance is free.
* We use the **YahooDownloader** class (taken from FinRL) to fetch data from the Yahoo Finance API.
* Call Limit: Using the Public API (without authentication), you are limited to 2,000 requests per hour per IP (or up to a total of 48,000 requests a day).


In [6]:
# Fetch raw data for every ticker in DOW_30_TICKER, spanning from the start
# of the training window to the end of the trading window.
downloader = YahooDownloader(
    start_date=TRAIN_START_DATE,
    end_date=TRADE_END_DATE,
    ticker_list=config_tickers.DOW_30_TICKER,
)
df_raw = downloader.fetch_data()

-----------DOWNLOADING-----------
From = 2015-01-01 to 2024-06-01.


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

Shape of DataFrame:  (70011, 8)


# Part 3. Processing Data
Data preprocessing is a crucial step for training a high quality machine learning model. We need to check for missing data and do feature engineering in order to convert the data into a model-ready state.
* Add technical indicators. In practical trading, various information needs to be taken into account, for example the historical stock prices, currently held shares, technical indicators, etc. In this notebook, we use four technical indicators: MACD, RSI, CCI, and DX (`macd`, `rsi_30`, `cci_30`, `dx_30`).
* Add turbulence index. Risk aversion reflects whether an investor will choose to preserve capital. It also influences one's trading strategy when facing different levels of market volatility. To control the risk in a worst-case scenario, such as the financial crisis of 2007–2008, FinRL employs the financial turbulence index, which measures extreme asset price fluctuation.

In [7]:
# Technical indicators passed to the feature engineering step.
# NOTE: this rebinds the `INDICATORS` name imported from `config.config`
# earlier in the notebook, so the imported value is no longer used.
INDICATORS = ["macd", "rsi_30", "cci_30", "dx_30"]

In [8]:
# Feature engineering configuration: technical indicators from INDICATORS,
# plus VIX and the turbulence index; user-defined features are switched off.
feature_engineer_options = dict(
    use_technical_indicator=True,
    tech_indicator_list=INDICATORS,
    use_vix=True,
    use_turbulence=True,
    user_defined_feature=False,
)
fe = FeatureEngineer(**feature_engineer_options)

# Apply the configured feature engineering to the raw downloaded data.
processed = fe.preprocess_data(df_raw)

Successfully added technical indicators
-----------DOWNLOADING-----------
From = 2015-01-02 to 2024-05-31.


[*********************100%%**********************]  1 of 1 completed


Shape of DataFrame:  (2368, 8)
Successfully added vix
Successfully added turbulence index


In [9]:
# Run the project's `full_process` step on the engineered data, then preview
# the first five rows as the cell output.
processed_full = fe.full_process(processed)
processed_full.head(n=5)

Unnamed: 0,date,tic,open,high,low,close,volume,day,macd,rsi_30,cci_30,dx_30,vix,turbulence
0,2015-01-02,AAPL,27.8475,27.860001,26.8375,24.402172,212818400.0,4.0,0.0,0.0,-66.666667,100.0,17.790001,0.0
1,2015-01-02,AMGN,160.160004,162.589996,158.600006,122.898552,2605400.0,4.0,0.0,0.0,-66.666667,100.0,17.790001,0.0
2,2015-01-02,AXP,93.169998,93.940002,92.139999,80.772018,2437500.0,4.0,0.0,0.0,-66.666667,100.0,17.790001,0.0
3,2015-01-02,BA,131.070007,131.839996,129.089996,113.657211,4294200.0,4.0,0.0,0.0,-66.666667,100.0,17.790001,0.0
4,2015-01-02,CAT,91.769997,92.370003,90.660004,70.907639,3767900.0,4.0,0.0,0.0,-66.666667,100.0,17.790001,0.0


# Part 4. Save the Data

In [10]:
# Report the number of rows, then persist the processed dataset as CSV.
print(len(processed_full))

# Build the output path with os.path.join so the file lands inside
# DATA_SAVE_DIR whether or not that setting ends with a path separator.
# Plain string concatenation would otherwise produce a sibling file such as
# "datasetsprocessed_full.csv" instead of "datasets/processed_full.csv".
# NOTE(review): any code that reads this file back via
# `DATA_SAVE_DIR + 'processed_full.csv'` should build its path the same way.
processed_full_path = os.path.join(DATA_SAVE_DIR, 'processed_full.csv')
processed_full.to_csv(processed_full_path)

68672
