In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import talib

In [19]:
def extract_df(path, sma_period):
    """Load a tab-separated OHLC export and append technical-indicator columns.

    Parameters
    ----------
    path : str or path-like
        File read with ``pd.read_table``. The code uses the columns
        ``<DATE>``, ``<TIME>``, ``<OPEN>``, ``<HIGH>``, ``<LOW>``, ``<CLOSE>``,
        ``<TICKVOL>``, ``<VOL>`` and ``<SPREAD>``; ``<DATE>`` and ``<TIME>``
        are concatenated as text, so they are presumably read as strings.
    sma_period : int
        Window length forwarded to ``talib.SMA`` as ``timeperiod``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Open``, ``High``, ``Low``, ``Close``, ``Volume`` and
        ``Date_Time`` followed by one or more columns per indicator. Every
        row that contains a NaN in any column is dropped, and the original
        row labels are kept (the index is not reset).

    Raises
    ------
    KeyError
        If ``<SPREAD>`` or ``<VOL>`` is missing (raised by ``DataFrame.drop``)
        or if a column needed by the later steps is missing.
    """
    df = pd.read_table(path)

    # <VOL> and <SPREAD> are discarded; <TICKVOL> is kept and becomes "Volume".
    df = df.drop(['<SPREAD>', '<VOL>'], axis=1)
    df = df.rename(columns={'<DATE>': 'Date',
                            '<TIME>': 'Time',
                            '<OPEN>': 'Open',
                            '<HIGH>': 'High',
                            '<LOW>': 'Low',
                            '<CLOSE>': 'Close',
                            '<TICKVOL>': 'Volume',
                            })

    # Merge the date and time columns into a single text column. It is left
    # as text on purpose: the value is never converted to a datetime here.
    df['Date_Time'] = df['Date'] + ' ' + df['Time']
    df = df.drop(['Date', 'Time'], axis=1)

    # Shared inputs for the indicator calls below.
    high, low, close, volume = df["High"], df["Low"], df["Close"], df["Volume"]

    # SMA (Simple Moving Average) over the requested window
    df["SMA"] = talib.SMA(close, timeperiod=sma_period)

    # MACD (Moving Average Convergence Divergence), computed once
    df["MACD"], df["MACD_Signal"], df["MACD_Hist"] = talib.MACD(close)
    # 1 where MACD is above its signal line, otherwise -1. Rows where either
    # value is NaN compare False and therefore also get -1.
    df["MACD_Crossover"] = np.where(df["MACD"] > df["MACD_Signal"], 1, -1)
    # Row-to-row difference of the flag: +2 / -2 on a flip, 0 otherwise,
    # NaN on the first row.
    df["MACD_Crossover_Change"] = df["MACD_Crossover"].diff()

    # RSI (Relative Strength Index)
    df["RSI"] = talib.RSI(close)

    # ATR (Average True Range)
    df["ATR"] = talib.ATR(high, low, close)

    # ADX (Average Directional Index)
    df["ADX"] = talib.ADX(high, low, close)

    # Aroon Oscillator
    df["AROON_Oscillator"] = talib.AROONOSC(high, low)

    # OBV (On-Balance Volume)
    df["OBV"] = talib.OBV(close, volume)

    # CCI (Commodity Channel Index)
    df["CCI"] = talib.CCI(high, low, close)

    # Parabolic SAR
    df["PSAR"] = talib.SAR(high, low)

    # Stochastic Oscillator
    df["STOCH_K"], df["STOCH_D"] = talib.STOCH(high, low, close)

    # Williams %R
    df["WILLR"] = talib.WILLR(high, low, close)

    # Bollinger Bands
    df["BBANDS_Upper"], df["BBANDS_Middle"], df["BBANDS_Lower"] = talib.BBANDS(close)

    # AD (Chaikin A/D Line)
    df["AD"] = talib.AD(high, low, close, volume)

    # ADOSC (Chaikin A/D Oscillator)
    df["ADOSC"] = talib.ADOSC(high, low, close, volume)

    # Drop every row that still has a NaN in any column.
    df = df.dropna()
    return df

# Input TSV (the file name suggests hourly EUR/USD bars).
eur_usd_h1_path = '/projects/genomic-ml/da2343/ml_project_2/data/EURUSD_H1_200702210000_202304242100.tsv'

# SMA window lengths; one CSV is written per entry.
sma_list = [30, 50, 100, 200]
for sma in sma_list:
    # Rebuild the full indicator frame for this SMA window.
    df = extract_df(eur_usd_h1_path, sma)
    # Persist it, row index included, under a name that carries the window.
    output_csv_path = f'/projects/genomic-ml/da2343/ml_project_2/data/EURUSD_H1_2007_2023_SMA_{sma}.csv'
    df.to_csv(output_csv_path, index=True)

In [20]:
# Display the frame left in `df` by the last iteration of the export loop
# (the final entry of sma_list, i.e. the SMA-200 variant).
df

Unnamed: 0,Open,High,Low,Close,Volume,Date_Time,SMA,MACD,MACD_Signal,MACD_Hist,...,CCI,PSAR,STOCH_K,STOCH_D,WILLR,BBANDS_Upper,BBANDS_Middle,BBANDS_Lower,AD,ADOSC
199,1.31170,1.31190,1.30840,1.30950,583,2007.03.05 12:00:00,1.316883,-0.001623,-0.001103,-0.000520,...,-167.527853,1.317897,13.212454,14.167962,-91.269841,1.315508,1.312280,1.309052,-2.608051e+03,-509.905307
200,1.30960,1.31040,1.30810,1.30850,364,2007.03.05 13:00:00,1.316854,-0.001905,-0.001263,-0.000642,...,-179.620035,1.317137,7.757055,10.759067,-96.899225,1.314926,1.311160,1.307394,-2.845442e+03,-562.879596
201,1.30860,1.30950,1.30740,1.30830,536,2007.03.05 14:00:00,1.316823,-0.002121,-0.001435,-0.000686,...,-168.147087,1.316234,11.731414,10.900308,-92.436975,1.314176,1.310260,1.306344,-2.922013e+03,-557.743660
202,1.30860,1.30990,1.30700,1.30950,550,2007.03.05 15:00:00,1.316799,-0.002170,-0.001582,-0.000588,...,-120.851157,1.315174,19.720832,13.069767,-76.190476,1.311821,1.309480,1.307139,-2.523737e+03,-378.214488
203,1.30960,1.31150,1.30960,1.30970,593,2007.03.05 16:00:00,1.316779,-0.002167,-0.001699,-0.000468,...,-62.080852,1.314029,36.210245,22.554164,-73.786408,1.310259,1.309100,1.307941,-3.054316e+03,-439.208175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100170,1.10237,1.10370,1.10207,1.10294,4538,2023.04.24 17:00:00,1.098075,0.001237,0.000860,0.000377,...,133.788223,1.097986,84.062477,82.909956,-10.689170,1.103770,1.101910,1.100050,-3.906272e+06,1882.860989
100171,1.10294,1.10323,1.10206,1.10260,3226,2023.04.24 18:00:00,1.098125,0.001299,0.000948,0.000351,...,103.659687,1.098557,75.925112,79.894754,-15.471167,1.104009,1.102100,1.100191,-3.906520e+06,1669.983383
100172,1.10260,1.10342,1.10249,1.10340,1878,2023.04.24 19:00:00,1.098180,0.001397,0.001037,0.000359,...,105.947382,1.099072,80.981586,80.323058,-4.219409,1.103525,1.102732,1.101939,-3.904723e+06,2002.924994
100173,1.10341,1.10484,1.10336,1.10441,1730,2023.04.24 20:00:00,1.098238,0.001538,0.001138,0.000400,...,129.290207,1.099534,83.616879,80.174526,-5.212121,1.104581,1.103150,1.101719,-3.903998e+06,2187.640112


In [6]:
# Keep only the rows from 2020. Date_Time is text that begins with the year
# ("YYYY.MM.DD HH:MM:SS" in the displayed output), so the match is anchored
# to the start of the string instead of searching the whole timestamp.
only_2020_df = df[df['Date_Time'].str.startswith('2020.')]
only_2020_df

Unnamed: 0,Open,High,Low,Close,Date_Time,SMA,MACD,MACD_Signal,MACD_Hist,MACD_Crossover,MACD_Crossover_Change
79556,1.12132,1.12143,1.12008,1.12011,2020.01.02 06:00:00,1.113153,0.000446,0.000719,-0.000273,-1,0.0
79557,1.12011,1.12043,1.12008,1.12043,2020.01.02 07:00:00,1.113188,0.000308,0.000637,-0.000329,-1,0.0
79558,1.12043,1.12075,1.12037,1.12066,2020.01.02 08:00:00,1.113222,0.000215,0.000553,-0.000337,-1,0.0
79559,1.12074,1.12102,1.12043,1.12098,2020.01.02 09:00:00,1.113261,0.000165,0.000475,-0.000310,-1,0.0
79560,1.12098,1.12136,1.12052,1.12118,2020.01.02 10:00:00,1.113303,0.000140,0.000408,-0.000268,-1,0.0
...,...,...,...,...,...,...,...,...,...,...,...
85777,1.22385,1.22404,1.22209,1.22300,2020.12.31 18:00:00,1.222752,-0.000779,-0.000067,-0.000712,-1,0.0
85778,1.22300,1.22347,1.22218,1.22253,2020.12.31 19:00:00,1.222738,-0.001069,-0.000268,-0.000801,-1,0.0
85779,1.22255,1.22280,1.22151,1.22168,2020.12.31 20:00:00,1.222723,-0.001351,-0.000484,-0.000867,-1,0.0
85780,1.22150,1.22189,1.22127,1.22154,2020.12.31 22:00:00,1.222702,-0.001568,-0.000701,-0.000867,-1,0.0
