In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import talib
import pandas_ta as ta
import os

In [3]:

folders_path = "/projects/genomic-ml/da2343/ml_project_2/data"
data_file_paths = []

# loop through all folders in the path
for folder in os.listdir(folders_path):
    # check if the folder is a directory
    if os.path.isdir(os.path.join(folders_path, folder)) and folder != "EURUSD":
        current_folder_path = os.path.join(folders_path, folder)
        data_file_path = os.path.join(current_folder_path, os.listdir(current_folder_path)[0])
        data_file_paths.append(data_file_path)

In [4]:
def extract_df(path):
    real_df = pd.read_table(path)
    df = real_df.copy()
    # remove the following columns <TICKVOL>, <VOL> and <SPREAD>
    df = df.drop(['<SPREAD>', '<VOL>'], axis=1)
    df = df.rename(columns={'<DATE>': 'Date', 
                                    '<TIME>': 'Time', 
                                    '<OPEN>': 'Open', 
                                    '<HIGH>': 'High', 
                                    '<LOW>': 'Low', 
                                    '<CLOSE>': 'Close',
                                    '<TICKVOL>': 'Volume',
                                    })
    # combine the date and time columns
    df['Date_Time'] = df['Date'] + ' ' + df['Time']
    df = df.drop(['Date', 'Time'], axis=1)
    df['Time'] = pd.to_datetime(df['Date_Time'])
    df = df.drop(['Time'], axis=1)
    prices = df["Close"].values
    
    df["SMA_20"] = talib.SMA(prices, timeperiod=20)
    df["SMA_30"] = talib.SMA(prices, timeperiod=30)
    df["SMA_50"] = talib.SMA(prices, timeperiod=50)
    df["SMA_100"] = talib.SMA(prices, timeperiod=100)
    df["SMA_200"] = talib.SMA(prices, timeperiod=200)
    
    df["MACD"], df["MACD_Signal"], df["MACD_Hist"] = talib.MACD(df["Close"])
    df["MACD_Crossover"] = np.where(df["MACD"] > df["MACD_Signal"], 1, -1)
    df["MACD_Crossover_Change"] = df["MACD_Crossover"].diff()
    
    # RSI (Relative Strength Index)
    df["RSI"] = talib.RSI(prices)
    
    # MACD (Moving Average Convergence Divergence)
    df["MACD"], df["MACD_Signal"], df["MACD_Hist"] = talib.MACD(df["Close"])
    
    # ATR (Average True Range)
    df["ATR"] = talib.ATR(df["High"], df["Low"], df["Close"])
    
    # ADX (Average Directional Index)
    df["ADX"] = talib.ADX(df["High"], df["Low"], df["Close"])
    
    # Aroon Oscillator
    df["AROON_Oscillator"] = talib.AROONOSC(df["High"], df["Low"])
    
    # OBV (On-Balance Volume)
    df["OBV"] = talib.OBV(df["Close"], df["Volume"])
    
    # CCI (Commodity Channel Index)
    df["CCI"] = talib.CCI(df["High"], df["Low"], df["Close"])
    
    # Parabolic SAR
    df["PSAR"] = talib.SAR(df["High"], df["Low"])
    
    # Stochastic Oscillator
    df["STOCH_K"], df["STOCH_D"] = talib.STOCH(df["High"], df["Low"], df["Close"])
    
    # Williams %R
    df["WILLR"] = talib.WILLR(df["High"], df["Low"], df["Close"])
    
    # Bollinger Bands
    df["BBANDS_Upper"], df["BBANDS_Middle"], df["BBANDS_Lower"] = talib.BBANDS(df["Close"])
    
    # AD (Chaikin A/D Line)
    df["AD"] = talib.AD(df["High"], df["Low"], df["Close"], df["Volume"])
    
    # ADOSC (Chaikin A/D Oscillator)
    df["ADOSC"] = talib.ADOSC(df["High"], df["Low"], df["Close"], df["Volume"])
    
    # TODO: VWAP (Volume Weighted Average Price)
    # df["VWAP"] =  ta.vwap()
    
    # VOLUME RSI
    df["VOLUME_RSI"] = talib.RSI(df["Volume"])
    
    # MFI (Money Flow Index)
    df["MFI"] = talib.MFI(df["High"], df["Low"], df["Close"], df["Volume"])
    
    df = df.dropna()
    return df

eur_usd_h1_path = '/projects/genomic-ml/da2343/ml_project_2/data/EURUSD/EURUSD_H1_200702210000_202304242100.tsv'

df = extract_df(eur_usd_h1_path)
# save the df to a csv file
df.to_csv(f'/projects/genomic-ml/da2343/ml_project_2/data/EURUSD/EURUSD_H1_200702210000_202304242100_Update.csv', index=True)

In [5]:

for data_path in data_file_paths:
    df = extract_df(data_path)
    # remove extension from data_path
    data_path_raw = data_path.split('.')[0]
    # data_path_raw = data_path.split('/')[-1]
    new_path = data_path_raw + f'_Update.csv'
    # save the df to a csv file
    df.to_csv(new_path, index=True)

KeyError: "['<SPREAD>', '<VOL>'] not found in axis"

In [6]:
data_path

'/projects/genomic-ml/da2343/ml_project_2/data/GBPAUD/GBPAUD_H1_200708211800_202307282300_Update.csv'