In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
from functools import reduce

pd.set_option('display.max_rows', 10000)

In [2]:
def combine_indicators(currency, pair):
    
    cpi = pd.read_csv("../data/processed/cpi/{}_cpi_processed.csv".format(currency))
    gdp = pd.read_csv("../data/processed/gdp/{}_gdp_processed.csv".format(currency))
    ir = pd.read_csv("../data/processed/interest_rate/{}_ir_processed.csv".format(currency))
    ppi = pd.read_csv("../data/processed/ppi/{}_ppi_processed.csv".format(currency))
    ue = pd.read_csv("../data/processed/unemployment_rate/{}_ue_processed.csv".format(currency))
    news = pd.read_csv("../data/processed/news/news_sentiment.csv")
    news = news[{"Time", currency.upper()}]
    news = news.rename(columns={currency.upper(): "News Sentiment"})
    tweets = pd.read_csv("../data/processed/tweets/tweets_sentiment.csv")
    tweets = tweets[{"Time", currency.upper()}]
    tweets = tweets.rename(columns={currency.upper(): "Twitter Sentiment"})
    
    combined_df = merge_dataframe([cpi, gdp, ir, ppi, ue, news, tweets])
    
    if currency.upper() in pair:
        exchange_rate = pd.read_csv("../data/processed/exchange_rate/{}_exchange.csv".format(pair))
        combined_df = merge_dataframe([combined_df, exchange_rate])

    combined_df = combined_df[combined_df["RSI"].notnull()]
    
    return combined_df
    
def merge_dataframe(data_list):
    merged_data = reduce(lambda left, right : pd.merge(left, right, how="outer", on="Time"), data_list)     
    merged_data.sort_values(by=["Time"], inplace=True)
    merged_data = merged_data.reset_index(drop=True)
    return merged_data

def currency_pair(buy, sell):
    pair = (buy + sell).upper()
    buy_df = combine_indicators(buy, pair)
    buy = buy.upper()
    buy_df = buy_df.rename(columns={
            "CPI": buy + "_CPI", 
            "GDP": buy + "_GDP", 
            "Interest Rate": buy + " Interest Rate",
            "PPI": buy + "_PPI",
            "Unemployment Rate": buy + " Unemployment Rate",
            "News Sentiment": buy + " News Sentiment",
            "Twitter Sentiment": buy + " Twitter Sentiment",
        })
    buy_df = buy_df.reset_index(drop=True)
    sell_df = combine_indicators(sell, pair)
    sell_df = sell_df[{"Time", "CPI", "GDP", "Interest Rate", "PPI", "Unemployment Rate", "News Sentiment", "Twitter Sentiment"}]
    sell = sell.upper()
    sell_df = sell_df.rename(columns={
            "CPI": sell + "_CPI", 
            "GDP": sell + "_GDP", 
            "Interest Rate": sell + " Interest Rate",
            "PPI": sell + "_PPI",
            "Unemployment Rate": sell + " Unemployment Rate",
            "News Sentiment": sell + " News Sentiment",
            "Twitter Sentiment": sell + " Twitter Sentiment",
        })
    sell_df.reset_index(drop=True)
    pair_df = buy_df.merge(sell_df, how="inner", on=["Time"])
    pair_df['Time'] = pd.to_datetime(pair_df['Time'], utc=True)
    cool = pd.read_csv("../data/external/exchange_rates/EURUSD_M1.csv")
    cool = cool.rename(columns={"DateTime": "Time", "Close": "Real Close"})
    cool = cool[{'Time', 'Real Close'}]
    cool = pd.DataFrame(cool, columns=['Time', 'Real Close'])
    cool = convert_date(cool)
    cool['Time'] = pd.to_datetime(cool['Time'], utc=True)
    pair_df = pair_df.merge(cool, how="left", on=["Time"])
    pair_df['Real Close'].interpolate(method='linear', inplace=True)
    pair_df = configure_time(15, pair_df)
    pair_df.to_csv("../data/processed/{}_processed.csv".format(pair), index=False)
    return pair_df

def configure_time(minutes, dataframe):
    dataframe["Volume"] = dataframe["Volume"].rolling(minutes, min_periods=1).sum()
    dataframe["High"] = dataframe["High"].rolling(minutes, min_periods=1).max()
    dataframe["Low"] = dataframe["Low"].rolling(minutes, min_periods=1).min()
    
    time_frame = pd.date_range(start="2018-01-01 22:00:00", freq="{}T".format(minutes), end="2020-12-31 21:59:00")
    time_frame = pd.DataFrame(time_frame, columns=["Time"])
    time_frame["Time"] = time_frame["Time"].dt.strftime("%Y-%m-%d %H:%M:%S")
    time_frame['Time'] = pd.to_datetime(time_frame['Time'], utc=True)
    
    configured_df = time_frame.merge(dataframe, how="inner", on="Time")
    configured_df.at[0, "Volume"] = configured_df.at[1, "Volume"]
    configured_df["Open"] = configured_df["Open"].shift(1)
    configured_df.at[0, "Open"] = configured_df.at[1, "Open"]
    
    return configured_df

def convert_date(exchange):
    exchange["Time"] = pd.to_datetime(exchange["Time"], format="%Y-%m-%d %H:%M:%S")
    return exchange

In [3]:
pair = currency_pair("aud", "cad")
pair

Unnamed: 0,Time,AUD_CPI,AUD_GDP,AUD Interest Rate,AUD_PPI,AUD Unemployment Rate,AUD News Sentiment,AUD Twitter Sentiment,Volume,High,...,RSI,A/D Index,CAD_GDP,CAD Twitter Sentiment,CAD_CPI,CAD News Sentiment,CAD Interest Rate,CAD Unemployment Rate,CAD_PPI,Real Close
0,2018-01-01 22:15:00+00:00,0.6,0.277019,2.75,0.009225,5.501085,0.000000,0.000000,352.0,0.000654,...,31.590704,-22.588235,1.086920,0.0000,1.698842,0.0,2.198182,5.8,0.015110,1.20062
1,2018-01-01 22:30:00+00:00,0.6,0.277019,2.75,0.009225,5.501085,0.000000,0.000000,352.0,0.000337,...,44.688610,2.333333,1.086920,0.0000,1.698842,0.0,2.198182,5.8,0.015110,1.20076
2,2018-01-01 22:45:00+00:00,0.6,0.277019,2.75,0.009225,5.501085,0.000000,0.000000,332.0,0.000215,...,58.123929,13.333333,1.086920,0.0000,1.698842,0.0,2.198182,5.8,0.015110,1.20139
3,2018-01-01 23:00:00+00:00,0.6,0.277019,2.75,0.009225,5.501085,0.000000,0.000000,840.0,0.000459,...,73.020522,-53.260274,1.086920,0.0000,1.698842,0.0,2.198182,5.8,0.015110,1.20120
4,2018-01-01 23:15:00+00:00,0.6,0.277019,2.75,0.009225,5.501085,0.000000,0.000000,2092.0,0.000388,...,52.404668,-89.473684,1.086920,0.0000,1.698842,0.0,2.198182,5.8,0.015110,1.20147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74686,2020-12-31 20:45:00+00:00,1.6,3.328124,0.98,-0.002566,6.825236,-0.005367,0.133967,840.0,0.000204,...,58.750751,-112.000000,8.874858,0.1589,0.733138,0.0,0.730952,8.6,0.023899,1.22156
74687,2020-12-31 21:00:00+00:00,1.6,3.328124,0.98,-0.002566,6.825236,-0.005367,0.133967,1428.0,0.000163,...,48.583656,36.242424,8.874858,0.1589,0.733138,0.0,0.730952,8.6,0.023899,1.22151
74688,2020-12-31 21:15:00+00:00,1.6,3.328124,0.98,-0.002566,6.825236,-0.005367,0.133967,480.0,0.000194,...,44.748147,21.000000,8.874858,0.1589,0.733138,0.0,0.730952,8.6,0.023899,1.22242
74689,2020-12-31 21:30:00+00:00,1.6,3.328124,0.98,-0.002566,6.825236,-0.005367,0.133967,560.0,0.000183,...,36.511121,29.333333,8.874858,0.1589,0.733138,0.0,0.730952,8.6,0.023899,1.22210
