In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
from functools import reduce

pd.set_option('display.max_rows', 10000)

In [2]:
def combine_indicators(currency, pair):
    
    cpi = pd.read_csv("../data/processed/cpi/{}_cpi_processed.csv".format(currency))
    gdp = pd.read_csv("../data/processed/gdp/{}_gdp_processed.csv".format(currency))
    ir = pd.read_csv("../data/processed/interest_rate/{}_ir_processed.csv".format(currency))
    ppi = pd.read_csv("../data/processed/ppi/{}_ppi_processed.csv".format(currency))
    ue = pd.read_csv("../data/processed/unemployment_rate/{}_ue_processed.csv".format(currency))
    news = pd.read_csv("../data/processed/news/news_sentiment.csv")
    news = news[{"Time", currency.upper()}]
    news = news.rename(columns={currency.upper(): "News Sentiment"})
    tweets = pd.read_csv("../data/processed/tweets/tweets_sentiment.csv")
    tweets = tweets[{"Time", currency.upper()}]
    tweets = tweets.rename(columns={currency.upper(): "Twitter Sentiment"})
    
    combined_df = merge_dataframe([cpi, gdp, ir, ppi, ue, news, tweets])
    
    if currency.upper() in pair:
        exchange_rate = pd.read_csv("../data/processed/exchange_rate/{}_exchange.csv".format(pair))
        combined_df = merge_dataframe([combined_df, exchange_rate])

    combined_df = combined_df[combined_df["RSI"].notnull()]
    
    return combined_df
    
def merge_dataframe(data_list):
    merged_data = reduce(lambda left, right : pd.merge(left, right, how="outer", on="Time"), data_list)
    print(merged_data)
    merged_data.sort_values(by=["Time"], inplace=True)
    merged_data = merged_data.reset_index(drop=True)
    return merged_data

def currency_pair(buy, sell):
    pair = (buy + sell).upper()
    buy_df = combine_indicators(buy, pair)
    buy = buy.upper()
    buy_df = buy_df.rename(columns={
            "CPI": buy + "_CPI", 
            "GDP": buy + "_GDP", 
            "Interest Rate": buy + " Interest Rate",
            "PPI": buy + "_PPI",
            "Unemployment Rate": buy + " Unemployment Rate",
            "News Sentiment": buy + " News Sentiment",
            "Twitter Sentiment": buy + " Twitter Sentiment",
        })
    buy_df = buy_df.reset_index(drop=True)
    sell_df = combine_indicators(sell, pair)
    sell_df = sell_df[{"Time", "CPI", "GDP", "Interest Rate", "PPI", "Unemployment Rate", "News Sentiment", "Twitter Sentiment"}]
    sell = sell.upper()
    sell_df = sell_df.rename(columns={
            "CPI": sell + "_CPI", 
            "GDP": sell + "_GDP", 
            "Interest Rate": sell + " Interest Rate",
            "PPI": sell + "_PPI",
            "Unemployment Rate": sell + " Unemployment Rate",
            "News Sentiment": sell + " News Sentiment",
            "Twitter Sentiment": sell + " Twitter Sentiment",
        })
    sell_df.reset_index(drop=True)
    pair_df = buy_df.merge(sell_df, how="inner", on=["Time"])
    pair_df['Time'] = pd.to_datetime(pair_df['Time'], utc=True)
    #cool = pd.read_csv("../data/external/exchange_rates/{}_M1.csv".format(pair))
    #cool = cool.rename(columns={"DateTime": "Time", "Close": "Real Close"})
    #cool = cool[{'Time', 'Real Close'}]
    #cool = pd.DataFrame(cool, columns=['Time', 'Real Close'])
    #cool = convert_date(cool)
    #cool['Time'] = pd.to_datetime(cool['Time'], utc=True)
    #pair_df = pair_df.merge(cool, how="left", on=["Time"])
    #pair_df['Real Close'].interpolate(method='linear', inplace=True)
    print(pair_df['Real Close'].isna().sum())
    pair_df = configure_time(15, pair_df)
    print(pair_df['Real Close'].isna().sum())
    pair_df.to_csv("../data/processed/{}_processed.csv".format(pair), index=False)
    return pair_df

def configure_time(minutes, dataframe):
    #dataframe["Volume"] = dataframe["Volume"].rolling(minutes, min_periods=1).sum()
    #dataframe["High"] = dataframe["High"].rolling(minutes, min_periods=1).max()
    #dataframe["Low"] = dataframe["Low"].rolling(minutes, min_periods=1).min()
    
    time_frame = pd.date_range(start="2018-01-01 22:00:00", freq="{}T".format(minutes), end="2020-12-31 21:59:00")
    time_frame = pd.DataFrame(time_frame, columns=["Time"])
    time_frame["Time"] = time_frame["Time"].dt.strftime("%Y-%m-%d %H:%M:%S")
    time_frame['Time'] = pd.to_datetime(time_frame['Time'], utc=True)
    
    configured_df = time_frame.merge(dataframe, how="inner", on="Time")
    #configured_df.at[0, "Volume"] = configured_df.at[1, "Volume"]
    #configured_df["Open"] = configured_df["Open"].shift(1)
    #configured_df.at[0, "Open"] = configured_df.at[1, "Open"]
    
    return configured_df

def convert_date(exchange):
    exchange["Time"] = pd.to_datetime(exchange["Time"], format="%Y-%m-%d %H:%M:%S")
    return exchange

In [3]:
pair = currency_pair("aud", "cad")
pair

                        Time  CPI       GDP  Interest Rate       PPI  \
0        2018-01-01 22:00:00  0.6  0.277019           2.75  0.009225   
1        2018-01-01 22:01:00  0.6  0.277019           2.75  0.009225   
2        2018-01-01 22:02:00  0.6  0.277019           2.75  0.009225   
3        2018-01-01 22:03:00  0.6  0.277019           2.75  0.009225   
4        2018-01-01 22:04:00  0.6  0.277019           2.75  0.009225   
...                      ...  ...       ...            ...       ...   
1576795  2020-12-31 21:55:00  1.6  3.328124           0.98 -0.002566   
1576796  2020-12-31 21:56:00  1.6  3.328124           0.98 -0.002566   
1576797  2020-12-31 21:57:00  1.6  3.328124           0.98 -0.002566   
1576798  2020-12-31 21:58:00  1.6  3.328124           0.98 -0.002566   
1576799  2020-12-31 21:59:00  1.6  3.328124           0.98 -0.002566   

         Unemployment Rate  News Sentiment  Twitter Sentiment  
0                 5.501085        0.000000            0.00000  
1      

0
0


Unnamed: 0,Time,AUD_CPI,AUD_GDP,AUD Interest Rate,AUD_PPI,AUD Unemployment Rate,AUD News Sentiment,AUD Twitter Sentiment,Volume,High,...,RSI,A/D Index,Real Close,CAD_CPI,CAD_PPI,CAD Unemployment Rate,CAD Interest Rate,CAD_GDP,CAD News Sentiment,CAD Twitter Sentiment
0,2018-01-01 22:15:00+00:00,0.6,0.277019,2.75,0.009225,5.501085,0.000000,0.000000,32.0,-0.003051,...,40.184551,-22.588235,0.97830,1.698842,0.015110,5.8,2.198182,1.086920,0.0,0.0000
1,2018-01-01 22:30:00+00:00,0.6,0.277019,2.75,0.009225,5.501085,0.000000,0.000000,28.0,0.000133,...,42.822578,2.333333,0.97861,1.698842,0.015110,5.8,2.198182,1.086920,0.0,0.0000
2,2018-01-01 22:45:00+00:00,0.6,0.277019,2.75,0.009225,5.501085,0.000000,0.000000,20.0,0.000194,...,45.254960,13.333333,0.97890,1.698842,0.015110,5.8,2.198182,1.086920,0.0,0.0000
3,2018-01-01 23:00:00+00:00,0.6,0.277019,2.75,0.009225,5.501085,0.000000,0.000000,144.0,0.002092,...,56.079024,-53.260274,0.98046,1.698842,0.015110,5.8,2.198182,1.086920,0.0,0.0000
4,2018-01-01 23:15:00+00:00,0.6,0.277019,2.75,0.009225,5.501085,0.000000,0.000000,100.0,-0.000602,...,52.833903,-89.473684,0.98001,1.698842,0.015110,5.8,2.198182,1.086920,0.0,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74686,2020-12-31 20:45:00+00:00,1.6,3.328124,0.98,-0.002566,6.825236,-0.005367,0.133967,140.0,0.000652,...,44.101011,-112.000000,0.98171,0.733138,0.023899,8.6,0.730952,8.874858,0.0,0.1589
74687,2020-12-31 21:00:00+00:00,1.6,3.328124,0.98,-0.002566,6.825236,-0.005367,0.133967,52.0,-0.000367,...,41.337914,36.242424,0.98134,0.733138,0.023899,8.6,0.730952,8.874858,0.0,0.1589
74688,2020-12-31 21:15:00+00:00,1.6,3.328124,0.98,-0.002566,6.825236,-0.005367,0.133967,36.0,-0.000428,...,38.725008,21.000000,0.98097,0.733138,0.023899,8.6,0.730952,8.874858,0.0,0.1589
74689,2020-12-31 21:30:00+00:00,1.6,3.328124,0.98,-0.002566,6.825236,-0.005367,0.133967,56.0,-0.000561,...,35.166633,29.333333,0.98042,0.733138,0.023899,8.6,0.730952,8.874858,0.0,0.1589


In [4]:
pair['Real Close'].mul(np.exp(pair['Close']).shift(-1)).shift(1)

0            NaN
1        0.97861
2        0.97890
3        0.98046
4        0.98001
          ...   
74686    0.98171
74687    0.98134
74688    0.98097
74689    0.98042
74690    0.98002
Length: 74691, dtype: float64

In [5]:
pair.isna().sum()

Time                     0
AUD_CPI                  0
AUD_GDP                  0
AUD Interest Rate        0
AUD_PPI                  0
AUD Unemployment Rate    0
AUD News Sentiment       0
AUD Twitter Sentiment    0
Volume                   0
High                     0
Low                      0
Close                    0
Open                     0
EMA_10                   0
EMA_50                   0
RSI                      0
A/D Index                0
Real Close               0
CAD_CPI                  0
CAD_PPI                  0
CAD Unemployment Rate    0
CAD Interest Rate        0
CAD_GDP                  0
CAD News Sentiment       0
CAD Twitter Sentiment    0
dtype: int64