# Forex Feature Engineering and Machine Learning


In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

# enable access to local data and python scripts
# NOTE(review): this absolute path is specific to one Windows machine; os.chdir
# raises if the directory does not exist, so the path must be adjusted before
# running the notebook anywhere else.
os.chdir("C:\\Users\\admin\\Desktop\\Dev\\MachineLearning\\Forex")


In [4]:
# import data that was sorted in previous notebook
# NOTE(review): the rendered output has an 'Unnamed: 0' column, which suggests the
# CSV was written with its index — confirm whether index_col=0 is wanted here.
df = pd.read_csv('sorted_historical_forex_rates.csv')

# view data
# NOTE: a cell only renders its last expression, so the head() result is
# discarded and only the tail() rows appear in the output.
df.head()
df.tail()

Unnamed: 0,timestamp,date,base,USD,EUR,JPY,GBP,CHF
2158,1101574800,2004-11-27,USD,1.0,0.753901,102.796629,0.528127,1.141666
2159,1101661200,2004-11-28,USD,1.0,0.753901,102.796629,0.528127,1.141666
2160,1101747600,2004-11-29,USD,1.0,0.753352,102.779088,0.52837,1.142787
2161,1101834000,2004-11-30,USD,1.0,0.753077,102.883425,0.523805,1.139132
2162,1101920400,2004-12-01,USD,1.0,0.751267,102.798476,0.518656,1.142903


# Machine Learning Question

Given the history of a currency's relative value, can the machine accurately predict if that currency's relative value will rise or fall in the next interval of time?

# Features and Labels

We will start off by using as few features as possible.

Features:
- The difference in value from 1 day ago to now
- The difference in value from 2 days ago to 1 day ago
- The difference in value from 3 days ago to 2 days ago

If these feature values were plotted, you would essentially end up with a graph that represents the 3 day history of that currency's value up until its "current" point.

Labels:
- '1' : increase
- '0' : stay the same
- '-1': decrease
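- NOTE: I was not entirely sure negative numbers could be used as labels in scikit-learn; its classifiers treat labels as arbitrary class identifiers, so '-1' works as-is (the models below are trained on these labels).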


So basically, the question is changed a little in this case. Given the value of a currency at any point in time and its 3 day history leading up to that, can the machine accurately predict if it will increase, decrease, or stay the same in the next day?

NOTE: since this is the first model, the plan is to be as simple as possible so I can have a baseline to compare the accuracy of future more advanced models to. Therefore, this model will be as "inside-the-box" as possible.

In [5]:
# creating data for the features
"""
 NOTE: that in this process, data entries in the beginning will not have a 3-day history
 because there is no data before that entry to compute those values.
 Those data entries will not be included in the training process but will be used to form data history 
 for the entries that follow them.
"""


def find_min_max_index(df, history_days=3):
    """Return the min and max row indices of the entries that get features and a label.

    Parameters
    ----------
    df : DataFrame
        The sorted data, one row per day.
    history_days : int, default 3
        Number of leading rows that are skipped because they do not have a
        complete history behind them. The default matches the 3-day history
        used by the feature functions in this notebook.

    Returns
    -------
    (min_index, max_index) : tuple of int
        ``min_index`` is ``history_days``; ``max_index`` is the index of the
        second-to-last row, because the last row has no following day from
        which a label could be derived. If ``df`` is too short,
        ``max_index`` is smaller than ``min_index`` (an empty range).
    """
    last_index = len(df) - 1

    # the first `history_days` rows only serve as history for later rows
    min_index = history_days
    # the last row only serves as the "next day" of the row before it
    max_index = last_index - 1

    return min_index, max_index


# index range of the rows that will receive features and a label;
# reused by every feature/label function below
min_index, max_index = find_min_max_index(df)

# DEBUG:
# print(min_index, max_index)


# the difference between 1 day ago and today
def one_day_to_current(df, min_index, max_index, curr):
    """Return a copy of ``df`` with a column holding the change from the previous day to the current day.

    The new column is named ``'one_day_to_current_' + curr`` and contains
    ``value[today] - value[yesterday]`` of column ``curr`` for every row whose
    index label lies in ``min_index..max_index`` (inclusive); all other rows
    hold NaN. The input dataframe is not modified.

    NOTE: the dataframe must be sorted and have every date in between the
    earliest date recorded and the most recent date recorded, so that
    consecutive rows are consecutive days. Index labels must be integers
    comparable with ``min_index`` / ``max_index``.
    """
    column_name = 'one_day_to_current_' + curr

    # work on a copy so the caller's dataframe is left untouched
    new_df = df.copy()

    # today's value minus yesterday's value, computed for all rows at once
    rates = new_df[curr]
    change = rates - rates.shift(1)

    # keep the result only inside the requested index range; NaN elsewhere
    in_range = (new_df.index >= min_index) & (new_df.index <= max_index)
    new_df[column_name] = change.where(in_range)

    return new_df

# add the JPY previous-day-to-current-day feature; the function returns a new
# dataframe, so df is rebound to the result
df = one_day_to_current(df, min_index, max_index, 'JPY')

# view the data
df


Unnamed: 0,timestamp,date,base,USD,EUR,JPY,GBP,CHF,one_day_to_current_JPY
0,915235199,1999-01-01,USD,1.0,0.853515,113.646914,0.602941,1.375050,
1,915321599,1999-01-02,USD,1.0,0.854441,113.663894,0.603617,1.376000,
2,915407999,1999-01-03,USD,1.0,0.854441,113.663894,0.603617,1.376000,
3,915494399,1999-01-04,USD,1.0,0.846629,112.599539,0.603375,1.368618,-1.064354
4,915580799,1999-01-05,USD,1.0,0.849454,111.138262,0.604036,1.369318,-1.461277
5,915667199,1999-01-06,USD,1.0,0.859536,112.491672,0.603662,1.383910,1.353410
6,915753599,1999-01-07,USD,1.0,0.856267,111.257463,0.606278,1.384102,-1.234209
7,915839999,1999-01-08,USD,1.0,0.863182,111.302755,0.608586,1.390139,0.045292
8,915926399,1999-01-09,USD,1.0,0.861603,111.221927,0.608229,1.387690,-0.080829
9,916012799,1999-01-10,USD,1.0,0.861603,111.221927,0.608229,1.387690,0.000000


In [6]:
# the difference between 2 days ago and 1 day ago
def two_days_to_one_day_ago(df, min_index, max_index, curr):
    """Return a copy of ``df`` with a column holding the change from two days ago to one day ago.

    The new column is named ``'two_days_to_one_day_ago_' + curr`` and contains
    ``value[1 day ago] - value[2 days ago]`` of column ``curr`` for every row
    whose index label lies in ``min_index..max_index`` (inclusive); all other
    rows hold NaN. The input dataframe is not modified.

    NOTE: the dataframe must be sorted and have every date in between the
    earliest date recorded and the most recent date recorded, so that
    consecutive rows are consecutive days. Index labels must be integers
    comparable with ``min_index`` / ``max_index``.
    """
    column_name = 'two_days_to_one_day_ago_' + curr

    # work on a copy so the caller's dataframe is left untouched
    new_df = df.copy()

    # value one day ago minus value two days ago, computed for all rows at once
    rates = new_df[curr]
    change = rates.shift(1) - rates.shift(2)

    # keep the result only inside the requested index range; NaN elsewhere
    in_range = (new_df.index >= min_index) & (new_df.index <= max_index)
    new_df[column_name] = change.where(in_range)

    return new_df


# add the JPY two-days-ago-to-one-day-ago feature; the function returns a new
# dataframe, so df is rebound to the result
df = two_days_to_one_day_ago(df, min_index, max_index, 'JPY')

# view the data
df

Unnamed: 0,timestamp,date,base,USD,EUR,JPY,GBP,CHF,one_day_to_current_JPY,two_days_to_one_day_ago_JPY
0,915235199,1999-01-01,USD,1.0,0.853515,113.646914,0.602941,1.375050,,
1,915321599,1999-01-02,USD,1.0,0.854441,113.663894,0.603617,1.376000,,
2,915407999,1999-01-03,USD,1.0,0.854441,113.663894,0.603617,1.376000,,
3,915494399,1999-01-04,USD,1.0,0.846629,112.599539,0.603375,1.368618,-1.064354,0.000000
4,915580799,1999-01-05,USD,1.0,0.849454,111.138262,0.604036,1.369318,-1.461277,-1.064354
5,915667199,1999-01-06,USD,1.0,0.859536,112.491672,0.603662,1.383910,1.353410,-1.461277
6,915753599,1999-01-07,USD,1.0,0.856267,111.257463,0.606278,1.384102,-1.234209,1.353410
7,915839999,1999-01-08,USD,1.0,0.863182,111.302755,0.608586,1.390139,0.045292,-1.234209
8,915926399,1999-01-09,USD,1.0,0.861603,111.221927,0.608229,1.387690,-0.080829,0.045292
9,916012799,1999-01-10,USD,1.0,0.861603,111.221927,0.608229,1.387690,0.000000,-0.080829


In [7]:
# the difference between 3 days ago and 2 days ago
def three_days_to_two_days_ago(df, min_index, max_index, curr):
    """Return a copy of ``df`` with a column holding the change from three days ago to two days ago.

    The new column is named ``'three_days_to_two_days_ago_' + curr`` and
    contains ``value[2 days ago] - value[3 days ago]`` of column ``curr`` for
    every row whose index label lies in ``min_index..max_index`` (inclusive);
    all other rows hold NaN. The input dataframe is not modified.

    NOTE: the dataframe must be sorted and have every date in between the
    earliest date recorded and the most recent date recorded, so that
    consecutive rows are consecutive days. Index labels must be integers
    comparable with ``min_index`` / ``max_index``.
    """
    column_name = 'three_days_to_two_days_ago_' + curr

    # work on a copy so the caller's dataframe is left untouched
    new_df = df.copy()

    # value two days ago minus value three days ago, computed for all rows at once
    rates = new_df[curr]
    change = rates.shift(2) - rates.shift(3)

    # keep the result only inside the requested index range; NaN elsewhere
    in_range = (new_df.index >= min_index) & (new_df.index <= max_index)
    new_df[column_name] = change.where(in_range)

    return new_df


# add the JPY three-days-ago-to-two-days-ago feature; the function returns a
# new dataframe, so df is rebound to the result
df = three_days_to_two_days_ago(df, min_index, max_index, 'JPY')

# view the data
df

Unnamed: 0,timestamp,date,base,USD,EUR,JPY,GBP,CHF,one_day_to_current_JPY,two_days_to_one_day_ago_JPY,three_days_to_two_days_ago_JPY
0,915235199,1999-01-01,USD,1.0,0.853515,113.646914,0.602941,1.375050,,,
1,915321599,1999-01-02,USD,1.0,0.854441,113.663894,0.603617,1.376000,,,
2,915407999,1999-01-03,USD,1.0,0.854441,113.663894,0.603617,1.376000,,,
3,915494399,1999-01-04,USD,1.0,0.846629,112.599539,0.603375,1.368618,-1.064354,0.000000,0.016980
4,915580799,1999-01-05,USD,1.0,0.849454,111.138262,0.604036,1.369318,-1.461277,-1.064354,0.000000
5,915667199,1999-01-06,USD,1.0,0.859536,112.491672,0.603662,1.383910,1.353410,-1.461277,-1.064354
6,915753599,1999-01-07,USD,1.0,0.856267,111.257463,0.606278,1.384102,-1.234209,1.353410,-1.461277
7,915839999,1999-01-08,USD,1.0,0.863182,111.302755,0.608586,1.390139,0.045292,-1.234209,1.353410
8,915926399,1999-01-09,USD,1.0,0.861603,111.221927,0.608229,1.387690,-0.080829,0.045292,-1.234209
9,916012799,1999-01-10,USD,1.0,0.861603,111.221927,0.608229,1.387690,0.000000,-0.080829,0.045292


Now for the label: How does the value change in the next day?

In [8]:
# the label: the direction of the change from the current day to the next day
def current_to_next_day(df, min_index, max_index, curr):
    """Return a copy of ``df`` with a label column giving the direction of the next day's change.

    The new column is named ``'current_to_next_day_' + curr``. For every row
    whose index label lies in ``min_index..max_index`` (inclusive) it holds
    the sign of ``value[tomorrow] - value[today]`` of column ``curr``:

    *  1.0 : the value increases
    *  0.0 : the value stays the same
    * -1.0 : the value decreases

    All other rows hold NaN, as does any row where the difference itself is
    NaN. The input dataframe is not modified.

    NOTE: the dataframe must be sorted and have every date in between the
    earliest date recorded and the most recent date recorded, so that
    consecutive rows are consecutive days. Index labels must be integers
    comparable with ``min_index`` / ``max_index``.
    """
    column_name = 'current_to_next_day_' + curr

    # work on a copy so the caller's dataframe is left untouched
    new_df = df.copy()

    # tomorrow's value minus today's value, reduced to its sign (-1, 0 or 1)
    rates = new_df[curr]
    direction = np.sign(rates.shift(-1) - rates)

    # keep the label only inside the requested index range; NaN elsewhere
    in_range = (new_df.index >= min_index) & (new_df.index <= max_index)
    new_df[column_name] = direction.where(in_range)

    return new_df


# add the JPY label column (direction of the change to the next day); the
# function returns a new dataframe, so df is rebound to the result
df = current_to_next_day(df, min_index, max_index, 'JPY')

# view the data
df

Unnamed: 0,timestamp,date,base,USD,EUR,JPY,GBP,CHF,one_day_to_current_JPY,two_days_to_one_day_ago_JPY,three_days_to_two_days_ago_JPY,current_to_next_day_JPY
0,915235199,1999-01-01,USD,1.0,0.853515,113.646914,0.602941,1.375050,,,,
1,915321599,1999-01-02,USD,1.0,0.854441,113.663894,0.603617,1.376000,,,,
2,915407999,1999-01-03,USD,1.0,0.854441,113.663894,0.603617,1.376000,,,,
3,915494399,1999-01-04,USD,1.0,0.846629,112.599539,0.603375,1.368618,-1.064354,0.000000,0.016980,-1.0
4,915580799,1999-01-05,USD,1.0,0.849454,111.138262,0.604036,1.369318,-1.461277,-1.064354,0.000000,1.0
5,915667199,1999-01-06,USD,1.0,0.859536,112.491672,0.603662,1.383910,1.353410,-1.461277,-1.064354,-1.0
6,915753599,1999-01-07,USD,1.0,0.856267,111.257463,0.606278,1.384102,-1.234209,1.353410,-1.461277,1.0
7,915839999,1999-01-08,USD,1.0,0.863182,111.302755,0.608586,1.390139,0.045292,-1.234209,1.353410,-1.0
8,915926399,1999-01-09,USD,1.0,0.861603,111.221927,0.608229,1.387690,-0.080829,0.045292,-1.234209,0.0
9,916012799,1999-01-10,USD,1.0,0.861603,111.221927,0.608229,1.387690,0.000000,-0.080829,0.045292,-1.0


The features and labels have been created. Now, it is time to discard the unusable data at the head and at the tail and then start training our model.

In [9]:
# keep only the rows that carry both features and a label: the leading rows
# have no complete history and the last row has no next-day value.
# Selecting by the min_index..max_index labels (inclusive) instead of dropping
# a fixed number of rows from the head and tail keeps this cell in sync with
# the feature functions and makes it safe to re-run (a second run no longer
# drops additional rows).
df = df.loc[min_index:max_index].copy()

df

Unnamed: 0,timestamp,date,base,USD,EUR,JPY,GBP,CHF,one_day_to_current_JPY,two_days_to_one_day_ago_JPY,three_days_to_two_days_ago_JPY,current_to_next_day_JPY
3,915494399,1999-01-04,USD,1.0,0.846629,112.599539,0.603375,1.368618,-1.064354,0.000000,0.016980,-1.0
4,915580799,1999-01-05,USD,1.0,0.849454,111.138262,0.604036,1.369318,-1.461277,-1.064354,0.000000,1.0
5,915667199,1999-01-06,USD,1.0,0.859536,112.491672,0.603662,1.383910,1.353410,-1.461277,-1.064354,-1.0
6,915753599,1999-01-07,USD,1.0,0.856267,111.257463,0.606278,1.384102,-1.234209,1.353410,-1.461277,1.0
7,915839999,1999-01-08,USD,1.0,0.863182,111.302755,0.608586,1.390139,0.045292,-1.234209,1.353410,-1.0
8,915926399,1999-01-09,USD,1.0,0.861603,111.221927,0.608229,1.387690,-0.080829,0.045292,-1.234209,0.0
9,916012799,1999-01-10,USD,1.0,0.861603,111.221927,0.608229,1.387690,0.000000,-0.080829,0.045292,-1.0
10,916099199,1999-01-11,USD,1.0,0.867546,108.925395,0.610982,1.396624,-2.296531,0.000000,-0.080829,1.0
11,916185599,1999-01-12,USD,1.0,0.866135,112.395550,0.613290,1.391166,3.470155,-2.296531,0.000000,1.0
12,916271999,1999-01-13,USD,1.0,0.855782,112.938883,0.605385,1.361045,0.543332,3.470155,-2.296531,1.0


In [10]:
# check that the data types are compatible with scikit learn
# they are all float64 so they should work fine
df[['one_day_to_current_JPY', 'two_days_to_one_day_ago_JPY', 'three_days_to_two_days_ago_JPY', 'current_to_next_day_JPY']].dtypes


one_day_to_current_JPY            float64
two_days_to_one_day_ago_JPY       float64
three_days_to_two_days_ago_JPY    float64
current_to_next_day_JPY           float64
dtype: object

In [11]:
# convert features into a numpy array (one row per day, one column per feature)
X = df[['one_day_to_current_JPY', 'two_days_to_one_day_ago_JPY', 'three_days_to_two_days_ago_JPY']].values

# convert label into numpy array
y = df['current_to_next_day_JPY'].values

# check that X and y have the same number of rows
print(X.shape)
print(y.shape)

(2159, 3)
(2159,)


# Train/ Test Split

In [12]:
from sklearn.model_selection import train_test_split

# split data into training and test set (30% held out, fixed seed for repeatability)
# NOTE(review): the rows are consecutive days and adjacent rows share feature
# values (one row's one-day change is the next row's two-day-lag feature), so a
# shuffled split places overlapping information in both sets. train_test_split
# presumably shuffles by default — confirm, and consider a chronological split
# before comparing scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# view X_train
X_train

array([[ 0.32838684, -0.42054824,  0.35412554],
       [ 0.9154362 ,  0.        ,  0.04260321],
       [ 0.774521  , -0.591071  ,  0.678365  ],
       ...,
       [ 0.0022646 , -0.101501  ,  0.85847061],
       [ 0.043498  , -1.029001  ,  0.482554  ],
       [ 0.        , -0.07728672, -0.02194294]])

# Logistic Regression

Use a basic linear model as a baseline to compare the performance of other models to.

In [13]:
from sklearn.linear_model import LogisticRegression

# baseline model with the library's default settings
lr = LogisticRegression()

# train the model on the training data
lr.fit(X_train, y_train)

# make predictions
# NOTE: `predictions` is only used by the commented-out prints below;
# the score is computed directly from X_test / y_test.
predictions = lr.predict(X_test)

# print(y_test)
# print(predictions)

# use score method to get accuracy of model (fraction of correct test predictions)
score = lr.score(X_test, y_test)
print(score)

0.43209876543209874


In [14]:
# although the low accuracy is expected, I may have implemented the logistic regression model incorrectly.
# NOTE(review): the default model in the previous cell was fitted on the same
# three-class y without error, so it already made multi-class predictions; what
# this cell changes is the multi-class formulation and the solver, not
# binary vs. multi-class.
# the following model explicitly requests a multinomial formulation
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases — confirm against the installed version.

lr = LogisticRegression(multi_class='multinomial', solver='newton-cg')

# train the model on the training data
lr.fit(X_train, y_train)

# make predictions
# NOTE: `predictions` is only used by the commented-out prints below.
predictions = lr.predict(X_test)

# print(y_test)
# print(predictions)

# use score method to get accuracy of model
score = lr.score(X_test, y_test)
print(score)

0.43364197530864196


# KNN

Now, to try a different, nonlinear classifier to compare whether the nature of this data is closer to being linear or nonlinear.

In [18]:
# NOTE(review): this cell's execution count (18) jumps from the previous cell's
# (14); re-run the notebook top to bottom to confirm the recorded score.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# instantiate learning model (k = 3)
# NOTE: k = 3 is an arbitrary starting value here; it has not been tuned.
knn = KNeighborsClassifier(n_neighbors=3)

# fitting the model
knn.fit(X_train, y_train)

# predict the response
pred = knn.predict(X_test)

# evaluate accuracy (same metric as the score() calls used for the linear models)
print(accuracy_score(y_test, pred))

0.4567901234567901
