# Predict the next 5-day change in the exchange rate of a currency pair

In [None]:
# Four-character code of the pair to predict.
pair_currency = 'EUNO'
# Split the code into its first two and its remaining characters.
currency1, currency2 = pair_currency[:2], pair_currency[2:]


# Install and import dependencies

In [None]:
%%capture
import sys

!pip install category_encoders==2.*
!pip install pdpbox
!pip install shap
!pip install --upgrade numpy==1.19.1
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning, module='xgboost')

In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import os

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV

# encoders
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# metrics
from sklearn.metrics import mean_absolute_error, accuracy_score, precision_score, recall_score, r2_score, \
 classification_report, roc_auc_score, plot_confusion_matrix, classification_report

# pipeline
from sklearn.pipeline import make_pipeline

# machine learning
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Boosted Models
# Use this one if you have an M1 chip.
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from xgboost.sklearn import XGBRegressor

# Permutation Importance
from sklearn.inspection import permutation_importance

# for displaying images and html
from IPython.display import Image
from IPython.core.display import HTML 

# Partial Dependence Plot
from pdpbox.pdp import pdp_isolate, pdp_plot, pdp_interact, pdp_interact_plot

# shap
import shap

# Wrangling data

In [None]:
# Preview one raw rate file: the first column becomes the index and DATE is parsed as dates.
# NOTE(review): the file name is hard-coded and independent of pair_currency — confirm this is intended.
pd.read_csv("newEUNZ.csv", index_col=0, parse_dates=['DATE'])

In [None]:
def wrangle(file):
  """Load one rate CSV and turn it into prefixed feature/target columns.

  The CSV is read with its first column as the index and its ``DATE`` column
  parsed as dates; it must contain a ``close`` column. Rows with missing
  values are dropped, the rows are sorted by index and all values are cast
  to float.

  The file name is sliced positionally: ``file[3:5]`` and ``file[5:7]`` are
  used as the two currency codes (as in ``newEUNZ.csv``), so ``file`` has to
  be a bare file name, not a path. When the second code equals the global
  ``currency1`` the close price is inverted and the column prefix is built
  with the two codes swapped, so ``currency1`` ends up first.

  Parameters
  ----------
  file : str
      Name of the CSV file to load.

  Returns
  -------
  pandas.DataFrame
      The ``close`` column is replaced by ``<prefix>_today_change``,
      ``<prefix>_{5,10,30,60}days_change``,
      ``<prefix>_bias_{10,30,60,120}days_ave`` and the target
      ``<prefix>_next5days_change``. Any other columns of the CSV are kept
      and prefixed as well.
  """
  df = pd.read_csv(file, parse_dates=['DATE'], index_col=0)
  df = df.dropna().sort_index(ascending=True)
  # Vectorised cast; replaces the element-wise applymap(float), which is
  # deprecated in recent pandas releases.
  df = df.astype(float)

  # The pair is stored "the other way round" when currency1 is the second code.
  inverted = file[5:7] == currency1
  if inverted:
    df['close'] = 1/df['close']
  print('------file started: ', file)

  close = df['close']

  # add more features - X
  df['today_change'] = close / close.shift(1) - 1

  # relative change versus the close N rows earlier
  for days in (5, 10, 30, 60):
    df[f'{days}days_change'] = close / close.shift(days) - 1

  # relative distance of the close from its N-row rolling mean
  for days in (10, 30, 60, 120):
    df[f'bias_{days}days_ave'] = close / close.rolling(window=days).mean() - 1
  # where the 120-row window is still NaN, fall back to the 60-row bias
  df['bias_120days_ave'] = df['bias_120days_ave'].fillna(df['bias_60days_ave'])

  # target - y: relative change of the close 5 rows ahead
  df['next5days_change'] = close.shift(-5) / close - 1

  df = df.drop(columns=['close'])

  prefix = file[5:7] + file[3:5] if inverted else file[3:7]
  df.columns = prefix + '_' + df.columns
  print('features added finished: ------', file)
  return df

In [None]:
# Start from an empty frame indexed by DATE; every wrangled file is joined on as extra columns.
data = pd.DataFrame(columns=['DATE']).set_index('DATE')
# `dir` shadows the builtin, but the name is kept because the following cell reads it too.
dir = os.getcwd()
# os.listdir() returns entries in arbitrary order; sorting keeps the column
# order of `data` reproducible between runs and machines.
for f in sorted(os.listdir(dir)):
  # load every file whose name contains the first currency code
  if currency1 in f:
    df = wrangle(f)
    data = pd.concat([df, data], axis=1)

In [None]:
# Add the files whose second currency code (the two characters before the
# 4-character extension) is currency2, skipping the file of the target pair itself.
# Sorted because os.listdir() gives no ordering guarantee, which would make
# the column order of `data` vary between runs and machines.
for f in sorted(os.listdir(dir)):
  ends_with_currency2 = f[-6:-4] == currency2
  is_target_pair = f[3:7] == pair_currency
  if ends_with_currency2 and not is_target_pair:
    print(f)
    df = wrangle(f)
    data = pd.concat([df, data], axis=1)
    print(data.shape)
data

In [None]:
# add more features to all data: row-wise max/min across all pairs
#
# The change columns are selected by suffix. A plain substring test for
# '5days_change' would also match the '<pair>_next5days_change' target
# columns, and max_/min_5days_change would then carry future information
# into X (X only drops columns whose name contains 'next5days').
today_change_columns = [c for c in data.columns if c.endswith('_today_change')]
days5_change_columns = [c for c in data.columns if c.endswith('_5days_change')]
days10_change_columns = [c for c in data.columns if c.endswith('_10days_change')]

data['max_today_change'] = data[today_change_columns].max(axis=1)
data['max_5days_change'] = data[days5_change_columns].max(axis=1)
data['max_10days_change'] = data[days10_change_columns].max(axis=1)
data['min_today_change'] = data[today_change_columns].min(axis=1)
data['min_5days_change'] = data[days5_change_columns].min(axis=1)
data['min_10days_change'] = data[days10_change_columns].min(axis=1)

bias_10days_columns = [c for c in data.columns if 'bias_10days' in c]
bias_30days_columns = [c for c in data.columns if 'bias_30days' in c]
bias_60days_columns = [c for c in data.columns if 'bias_60days' in c]
# collected for completeness; no 120-day aggregate is derived below
bias_120days_columns = [c for c in data.columns if 'bias_120days' in c]

data['max_bias_10days'] = data[bias_10days_columns].max(axis=1)
data['max_bias_30days'] = data[bias_30days_columns].max(axis=1)
data['max_bias_60days'] = data[bias_60days_columns].max(axis=1)

data['min_bias_10days'] = data[bias_10days_columns].min(axis=1)
data['min_bias_30days'] = data[bias_30days_columns].min(axis=1)
data['min_bias_60days'] = data[bias_60days_columns].min(axis=1)


In [None]:
# Keep only the rows that have at least 60 non-missing values; `data` is modified in place.
data.dropna(thresh=60, axis='index', inplace=True)
data

In [None]:
# List every column, then show the 20 last rows of the combined frame.
print(data.columns)
data.tail(n=20)

## Prepare training and test data

In [None]:
# Target: the 5-day-ahead change of the pair being predicted.
target = pair_currency + '_next5days_change'
# Only rows with a known target can be used for training and validation.
data1 = data.loc[data[target].notna()].copy()
# Features: everything except the forward-looking 'next5days' columns of any pair.
X = data1.drop(columns=[name for name in data1.columns if 'next5days' in name])
y = data1[target]
X.columns

In [None]:
# 80/20 split with a fixed seed so the split, and every score derived from it,
# is reproducible (the models in this notebook are seeded with 42 as well).
# NOTE(review): the rows are shuffled although the target looks 5 rows ahead,
# so neighbouring rows with overlapping target windows can end up on both
# sides of the split; consider a chronological split (shuffle=False).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

### Target distribution in the training data

In [None]:
# Histogram of the training target values.
y_train.plot.hist()

# Ridge Regression

## Baseline error

In [None]:
# Baseline: MAE obtained by always predicting the mean of the training target.
mean_prediction = np.full(len(y_train), y_train.mean())
baseline = mean_absolute_error(y_train, mean_prediction)
print('Baseline error: ', baseline)

In [None]:
# Ridge regression with default settings: impute missing values, standardise, then fit.
ridge_steps = [SimpleImputer(), StandardScaler(), Ridge(random_state=42)]
model_lr = make_pipeline(*ridge_steps)
model_lr.fit(X_train, y_train)

## Ridge absolute error

In [None]:
# Mean absolute error of the fitted Ridge pipeline on each split.
ridge_train_pred = model_lr.predict(X_train)
ridge_val_pred = model_lr.predict(X_val)
print("train error: ", mean_absolute_error(y_train, ridge_train_pred))
print("validation error: ", mean_absolute_error(y_val, ridge_val_pred))


## Tuning Ridge Regression

In [None]:
# Sweep the Ridge penalty and record the MAE on both splits for each alpha.
alphas = np.arange(0,2,0.1)
print(alphas)
train_mean_absolute_errors, val_mean_absolute_errors = [], []
for a in alphas:
  model_lr = make_pipeline(SimpleImputer(), StandardScaler(), Ridge(random_state=42, alpha=a))
  model_lr.fit(X_train, y_train)
  train_mae = mean_absolute_error(y_train, model_lr.predict(X_train))
  val_mae = mean_absolute_error(y_val, model_lr.predict(X_val))
  train_mean_absolute_errors.append(train_mae)
  val_mean_absolute_errors.append(val_mae)


### Ridge absolute error graph

In [None]:
# Validation MAE against alpha.
# To overlay the training curve, also run:
#   plt.plot(alphas, train_mean_absolute_errors, color='red')
plt.plot(alphas, val_mean_absolute_errors, color='blue')

### Tuned Ridge absolute error

In [None]:
# Pick the alpha with the lowest validation MAE (first one on ties) and refit with it.
best_alpha = alphas[int(np.argmin(val_mean_absolute_errors))]
print('best alpha: ', best_alpha)
model_lr = make_pipeline(SimpleImputer(), StandardScaler(), Ridge(random_state=42, alpha=best_alpha))
model_lr.fit(X_train, y_train)

tuned_train_pred = model_lr.predict(X_train)
tuned_val_pred = model_lr.predict(X_val)
train_error = mean_absolute_error(y_train, tuned_train_pred)
validation_error = mean_absolute_error(y_val, tuned_val_pred)
print("train error: ", train_error)
print("validation error: ", validation_error)

# Classification models

## Prepare the classification target: -1, 0 or 1 (sell, none, buy)

## Baseline score

In [None]:
# Turn the 5-day change into three classes:
#   -1 for a change of -0.5% or lower, 1 for +0.5% or higher, 0 in between.
y_logistic = data1[target].apply(lambda x: -1 if x <= -0.005 else (0 if x < 0.005 else 1))
# Fixed seed so the split and all classification scores are reproducible.
# Note that this rebinds X_train / X_val, which the regression cells also use.
X_train, X_val, y_train_c, y_val_c = train_test_split(X, y_logistic, test_size=0.2, random_state=42)
X_train.dtypes, y_train_c.dtypes

In [None]:
# Class counts, then the accuracy of always predicting the most frequent class.
class_counts = y_train_c.value_counts()
print(class_counts)
class_shares = y_train_c.value_counts(normalize=True)
baseline = class_shares.max()
print('Baseline accuracy score: ', baseline)

# Logistic Regression

In [None]:
# Logistic regression with default settings: impute, standardise, then fit on the class labels.
logistic_steps = [SimpleImputer(), StandardScaler(), LogisticRegression(random_state=42, n_jobs=-1)]
model_lg = make_pipeline(*logistic_steps)
model_lg.fit(X_train, y_train_c)

## Accuracy score

In [None]:
# Accuracy of the fitted logistic pipeline on each split.
lg_train_pred = model_lg.predict(X_train)
lg_val_pred = model_lg.predict(X_val)
print("train accuracy_score: ", accuracy_score(y_train_c, lg_train_pred))
print("validation accuracy_score: ", accuracy_score(y_val_c, lg_val_pred))

## Tuning LogisticRegression

In [None]:
# Sweep max_iter and record the accuracy on both splits for each value.
train_accuracy = []
val_accuracy = []
max_iters = np.arange(40,700,20)
print(max_iters)
# The loop variable is not called `iter`: at notebook level it would stay
# bound after the loop and shadow the builtin iter() for every later cell.
for n_iterations in max_iters:
  model_lg = make_pipeline(
    SimpleImputer(),
    StandardScaler(),
    LogisticRegression(random_state=42, n_jobs=-1, max_iter=n_iterations)
  )
  model_lg.fit(X_train, y_train_c)
  train_accuracy.append(accuracy_score(y_train_c, model_lg.predict(X_train)))
  val_accuracy.append(accuracy_score(y_val_c, model_lg.predict(X_val)))

### Logistic Regression validation accuracy score graph

In [None]:
# Validation accuracy against max_iter.
# To overlay the training curve, also run:
#   plt.plot(max_iters, train_accuracy, color='red')
plt.plot(max_iters, val_accuracy, color='blue')

### Tuned Logistic Regression accuracy

In [None]:
# Pick the max_iter with the highest validation accuracy (first one on ties) and refit with it.
best_max_iter = max_iters[int(np.argmax(val_accuracy))]
print('best max iter:' , best_max_iter)
model_lg = make_pipeline(
    SimpleImputer(),
    StandardScaler(),
    LogisticRegression(max_iter=best_max_iter, n_jobs=-1, random_state=42),
)
model_lg.fit(X_train, y_train_c)

# From here on train_accuracy holds a single score instead of the sweep list.
tuned_lg_train_pred = model_lg.predict(X_train)
tuned_lg_val_pred = model_lg.predict(X_val)
train_accuracy = accuracy_score(y_train_c, tuned_lg_train_pred)
validation_accuracy = accuracy_score(y_val_c, tuned_lg_val_pred)
print("train accuracy: ", train_accuracy)
print("validation accuracy: ", validation_accuracy)

# RandomForest Classifier

In [None]:
# Random forest with default hyper-parameters: ordinal-encode, impute, then classify.
forest_steps = [
    OrdinalEncoder(),
    SimpleImputer(),
    RandomForestClassifier(random_state=42, n_jobs=-1),
]
model_rf = make_pipeline(*forest_steps)

model_rf.fit(X_train, y_train_c)

## Accuracy score

In [None]:
# Training accuracy first, validation accuracy second.
rf_train_pred = model_rf.predict(X_train)
rf_val_pred = model_rf.predict(X_val)
print(accuracy_score(y_train_c, rf_train_pred))
print(accuracy_score(y_val_c, rf_val_pred))

## Tuning RandomForestClassifier

In [None]:
# Randomised search over the random-forest pipeline: 20 sampled candidates, 10-fold CV each.
clf = make_pipeline(OrdinalEncoder(), SimpleImputer(), RandomForestClassifier(n_jobs=-1))

# NOTE(review): the forest's random_state is part of the search space, i.e. the
# seed is tuned like a hyper-parameter — confirm this is intended.
param_grid = dict(
    randomforestclassifier__random_state=range(28,46,2),
    randomforestclassifier__n_estimators=range(70,86,2),
    randomforestclassifier__max_depth=range(20,34,2),
    randomforestclassifier__min_samples_split=range(2,4,1),
)

model_rfrs = RandomizedSearchCV(
    clf,
    param_distributions=param_grid,
    n_iter=20,
    cv=10,
    n_jobs=-1,
    verbose=1,
)

model_rfrs.fit(X_train, y_train_c)

In [None]:
# Best cross-validated score of the random-forest search and the parameters that produced it.
best_score, best_params = model_rfrs.best_score_, model_rfrs.best_params_

print('Best score for `model`:', best_score)
print('Best params for `model`:', best_params)

### Tuned accuracy score

In [None]:
# Accuracy of the tuned random-forest search object on each split.
rfrs_train_pred = model_rfrs.predict(X_train)
rfrs_val_pred = model_rfrs.predict(X_val)
train_accuracy = accuracy_score(y_train_c, rfrs_train_pred)
validation_accuracy = accuracy_score(y_val_c, rfrs_val_pred)
print("train accuracy: ", train_accuracy)
print("validation accuracy: ", validation_accuracy)

# XGB Classifier

In [None]:
# XGBoost classifier with default hyper-parameters: ordinal-encode, impute, then classify.
# NOTE(review): the class labels are -1/0/1; some xgboost releases expect labels
# 0..n-1 — confirm the installed version accepts them.
xgb_steps = [OrdinalEncoder(), SimpleImputer(), XGBClassifier(random_state=42)]
model_xgb = make_pipeline(*xgb_steps)

model_xgb.fit(X_train, y_train_c)

## Accuracy score

In [None]:
# Accuracy of the default XGBoost pipeline on each split.
xgb_train_pred = model_xgb.predict(X_train)
xgb_val_pred = model_xgb.predict(X_val)
train_accuracy = accuracy_score(y_train_c, xgb_train_pred)
validation_accuracy = accuracy_score(y_val_c, xgb_val_pred)
print("train accuracy: ", train_accuracy)
print("validation accuracy: ", validation_accuracy)

## Tuning XGB Classifier

In [None]:
# Randomised search over the XGBoost pipeline: 30 sampled candidates, 20-fold CV each.
clf = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    XGBClassifier(random_state=42)
)

param_grid = {
    'xgbclassifier__n_estimators': range(70, 90, 2),
    # starts at 0.02 rather than 0: a learning rate of 0 never updates the model
    'xgbclassifier__learning_rate': np.arange(0.02, 0.2, 0.02),
    'xgbclassifier__max_depth': range(4,12,1),
    # subsample is a row fraction and must stay within (0, 1]; values above 1
    # make the fit fail, so the range stops at 1.0 instead of 1.2
    'xgbclassifier__subsample': np.linspace(0.8, 1.0, 5)
    # 'max_features' was dropped from the search: it is a scikit-learn tree
    # parameter, not an XGBoost one. Use 'xgbclassifier__colsample_bytree'
    # (a fraction in (0, 1]) if per-tree feature sub-sampling is wanted.
}

model_xgbrs = RandomizedSearchCV(
    clf,
    param_distributions = param_grid,
    n_jobs = -1,
    cv = 20,
    verbose = 1,
    n_iter = 30
)

model_xgbrs.fit(X_train, y_train_c)

In [None]:
# Best cross-validated score of the XGBoost search and the parameters that produced it.
best_score, best_params = model_xgbrs.best_score_, model_xgbrs.best_params_

print('Best score for `model`:', best_score)
print('Best params for `model`:', best_params)

### Tuned accuracy score

In [None]:
# Accuracy of the tuned XGBoost search object on each split.
xgbrs_train_pred = model_xgbrs.predict(X_train)
xgbrs_val_pred = model_xgbrs.predict(X_val)
train_accuracy = accuracy_score(y_train_c, xgbrs_train_pred)
validation_accuracy = accuracy_score(y_val_c, xgbrs_val_pred)
print("train accuracy: ", train_accuracy)
print("validation accuracy: ", validation_accuracy)

## Feature importance

In [None]:
# Feature importances of the default (untuned) random forest, one row per feature.
forest = model_rf.named_steps['randomforestclassifier']
importances = forest.feature_importances_
columns = X_train.columns
df_importances = pd.DataFrame(data=importances, index=X_train.columns, columns=["importance"])
# Horizontal bar chart of the ten largest importances.
top_importances = df_importances.abs().sort_values(by=['importance']).tail(10)
top_importances.plot(kind="barh")

In [None]:
# The twenty largest importances, in ascending order.
df_importances.abs().sort_values(by=['importance']).tail(20)

## PDP plot

In [None]:
# Partial dependence of the tuned random forest on a single feature.
from pdpbox import pdp
features = X_train.columns
# Built from pair_currency so the cell keeps working when the pair is changed.
feature = pair_currency + '_bias_10days_ave'
# Alternative feature to inspect: 'max_5days_change'
pdp_dist = pdp.pdp_isolate(model=model_rfrs, dataset=X_train, model_features=features, feature=feature)
pdp.pdp_plot(pdp_dist, feature);

In [None]:
# Two-feature partial dependence (interaction) of the tuned random forest, drawn as a grid.
features = ['max_5days_change', 'min_5days_change']

interaction = pdp_interact(
    model=model_rfrs,
    dataset=X_train,
    features=features,
    model_features=X_train.columns,
)

pdp_interact_plot(interaction, feature_names=features, plot_type='grid');

# Confusion Matrix for RandomForestClassifier

In [None]:
# Confusion matrix of the tuned random forest on the validation split.
# The display labels are given in the order fall, None, rise; counts are shown without decimals.
class_names = ['fall', 'None', 'rise']
plot_confusion_matrix(model_rfrs, X_val, y_val_c, display_labels=class_names, values_format='.0f')

## Precision

In [None]:
# Per-class precision of the tuned random forest on the validation split.
# Computed from the predictions instead of counts copied by hand from one
# confusion matrix, which go stale on every rerun (the old "rise" figure also
# used a numerator that was not part of its own denominator).
# labels=[-1, 1] with average=None returns the precision for the fall class
# (-1) first and for the rise class (1) second.
fall_precision, rise_precision = precision_score(
    y_val_c, model_rfrs.predict(X_val), labels=[-1, 1], average=None
)
print('Fall precision: ', fall_precision)
print('Rise precision: ', rise_precision)