<a href="https://colab.research.google.com/github/ChristineWangcy/Forex-Analysis/blob/main/profile_currency_analysis_1_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict the next 5 days' change for currency EUNO

In [None]:
# Four-character code of the pair being modelled. Later cells slice it into
# two halves (currency[:2] and currency[2:]) to pick input files and to build
# column names such as currency + '_next5days_change'.
currency = 'EUNO'

# install and import

In [None]:
%%capture
import sys

!pip install category_encoders==2.*
!pip install pdpbox
!pip install shap
!pip install --upgrade numpy==1.19.1
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning, module='xgboost')

In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import os

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV

# encoders
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# metrics
from sklearn.metrics import mean_absolute_error, accuracy_score, precision_score, recall_score, r2_score, \
 classification_report, roc_auc_score, plot_confusion_matrix, classification_report

# pipeline
from sklearn.pipeline import make_pipeline

# machine learning
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Boosted Models
# Use this one if you have an M1 chip.
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from xgboost.sklearn import XGBRegressor

# Permutation Importance
from sklearn.inspection import permutation_importance

# for displaying images and html
from IPython.display import Image
from IPython.core.display import HTML 

# Partial Dependence Plot
from pdpbox.pdp import pdp_isolate, pdp_plot, pdp_interact, pdp_interact_plot

# shap
import shap

# Wrangling data

In [None]:
def wrangle(file):
  """Read one price CSV and return its engineered features, prefixed by pair.

  The pair code is taken from the file name itself: characters 3:7 hold the
  four-letter pair. When the second half of that pair (characters 5:7) equals
  the first half of the global `currency`, the series is inverted (1/close)
  and the prefix is written with the two halves swapped.

  Parameters
  ----------
  file : str
      Path passed to `pd.read_csv`; it must have a 'DATE' column and a
      'close' column. The name is sliced by position, so callers pass bare
      file names (as `os.listdir` returns them).

  Returns
  -------
  pandas.DataFrame
      Indexed by the first CSV column, sorted ascending. 'close' itself is
      dropped; every remaining column is renamed '<pair>_<feature>'.
  """
  # Evaluated once: does this file quote the pair the "other way round"?
  flip_pair = file[5:7] == currency[:2]

  df = pd.read_csv(file, parse_dates=['DATE'], index_col=0)
  df.dropna(inplace=True)
  df = df.sort_index(ascending=True).applymap(float)
  if flip_pair:
    df['close'] = 1/df['close']
  print('------file started: ', file)

  close = df['close']

  # Relative moves: previous row, 5 rows back, and 5 rows ahead (the target).
  df['change'] = close / close.shift(1) - 1
  df['5days_change'] = close / close.shift(5) - 1
  df['next5days_change'] = close.shift(-5) / close - 1

  # Distance of close from its own rolling mean.
  for days in (5, 10, 30, 60):
    df['bias_' + str(days) + 'days_ave'] = close / close.rolling(window=days).mean() - 1

  # 1 when close equals the rolling max / min of the last `days` rows.
  for kind in ('max', 'min'):
    for days in (3, 5, 10):
      extreme = getattr(close.rolling(window=days), kind)()
      df[kind + '_' + str(days) + 'days'] = (close == extreme).astype(int)

  # A pivot is a row that is the extreme of both the 6 rows ending on it and
  # the 6 rows starting on it. The pivot price is carried forward over the
  # zero (non-pivot) rows and then delayed by 6 rows.
  pivot_high = ((close == close.rolling(window=6).max())
                & (close == close.shift(-5).rolling(window=6).max())).astype(int)
  last_high = (close * pivot_high).replace(to_replace=0, method='ffill').shift(6)
  pivot_low = ((close == close.rolling(window=6).min())
               & (close == close.shift(-5).rolling(window=6).min())).astype(int)
  last_low = (close * pivot_low).replace(to_replace=0, method='ffill').shift(6)

  df['above_prev_10days_max'] = (close > last_high).astype(int)
  df['below_prev_10days_min'] = (close < last_low).astype(int)
  df['above_prev_10days_min'] = (close > last_low).astype(int)
  df['below_prev_10days_max'] = (close < last_high).astype(int)

  # "first_" marks the row where a flag switches on: it is 1 now and the sum
  # with the previous row is exactly 1 (so the previous row was 0).
  for flag in ('above_prev_10days_max', 'below_prev_10days_min'):
    today = df[flag]
    df['first_' + flag] = ((today == 1) & (today + today.shift(1) == 1)).astype(int)

  df.drop(columns=['close'], inplace=True)

  pair = file[5:7] + file[3:5] if flip_pair else file[3:7]
  df.columns = pair + '_' + df.columns
  print('features added finished: ', file)

  return df

In [None]:
# Start from an empty frame indexed by DATE, then attach column-wise the
# features of every file in the working directory whose name contains the
# first half of `currency`.
data = pd.DataFrame(columns=['DATE']).set_index('DATE')
dir = os.getcwd()
for f in os.listdir(dir):
  if currency[:2] not in f:
    continue
  df = wrangle(f)
  data = pd.concat([df, data], axis=1)

In [None]:
# Row-wise counts over all pairs loaded so far.
# c[5:] removes the 5-character '<pair>_' prefix that wrangle() puts on
# every column, leaving the bare feature name to compare against.
currency1_above_prev_10days_min_columns = [
    c for c in data.columns
    if c[5:] == 'above_prev_10days_min'
]
currency1_below_prev_10days_max_columns = [
    c for c in data.columns
    if c[5:] == 'below_prev_10days_max'
]

data['currency1_total_above_prev_10days_min'] = (
    data[currency1_above_prev_10days_min_columns].sum(axis=1)
)
data['currency1_total_below_prev_10days_max'] = (
    data[currency1_below_prev_10days_max_columns].sum(axis=1)
)


In [None]:
# Second pass: files whose two characters before the 4-character extension
# equal the second half of `currency`, skipping the target pair itself
# (it was already loaded by the first pass).
for f in os.listdir(dir):
  ends_with_currency2 = f[-6:-4] == currency[2:]
  is_target_pair = f[3:7] == currency
  if ends_with_currency2 and not is_target_pair:
    print(f)
    df = wrangle(f)
    data = pd.concat([df, data], axis=1)
    print(data.shape)
print(data.shape, data.isnull().sum())
print(data.tail(40))

In [None]:
# Add cross-pair summary features to the combined frame.
currency1 = currency[:2]
currency2 = currency[2:]

# Realised-change columns: names containing '_change' but not 'next' (the
# 'next...' columns are forward-looking targets). The list is built once,
# before any total is added, so the derived 'total_*_change' columns can
# never feed back into a later count. Plain `in` / `and` replaces the former
# `find(...) != -1 & find(...) == -1`, which only produced this result by
# accident of operator precedence (`&` binds tighter than the comparisons).
change_columns = [c for c in data.columns if ('_change' in c) and ('next' not in c)]
currency1_change_columns = [c for c in change_columns if currency1 in c]
currency2_change_columns = [c for c in change_columns if (currency2 + '_change') in c]

data['total_positive_change'] = (data[change_columns] > 0).astype(int).sum(axis=1)
data['total_negative_change'] = (data[change_columns] < 0).astype(int).sum(axis=1)
data['total_currency1_up_change'] = (data[currency1_change_columns] > 0).astype(int).sum(axis=1)
# NOTE(review): "down" is counted with `> 0`, unchanged from the original —
# presumably because a rising <xx><currency2> quote means currency2 is
# weakening; confirm that this is the intended reading.
data['total_currency2_down_change'] = (data[currency2_change_columns] > 0).astype(int).sum(axis=1)

# Columns look like '<c1><c2>_<feature>', so c[2:] starts with the second
# currency code. The comparison previously used the literal text
# 'currency2_...' instead of the value of `currency2`; it matched nothing and
# both totals below were always 0.
currency2_above_prev_10days_min_columns = [
    c for c in data.columns if c[2:] == currency2 + '_above_prev_10days_min'
]
currency2_below_prev_10days_max_columns = [
    c for c in data.columns if c[2:] == currency2 + '_below_prev_10days_max'
]
data['currency2_total_above_prev_10days_min'] = data[currency2_above_prev_10days_min_columns].sum(axis=1)
data['currency2_total_below_prev_10days_max'] = data[currency2_below_prev_10days_max_columns].sum(axis=1)

data['total_above_prev_10days_min'] = data['currency1_total_above_prev_10days_min'] + data['currency2_total_above_prev_10days_min']
data['total_below_prev_10days_max'] = data['currency1_total_below_prev_10days_max'] + data['currency2_total_below_prev_10days_max']
# Disabled experiment: the statements below sit inside a bare triple-quoted
# string, so none of them run.
# NOTE(review): the block refers to 'bias_45days' columns, but wrangle() as
# defined in this notebook only creates 5/10/30/60-day bias columns — that
# would need resolving before re-enabling. As the last expression of the cell
# the string itself is presumably echoed as the cell output; confirm and
# consider deleting the block.
'''
bias_5days_columns = [c for c in data.columns if c.find('bias_5days') != -1]
bias_10days_columns = [c for c in data.columns if c.find('bias_10days') != -1]
bias_30days_columns = [c for c in data.columns if c.find('bias_30days') != -1]
bias_45days_columns = [c for c in data.columns if c.find('bias_45days') != -1]
bias_60days_columns = [c for c in data.columns if c.find('bias_60days') != -1]
data['max_bias_5days'] = data[bias_5days_columns].max(axis=1)
data['max_bias_10days'] = data[bias_10days_columns].max(axis=1)
data['max_bias_30days'] = data[bias_30days_columns].max(axis=1)
data['max_bias_45days'] = data[bias_45days_columns].max(axis=1)
data['max_bias_60days'] = data[bias_60days_columns].max(axis=1)
data['min_bias_5days'] = data[bias_5days_columns].min(axis=1)
data['min_bias_10days'] = data[bias_10days_columns].min(axis=1)
data['min_bias_30days'] = data[bias_30days_columns].min(axis=1)
data['min_bias_45days'] = data[bias_45days_columns].min(axis=1)
data['min_bias_60days'] = data[bias_60days_columns].min(axis=1)
data['total_up_bias_5days'] = (data[bias_5days_columns] > 0).astype(int).sum(axis=1)
data['total_up_bias_10days'] = (data[bias_10days_columns] > 0).astype(int).sum(axis=1)
data['total_up_bias_30days'] = (data[bias_30days_columns] > 0).astype(int).sum(axis=1)
data['total_up_bias_45days'] = (data[bias_45days_columns] > 0).astype(int).sum(axis=1)
data['total_up_bias_60days'] = (data[bias_60days_columns] > 0).astype(int).sum(axis=1)
data['total_mean_bias_5days'] = data[bias_5days_columns].mean(axis=1)
data['total_mean_bias_10days'] = data[bias_10days_columns].mean(axis=1)
data['total_mean_bias_30days'] = data[bias_30days_columns].mean(axis=1)
data['total_mean_bias_45days'] = data[bias_45days_columns].mean(axis=1)
data['total_mean_bias_60days'] = data[bias_60days_columns].mean(axis=1)
'''


In [None]:
# Report the size and per-column missing counts, then keep only the rows
# that carry at least 60 non-null values.
print(data.shape, data.isnull().sum())
data = data.dropna(axis=0, thresh=60)
data

In [None]:
# Discard the first 60 remaining rows by position.
# NOTE(review): presumably this removes the warm-up of the 60-row rolling
# features — confirm. The cell is not idempotent: every re-run drops 60 more.
data = data.iloc[60:]

## Prepare training and test data

In [None]:
# Keep only rows whose target is known, then separate features from target.
# Every 'next5days' column (from any pair) is forward-looking, so all of them
# are removed from the feature matrix, not just the target itself.
target = currency + '_next5days_change'
data1 = data[data[target].notna()].copy()
forward_looking = [c for c in data1.columns if 'next5days' in c]
X = data1.drop(columns=forward_looking)
y = data1[target]
X.dtypes

In [None]:
# Hold out 20% of the rows for validation. The seed is fixed so that a
# Restart & Run All reproduces the same split and therefore the same metrics
# (42 matches the seed already used for the estimators in this notebook).
# NOTE(review): train_test_split shuffles by default, while the target is a
# 5-row-ahead change, so neighbouring rows with overlapping target windows
# can land on both sides of the split — consider a chronological split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

### Target training data distribution

In [None]:
# Histogram of the training target values.
y_train.plot.hist()

# Ridge Regression

## Baseline error

In [None]:
# Naive benchmark: predict the training mean for every training row.
mean_prediction = np.full(len(y_train), y_train.mean())
baseline = mean_absolute_error(y_train, mean_prediction)
print('Baseline error: ', baseline)

In [None]:
# Impute missing values, standardise, then fit a ridge regression with its
# default settings.
ridge_steps = (SimpleImputer(), StandardScaler(), Ridge(random_state=42))
model_lr = make_pipeline(*ridge_steps)
model_lr.fit(X_train, y_train)

## Ridge absolute error

In [None]:
# Mean absolute error of the untuned ridge model on both partitions.
for label, features, truth in (("train error: ", X_train, y_train),
                               ("validation error: ", X_val, y_val)):
  print(label, mean_absolute_error(truth, model_lr.predict(features)))


## Tuning Ridge Regression

In [None]:
# Search the ridge penalty strength with 10-fold cross-validation.
clf = make_pipeline(
    SimpleImputer(),
    StandardScaler(),
    Ridge(random_state=42),
)

# Only alpha is searched. The estimator's random seed was previously part of
# the grid, but a seed is not a hyper-parameter: choosing the "best" seed
# fits noise and tripled the grid with near-duplicate candidates.
# NOTE(review): per the scikit-learn docs Ridge only consults random_state
# for the 'sag'/'saga' solvers, so with the default solver the three seeds
# should have given identical fits — confirm.
param_grid = {
    'ridge__alpha': np.arange(16000,20000,20)
}

# The grid holds 200 alpha values and n_iter is 200, so every candidate is
# evaluated; random_state pins the sampler so the run is reproducible.
model_lrrs = RandomizedSearchCV(
    clf,
    param_distributions = param_grid,
    n_jobs = -1,
    cv = 10,
    verbose = 1,
    n_iter = 200,
    random_state = 42
)

model_lrrs.fit(X_train, y_train)

In [None]:
# Cross-validated score and parameters of the best ridge candidate.
best_score, best_params = model_lrrs.best_score_, model_lrrs.best_params_

print('Best score for `model`:', best_score)
print('Best params for `model`:', best_params)

### Tuned Ridge absolute error

In [None]:
# Mean absolute error of the tuned ridge search on both partitions.
train_error, validation_error = (
    mean_absolute_error(truth, model_lrrs.predict(features))
    for truth, features in ((y_train, X_train), (y_val, X_val))
)
print("train error: ", train_error)
print("validation error: ", validation_error)

# Classification models

## Prepare the classification target: -1, 0 or 1 (sell, none, buy)

## Baseline score

In [None]:
# Bucket the 5-day change into three classes:
#   -1 (sell) when change <= -0.5%,  0 (none) when -0.5% < change < 0.5%,
#    1 (buy)  when change >= 0.5%.
y_logistic = data1[target].apply(lambda x: -1 if x <= -0.005 else(0 if x < 0.005 else 1))

# The seed is fixed so the classification split, and every accuracy figure
# derived from it, is reproducible across runs.
# Note: this overwrites X_train / X_val from the regression section, while
# y_train / y_val still belong to the earlier split; re-running a regression
# cell after this one pairs mismatched rows unless both splits use the same
# seed.
X_train, X_val, y_train_c, y_val_c = train_test_split(X, y_logistic, test_size=0.2, random_state=42)
X_train.dtypes, y_train_c.dtypes

In [None]:
# Majority-class benchmark: the share of the most frequent training class.
class_shares = y_train_c.value_counts(normalize=True)
baseline = class_shares.max()
print('Baseline: ', baseline)

# Logistic Regression

In [None]:
# Impute, standardise, then fit a logistic regression with default settings.
logistic_steps = (
    SimpleImputer(),
    StandardScaler(),
    LogisticRegression(random_state=42, n_jobs=-1),
)
model_lg = make_pipeline(*logistic_steps)
model_lg.fit(X_train, y_train_c)

## Accuracy score

In [None]:
# Accuracy of the untuned logistic regression on both partitions.
for label, features, truth in (("train accuracy_score: ", X_train, y_train_c),
                               ("validation accuracy_score: ", X_val, y_val_c)):
  print(label, accuracy_score(truth, model_lg.predict(features)))

## Tuning LogisticRegression

In [None]:
# Cross-validated search over the logistic regression's iteration cap.
clf = make_pipeline(SimpleImputer(), StandardScaler(), LogisticRegression(random_state=42, n_jobs=-1))

param_grid = {'logisticregression__max_iter': range(60, 400, 10)}

# NOTE(review): the grid above holds 34 values while n_iter asks for 200
# draws — confirm how RandomizedSearchCV treats a grid smaller than n_iter.
search_options = dict(
    param_distributions=param_grid,
    n_jobs=-1,
    cv=10,
    verbose=1,
    n_iter=200,
)
model_lgrs = RandomizedSearchCV(clf, **search_options)

model_lgrs.fit(X_train, y_train_c)

In [None]:
# Cross-validated score and parameters of the best logistic candidate.
best_score, best_params = model_lgrs.best_score_, model_lgrs.best_params_

print('Best score for `model`:', best_score)
print('Best params for `model`:', best_params)

### Tuned accuracy score

In [None]:
# Accuracy of the tuned logistic search on both partitions.
train_accuracy, validation_accuracy = (
    accuracy_score(truth, model_lgrs.predict(features))
    for truth, features in ((y_train_c, X_train), (y_val_c, X_val))
)
print("train accuracy: ", train_accuracy)
print("validation accuracy: ", validation_accuracy)

# RandomForest Classifier

In [None]:
# Ordinal-encode, impute, then fit a random forest with default depth/size.
forest_steps = (
    OrdinalEncoder(),
    SimpleImputer(),
    RandomForestClassifier(n_jobs=-1, random_state=42),
)
model_rf = make_pipeline(*forest_steps)

model_rf.fit(X_train, y_train_c)

## Accuracy score

In [None]:
# Accuracy of the untuned random forest on both partitions.
train_accuracy, validation_accuracy = (
    accuracy_score(truth, model_rf.predict(features))
    for truth, features in ((y_train_c, X_train), (y_val_c, X_val))
)
print("train accuracy: ", train_accuracy)
print("validation accuracy: ", validation_accuracy)

In [None]:
# Show any feature columns stored with object dtype.
X.select_dtypes(include=['object'])

## Feature importance

In [None]:
# Pull the fitted forest out of the pipeline and chart its ten largest
# feature importances, labelled with the training column names.
forest = model_rf.named_steps['randomforestclassifier']
importances = forest.feature_importances_
columns = X_train.columns
df_importances = pd.DataFrame(data=importances, index=columns, columns=["importance"])
df_importances.abs().sort_values(by=['importance']).tail(10).plot(kind="barh")


In [None]:
# The 40 largest importances, in ascending order.
df_importances.abs().sort_values(by=['importance']).tail(40)

## Tuning RandomForestClassifier

In [None]:
# Randomised search over forest size and depth with 5-fold cross-validation.
clf = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    RandomForestClassifier(n_jobs=-1, random_state=42)
)

# 20 tree counts x 18 depths = 360 combinations; only 50 are sampled.
param_grid = {
    'randomforestclassifier__n_estimators': range(400, 600, 10),
    'randomforestclassifier__max_depth': range(5,40,2),
}

# random_state pins which 50 combinations are drawn. Without it every run
# evaluated a different subset, so best_params_ and the accuracies reported
# afterwards were not reproducible even though the forest itself is seeded.
model_rfrs = RandomizedSearchCV(
    clf,
    param_distributions = param_grid,
    n_jobs = -1,
    cv = 5,
    verbose = 1,
    n_iter = 50,
    random_state = 42
)

model_rfrs.fit(X_train, y_train_c)

In [None]:
# Cross-validated score and parameters of the best forest candidate.
best_score, best_params = model_rfrs.best_score_, model_rfrs.best_params_

print('Best score for `model`:', best_score)
print('Best params for `model`:', best_params)

### Tuned accuracy score

In [None]:
# Accuracy of the tuned forest search on both partitions.
train_accuracy, validation_accuracy = (
    accuracy_score(truth, model_rfrs.predict(features))
    for truth, features in ((y_train_c, X_train), (y_val_c, X_val))
)
print("train accuracy: ", train_accuracy)
print("validation accuracy: ", validation_accuracy)

# Gradient Boosting Classifier

In [None]:
# Despite its name, `model_xgb` wraps scikit-learn's
# GradientBoostingClassifier (100 trees), not an XGBoost estimator.
boosting_steps = (
    OrdinalEncoder(),
    SimpleImputer(),
    GradientBoostingClassifier(random_state=42, n_estimators=100),
)
model_xgb = make_pipeline(*boosting_steps)

model_xgb.fit(X_train, y_train_c)

## Accuracy score

In [None]:
# Accuracy of the untuned gradient boosting model on both partitions.
train_accuracy, validation_accuracy = (
    accuracy_score(truth, model_xgb.predict(features))
    for truth, features in ((y_train_c, X_train), (y_val_c, X_val))
)
print("train accuracy: ", train_accuracy)
print("validation accuracy: ", validation_accuracy)

## Tuning Gradient Boosting Classifier

In [None]:
# Cross-validated search over the boosting model's size and tree depth.
clf = make_pipeline(OrdinalEncoder(), SimpleImputer(), GradientBoostingClassifier(random_state=42))

# 4 tree counts x 10 depths = 40 combinations.
# NOTE(review): that is fewer than the 100 draws requested through n_iter —
# confirm how RandomizedSearchCV treats a grid smaller than n_iter.
param_grid = {
    'gradientboostingclassifier__n_estimators': range(80, 120, 10),
    'gradientboostingclassifier__max_depth': range(10,20,1),
}

search_options = dict(
    param_distributions=param_grid,
    n_jobs=-1,
    cv=5,
    verbose=1,
    n_iter=100,
)
model_gbrs = RandomizedSearchCV(clf, **search_options)

model_gbrs.fit(X_train, y_train_c)

In [None]:
# Cross-validated score and parameters of the best boosting candidate.
best_score, best_params = model_gbrs.best_score_, model_gbrs.best_params_

print('Best score for `model`:', best_score)
print('Best params for `model`:', best_params)

### Tuned accuracy score

In [None]:
# Accuracy of the tuned boosting search on both partitions.
train_accuracy, validation_accuracy = (
    accuracy_score(truth, model_gbrs.predict(features))
    for truth, features in ((y_train_c, X_train), (y_val_c, X_val))
)
print("train accuracy: ", train_accuracy)
print("validation accuracy: ", validation_accuracy)

### Confusion Matrix

In [None]:
# Validation confusion matrix for the tuned boosting search.
# NOTE(review): display_labels are positional; the order sell / None / buy
# assumes the classes are laid out as -1, 0, 1 — confirm.
class_names = ['sell', 'None', 'buy']
plot_confusion_matrix(model_gbrs, X_val, y_val_c, values_format='.0f', display_labels=class_names)

#### Precision

In [None]:
# Per-class precision computed from the model's own validation predictions.
# These figures used to be typed in by hand from one run's confusion matrix
# (and the sell numerator 151 disagreed with the 150 in its denominator), so
# they went stale whenever the data, split or search changed.
# labels=[-1, 1] with average=None returns one precision per listed class,
# in that order: sell (-1) first, then buy (1).
sell_precision, buy_precision = precision_score(
    y_val_c,
    model_gbrs.predict(X_val),
    labels=[-1, 1],
    average=None,
)
print('Sell precision: ', sell_precision)
print('Buy precision: ', buy_precision)