## Back Test Data Generation

In [1]:
import requests
import time
import calendar
import dateutil.parser as parser
from dateutil.relativedelta import relativedelta
from datetime import datetime, timezone
import pandas as pd
import numpy as np
import warnings
import yaml
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import pytz
warnings.filterwarnings('ignore')

### Settings

In [2]:
with open ('back_test_pipeline_settings.yaml') as ymlfile:
    cfg = yaml.safe_load(ymlfile)
    k_number = cfg['knn']['k_number']
    metric = cfg['knn']['metric']
    algorithm = cfg['knn']['algorithm']
    feature_1 = cfg['knn']['feature_1']
    feature_2 = cfg['knn']['feature_2']
    feature_3 = cfg['knn']['feature_3']
    feature_7 = cfg['knn']['feature_7']
    feature_8 = cfg['knn']['feature_8']   
    volume = cfg['feature']['volume']
    volume_size = cfg['sample']['volume_size']
    sample_count = cfg['sample']['count']
    candles = cfg['recommendation']['candle_count']
    pair = cfg['currency']['pair']
    instrument = cfg['currency']['instrument']

In [3]:
print('K Number:',k_number)
print('Metric:', metric)
print('Algorithm:', algorithm)
print('Candle Volume Size:', volume_size)
print('Random Sample Count:', sample_count)
print('Future Candle Count:', candles)
print('Pair:', pair)
print('Instrument:', instrument)

K Number: 5
Metric: euclidean
Algorithm: brute
Candle Volume Size: 5500
Random Sample Count: 4000
Future Candle Count: 7
Pair: EURUSD
Instrument: EUR_USD


In [4]:
def convert_date(utc_time): 
    parsed_date = parser.parse(utc_time)
    var_date=parsed_date.date()
    var_time=parsed_date.time()
    var_f_time=var_time.hour
    var_julian_date=parsed_date.timetuple().tm_yday
    var_weekday=parsed_date.weekday()
    var_weekday_name=calendar.day_name[parsed_date.weekday()]
    return var_date, var_time, var_f_time, var_julian_date, var_weekday, var_weekday_name

In [5]:
def find_k_similar_candles(candle_id, dataset, k = k_number):
    indices=[]
    distances = []
    output = []
    model_knn = NearestNeighbors(metric = metric, algorithm = algorithm) 
    model_knn.fit(dataset)
    
    #metric = 'euclidean' or 'cosine' or 'manhattan' or 'mahalanobis'
    
    distances, indices = model_knn.kneighbors(dataset.iloc[candle_id,:].values.reshape(1,-1),
                                              n_neighbors = k)

    for i in range(0,len(distances.flatten())):
        if i!=0:
            
            output.append ([dataset.index[indices.flatten()[i]],
                            distances.flatten()[i],
                            dataset.iloc[indices.flatten()[i]][feature_1],
                            dataset.iloc[indices.flatten()[i]][feature_2],
                            dataset.iloc[indices.flatten()[i]][feature_3],
#                            dataset.iloc[indices.flatten()[i]][feature_7],
#                            dataset.iloc[indices.flatten()[i]][feature_8],                            
                           ])
    
    output = pd.DataFrame(output)
    output.columns = ['Indice','Distance',
                      feature_1,
                      feature_2,
                      feature_3,
#                      feature_7,
#                      feature_8,
                     ]
   # display (output)
    
    return indices, distances

# <font color='red'>Test Configs</font>

In [6]:
filename = '{}_H4.csv'.format(instrument)
data = pd.read_csv(filename)

In [7]:
data.columns

Index(['Date', 'Time', 'f_time', 'julian_date', 'Weekday', 'Weekday_Name',
       'UTC_Time', 'Volume', 'Open', 'High', 'Low', 'Close', 'SMA_5', 'SMA_10',
       'SMA_20', 'F_SMA_5', 'F_SMA_10', 'F_SMA_20', 'O-H', 'O-L', 'O-C', 'H-L',
       'H-C', 'L-C', 'Direction', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5'],
      dtype='object')

In [8]:
data.shape

(9647, 30)

In [9]:
data.head()

Unnamed: 0,Date,Time,f_time,julian_date,Weekday,Weekday_Name,UTC_Time,Volume,Open,High,...,O-C,H-L,H-C,L-C,Direction,col_1,col_2,col_3,col_4,col_5
0,2015-12-03,02:00:00,2,337,3,Thursday,2015-12-03T02:00:00.000000000Z,1656,1.06019,1.06024,...,0.00143,0.00148,0.00148,0.0,0,0.00143,0.00148,0.00143,0.00138,-0.00299
1,2015-12-03,06:00:00,6,337,3,Thursday,2015-12-03T06:00:00.000000000Z,5040,1.05878,1.05948,...,0.00265,0.0042,0.00335,-0.00085,0,0.00265,0.00335,0.0035,0.00143,0.00138
2,2015-12-03,10:00:00,10,337,3,Thursday,2015-12-03T10:00:00.000000000Z,37142,1.05612,1.08934,...,-0.02629,0.03697,0.00693,-0.03004,1,-0.02629,0.00693,0.00375,0.00265,0.00143
3,2015-12-03,14:00:00,14,337,3,Thursday,2015-12-03T14:00:00.000000000Z,26330,1.0824,1.09424,...,-0.00778,0.01532,0.00406,-0.01126,1,-0.00778,0.00406,0.00348,-0.02629,0.00265
4,2015-12-03,18:00:00,18,337,3,Thursday,2015-12-03T18:00:00.000000000Z,11503,1.09018,1.09813,...,-0.00373,0.00795,0.00422,-0.00373,1,-0.00373,0.00422,0.0,-0.00778,-0.02629


In [10]:
data.describe()

Unnamed: 0,f_time,julian_date,Weekday,Volume,Open,High,Low,Close,SMA_5,SMA_10,...,O-C,H-L,H-C,L-C,Direction,col_1,col_2,col_3,col_4,col_5
count,9647.0,9647.0,9647.0,9647.0,9647.0,9647.0,9647.0,9647.0,9647.0,9647.0,...,9647.0,9647.0,9647.0,9647.0,9647.0,9647.0,9647.0,9647.0,9647.0,9647.0
mean,11.366746,181.956567,2.067378,9001.125013,1.142878,1.144333,1.141457,1.142889,1.142874,1.142854,...,-1.1e-05,0.002876,0.001444,-0.001433,0.505546,-1.1e-05,0.001444,0.001422,-1.1e-05,-1.1e-05
std,6.849819,107.155833,1.54577,9291.374912,0.045437,0.045399,0.045435,0.045428,0.045406,0.045386,...,0.00209,0.002027,0.001516,0.001495,0.499995,0.00209,0.001516,0.001561,0.00209,0.00209
min,1.0,1.0,0.0,1.0,1.03695,1.03915,1.03406,1.03698,1.0387,1.0395,...,-0.02738,0.0,0.0,-0.03004,0.0,-0.02738,0.0,0.0,-0.02738,-0.02738
25%,5.0,88.0,1.0,2886.0,1.11105,1.112505,1.109655,1.1111,1.1111,1.1111,...,-0.00098,0.00154,0.00049,-0.0019,0.0,-0.00098,0.00049,0.00043,-0.00098,-0.00098
50%,13.0,182.0,2.0,5640.0,1.13498,1.13644,1.13361,1.13498,1.1348,1.1347,...,-2e-05,0.00238,0.001,-0.00101,1.0,-2e-05,0.001,0.00095,-2e-05,-2e-05
75%,17.0,275.0,3.0,11901.5,1.1787,1.18006,1.177275,1.17869,1.1786,1.1785,...,0.00091,0.00363,0.0019,-0.00048,1.0,0.00091,0.0019,0.0019,0.00091,0.00091
max,22.0,366.0,6.0,107306.0,1.25493,1.2556,1.25234,1.25494,1.2514,1.2491,...,0.03281,0.03852,0.03486,0.0,1.0,0.03281,0.03486,0.03647,0.03281,0.03281


In [11]:
volume_med = data['Volume'].median()
volume_med

5640.0

## Selecting n random candles where their volume is more than 5500

In [12]:
print('Candle Volume Size:', volume_size)
print('Random Sample Count:', sample_count)

Candle Volume Size: 5500
Random Sample Count: 4000


In [13]:
random_samples = data[data[volume] > volume_size].sample(n = sample_count)

In [14]:
#Random_Candles = np.random.randint(low=1, high=len(data)-40, size=1000)
Random_Candles = list(random_samples.index.values)

In [15]:
#show the fisrt 10 random generated candle numbers
Random_Candles[0:10]

[8317, 7211, 8658, 2109, 6977, 4606, 8353, 3335, 6988, 8530]

# <font color='red'>CANDLE LOOP</font>

In [16]:
CST = pytz.timezone('America/Chicago')
datetime_cst = datetime.now(CST)
print("Date & Time in CST : ", 
      datetime_cst.strftime('%Y:%m:%d %H:%M:%S %Z %z'))

Date & Time in CST :  2022:02:16 19:05:16 CST -0600


In [17]:
%%time

result_output = pd.DataFrame({'Candle_No':[],
                              'Current_Market_Fit':[],
                              'Current_Market':[],
                              
                              'Rec1_Score':[],
                              'Rec1_HH':[],
                              'Rec1_LL':[],
                              
                              'Rec2_Score':[],
                              'Rec2_HH':[],
                              'Rec2_LL':[],
                              
                              'Rec3_Score':[],
                              'Rec3_HH':[],
                              'Rec3_LL':[],
                              
                              'Rec4_Score':[],
                              'Rec4_HH':[],
                              'Rec4_LL':[],
                             })

for candle_no in Random_Candles:
    data = pd.read_csv(filename)
    data = data.iloc[candle_no:candle_no+candles]
    data['candleno'] = range (1, len(data) + 1)
    X = data['candleno'].values.reshape(-1, 1)
    Y = data['Close'].values.reshape(-1, 1)
    linear_regressor = LinearRegression()
    linear_regressor.fit(X, Y)
    y_pred = linear_regressor.predict(X) 
    
    Current_Market_Fit = int(r2_score(Y, y_pred)*100)
    #print(Current_Market_Fit)
    coeficient = (linear_regressor.coef_)

    if coeficient > 0:
        Current_Market = 1  ## Bullish / Buy ##
    else:
        Current_Market = 0  ## Bearish / Sell ##
    
    data = pd.read_csv(filename)
    data = data[[feature_1,
                 feature_2,
                 feature_3,
#                 feature_7,
#                 feature_8,
                ]]

    indices, distances = find_k_similar_candles (candle_no,data)
    indices = indices[0:1][0]
    
    predicted_output = []

    for indice in indices[1:5]:
             
        Predicted_Market_Fit =0
        Predicted_Trade=''
    
        data = pd.read_csv(filename) 
        data = data.iloc[indice:indice+candles]
        
        HH = data.iloc[0]['Close'] - data['High'].max()
        LL = data.iloc[0]['Close'] - data['Low'].min()
#        print("Close:", data.iloc[0]['Close'] , "High: ", data['High'].max(), 'Low: ', data['Low'].min())

        data['candleno'] = range (1, len(data) + 1)
        X = data['candleno'].values.reshape(-1, 1)
        Y = data['Close'].values.reshape(-1, 1)
        linear_regressor = LinearRegression()
        linear_regressor.fit(X, Y)
        y_pred = linear_regressor.predict(X)

        Predicted_Market_Fit= r2_score(Y, y_pred)*100
        coeficient = (linear_regressor.coef_)

        if coeficient > 0:
            Predicted_Trade = 1    ## Buy ##
        else:
            Predicted_Trade = -1   ## Sell ##
        
        predicted_output.append([int(Predicted_Market_Fit) * Predicted_Trade, HH.round(4), LL.round(4)])
        
    result = {'Candle_No': candle_no,
              'Current_Market_Fit': Current_Market_Fit,
              'Current_Market': Current_Market,
              
              'Rec1_Score': predicted_output[0][0],
              'Rec1_HH': predicted_output[0][1],
              'Rec1_LL': predicted_output[0][2],
              
              'Rec2_Score': predicted_output[1][0],
              'Rec2_HH': predicted_output[1][1],
              'Rec2_LL': predicted_output[1][2],
              
              'Rec3_Score': predicted_output[2][0],
              'Rec3_HH': predicted_output[2][1],
              'Rec3_LL': predicted_output[2][2],
              
              'Rec4_Score': predicted_output[3][0],
              'Rec4_HH': predicted_output[3][1],
              'Rec4_LL': predicted_output[3][2],
             }
    
    result_output = result_output.append(result, ignore_index = True)

CPU times: user 2h 49min 59s, sys: 4h 12min 50s, total: 7h 2min 50s
Wall time: 24min 45s


In [18]:
now = datetime.now()
today = now.strftime("%d-%m-%Y_%I-%M_%p")

In [19]:
result_output.to_csv('01_Back_Test_Data.csv', header = True, index = False)
result_output.to_csv(today + "_" + "Back_Test_Data_" + pair + '.csv', header = True, index = False)
result_output = pd.read_csv(today + "_" + "Back_Test_Data_" + pair + '.csv')

In [20]:
result_output.head(10)

Unnamed: 0,Candle_No,Current_Market_Fit,Current_Market,Rec1_Score,Rec1_HH,Rec1_LL,Rec2_Score,Rec2_HH,Rec2_LL,Rec3_Score,Rec3_HH,Rec3_LL,Rec4_Score,Rec4_HH,Rec4_LL
0,8317.0,59.0,1.0,-37.0,-0.0006,0.0041,79.0,-0.0056,0.0015,78.0,-0.0051,0.0013,92.0,-0.0058,0.0012
1,7211.0,94.0,1.0,71.0,-0.0063,0.0033,-5.0,-0.0004,0.0068,-42.0,-0.0005,0.0086,29.0,-0.0028,0.0044
2,8658.0,7.0,1.0,-50.0,-0.0042,0.0057,-23.0,-0.0035,0.0017,-54.0,-0.0034,0.0043,-8.0,-0.0033,0.0042
3,2109.0,15.0,1.0,10.0,-0.0048,0.0012,0.0,-0.0014,0.0057,-55.0,-0.0054,0.0046,76.0,-0.0101,0.0013
4,6977.0,71.0,1.0,63.0,-0.005,0.0013,80.0,-0.0057,0.0018,-66.0,-0.0032,0.0028,46.0,-0.0042,0.0014
5,4606.0,3.0,1.0,90.0,-0.0045,0.0018,35.0,-0.0083,0.0006,3.0,-0.0031,0.0022,78.0,-0.0051,0.0006
6,8353.0,4.0,0.0,41.0,-0.0032,0.0028,-13.0,-0.0012,0.0026,-84.0,-0.0014,0.0073,-53.0,-0.0065,0.0058
7,3335.0,76.0,0.0,0.0,-0.0042,0.0058,-65.0,-0.0014,0.0036,24.0,-0.0037,0.0033,-48.0,-0.0009,0.0064
8,6988.0,1.0,1.0,64.0,-0.0062,0.0019,-64.0,-0.0017,0.0054,-42.0,-0.0017,0.0079,-70.0,-0.0016,0.0088
9,8530.0,60.0,0.0,-64.0,-0.0017,0.0049,-22.0,-0.0011,0.0041,31.0,-0.0052,0.0018,-58.0,-0.0012,0.0084


In [21]:
result_output.shape

(4000, 15)

#### Generating Log File

In [22]:
file = open(today + "_" + "data_generation_log_" + pair + '.txt', "w")
file.write ("Date: " + today + "\n" + \
            "Currency Pair: " + pair + "\n" + \
            "K_Number: " + str(k_number) + "\n" + \
            "KNN_Metric: " + metric + "\n" + \
            "KNN_Algorithm: " + algorithm + "\n" + \
            "Feature: " + feature_1 + "\n" + \
            "Feature: " + feature_2 + "\n" + \
            "Feature: " + feature_3 + "\n" + \
            "Feature: " + feature_7 + "\n" + \
            "Feature: " + feature_8 + "\n" + \
            "Volume Size: " + str(volume_size) + "\n" + \
            "Sample Count: " + str(sample_count) + "\n" + \
            "Candle Counts: " + str(candles) + "\n"
           )
file.close()