## Back Test Data Generation

In [1]:
import requests
import time
import calendar
import dateutil.parser as parser
from dateutil.relativedelta import relativedelta
from datetime import datetime, timezone
import pandas as pd
import numpy as np
import warnings
import yaml
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import pytz
warnings.filterwarnings('ignore')

### Settings

In [2]:
with open ('back_test_pipeline_settings.yaml') as ymlfile:
    cfg = yaml.safe_load(ymlfile)
    k_number = cfg['knn']['k_number']
    metric = cfg['knn']['metric']
    algorithm = cfg['knn']['algorithm']
    feature_1 = cfg['knn']['feature_1']
    feature_2 = cfg['knn']['feature_2']
    feature_3 = cfg['knn']['feature_3']
    feature_7 = cfg['knn']['feature_7']
    feature_8 = cfg['knn']['feature_8']
    feature_15 = cfg['knn']['feature_15']
    volume = cfg['feature']['volume']
    volume_size = cfg['sample']['volume_size']
    sample_count = cfg['sample']['count']
    candles = cfg['recommendation']['candle_count']
    pair = cfg['currency']['pair']
    instrument = cfg['currency']['instrument']

In [3]:
print('K Number:',k_number)
print('Metric:', metric)
print('Algorithm:', algorithm)
print('Candle Volume Size:', volume_size)
print('Random Sample Count:', sample_count)
print('Future Candle Count:', candles)
print('Pair:', pair)
print('Instrument:', instrument)

K Number: 5
Metric: euclidean
Algorithm: brute
Candle Volume Size: 5500
Random Sample Count: 4000
Future Candle Count: 7
Pair: EURUSD
Instrument: EUR_USD


In [4]:
def convert_date(utc_time): 
    parsed_date = parser.parse(utc_time)
    var_date=parsed_date.date()
    var_time=parsed_date.time()
    var_f_time=var_time.hour
    var_julian_date=parsed_date.timetuple().tm_yday
    var_weekday=parsed_date.weekday()
    var_weekday_name=calendar.day_name[parsed_date.weekday()]
    return var_date, var_time, var_f_time, var_julian_date, var_weekday, var_weekday_name

In [5]:
def find_k_similar_candles(candle_id, dataset, k = k_number):
    indices=[]
    distances = []
    output = []
    model_knn = NearestNeighbors(metric = metric, algorithm = algorithm) 
    model_knn.fit(dataset)
    
    #metric = 'euclidean' or 'cosine' or 'manhattan' or 'mahalanobis'
    
    distances, indices = model_knn.kneighbors(dataset.iloc[candle_id,:].values.reshape(1,-1),
                                              n_neighbors = k)

    for i in range(0,len(distances.flatten())):
        if i!=0:
            
            output.append ([dataset.index[indices.flatten()[i]],
                            distances.flatten()[i],
                            dataset.iloc[indices.flatten()[i]][feature_1],
                            dataset.iloc[indices.flatten()[i]][feature_2],
                            dataset.iloc[indices.flatten()[i]][feature_3],
                            dataset.iloc[indices.flatten()[i]][feature_15],
#                            dataset.iloc[indices.flatten()[i]][feature_8],                            
                           ])
    
    output = pd.DataFrame(output)
    output.columns = ['Indice','Distance',
                      feature_1,
                      feature_2,
                      feature_3,
                      feature_15,
#                      feature_8,
                     ]
   # display (output)
    
    return indices, distances

# <font color='red'>Test Configs</font>

In [6]:
filename = '{}_H4.csv'.format(instrument)
data = pd.read_csv(filename)

In [7]:
data.columns

Index(['Date', 'Time', 'f_time', 'julian_date', 'Weekday', 'Weekday_Name',
       'UTC_Time', 'Volume', 'Open', 'High', 'Low', 'Close', 'SMA_5', 'SMA_10',
       'SMA_20', 'F_SMA_5', 'F_SMA_10', 'F_SMA_20', 'O-H', 'O-L', 'O-C', 'H-L',
       'H-C', 'L-C', 'Direction', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5',
       'Trend'],
      dtype='object')

In [8]:
data.shape

(9651, 31)

In [9]:
data.head()

Unnamed: 0,Date,Time,f_time,julian_date,Weekday,Weekday_Name,UTC_Time,Volume,Open,High,...,H-L,H-C,L-C,Direction,col_1,col_2,col_3,col_4,col_5,Trend
0,2015-12-10,18:00:00,18,344,3,Thursday,2015-12-10T18:00:00.000000000Z,3356,1.09469,1.09566,...,0.00264,0.00157,-0.00107,0,0.0006,0.00097,0.00107,0.00236,-0.00032,0.0
1,2015-12-10,22:00:00,22,344,3,Thursday,2015-12-10T22:00:00.000000000Z,1840,1.09407,1.09477,...,0.00211,0.00153,-0.00058,0,0.00083,0.0007,0.00058,0.0006,0.00236,0.0
2,2015-12-11,02:00:00,2,345,4,Friday,2015-12-11T02:00:00.000000000Z,829,1.09327,1.09442,...,0.00166,0.00068,-0.00098,1,-0.00047,0.00115,0.00098,0.00083,0.0006,0.0
3,2015-12-11,06:00:00,6,345,4,Friday,2015-12-11T06:00:00.000000000Z,5337,1.0937,1.09703,...,0.00351,0.00268,-0.00083,1,-0.00065,0.00333,0.00083,-0.00047,0.00083,0.0
4,2015-12-11,10:00:00,10,345,4,Friday,2015-12-11T10:00:00.000000000Z,10131,1.09437,1.09781,...,0.00429,0.00253,-0.00176,1,-0.00091,0.00344,0.00176,-0.00065,-0.00047,0.0


In [10]:
data.describe()

Unnamed: 0,f_time,julian_date,Weekday,Volume,Open,High,Low,Close,SMA_5,SMA_10,...,H-L,H-C,L-C,Direction,col_1,col_2,col_3,col_4,col_5,Trend
count,9651.0,9651.0,9651.0,9651.0,9651.0,9651.0,9651.0,9651.0,9651.0,9651.0,...,9651.0,9651.0,9651.0,9651.0,9651.0,9651.0,9651.0,9651.0,9651.0,9651.0
mean,11.368666,180.882499,2.067351,9087.967465,1.143035,1.144483,1.141615,1.14304,1.143036,1.143029,...,0.002869,0.001443,-0.001425,0.505025,-5e-06,0.001449,0.001425,-5e-06,-6e-06,0.46389
std,6.849653,107.027989,1.54545,9438.474027,0.045309,0.04528,0.045304,0.045307,0.045273,0.04524,...,0.001988,0.001513,0.00146,0.500001,0.002069,0.001539,0.00146,0.002069,0.002068,56.315481
min,1.0,1.0,0.0,1.0,1.03695,1.03915,1.03406,1.03698,1.0387,1.0395,...,0.0,0.0,-0.02835,0.0,-0.02738,0.0,0.0,-0.02738,-0.02738,-99.0
25%,5.0,86.0,1.0,2892.0,1.11171,1.1131,1.110165,1.11174,1.11175,1.11155,...,0.00154,0.00049,-0.00189,0.0,-0.00097,0.00046,0.00048,-0.00097,-0.00097,-50.0
50%,13.0,180.0,2.0,5673.0,1.13516,1.13664,1.13368,1.13513,1.135,1.1349,...,0.00238,0.001,-0.00101,1.0,-2e-05,0.00101,0.00101,-2e-05,-2e-05,0.0
75%,17.0,274.0,3.0,12001.0,1.1787,1.180055,1.17727,1.178685,1.1786,1.1785,...,0.00363,0.0019,-0.00048,1.0,0.00091,0.00192,0.00189,0.00091,0.00091,52.0
max,22.0,366.0,6.0,107306.0,1.25493,1.2556,1.25234,1.25494,1.2514,1.2491,...,0.03852,0.03486,0.0,1.0,0.03281,0.02982,0.02835,0.03281,0.03281,99.0


In [11]:
volume_med = data['Volume'].median()
volume_med

5673.0

## Selecting n random candles where their volume is more than 5500

In [12]:
print('Candle Volume Size:', volume_size)
print('Random Sample Count:', sample_count)

Candle Volume Size: 5500
Random Sample Count: 4000


In [13]:
random_samples = data[data[volume] > volume_size].sample(n = sample_count)

In [14]:
#Random_Candles = np.random.randint(low=1, high=len(data)-40, size=1000)
Random_Candles = list(random_samples.index.values)

In [15]:
#show the fisrt 10 random generated candle numbers
Random_Candles[0:10]

[851, 6600, 9216, 7297, 4589, 6396, 4638, 9057, 7370, 9332]

# <font color='red'>CANDLE LOOP</font>

In [16]:
CST = pytz.timezone('America/Chicago')
datetime_cst = datetime.now(CST)
print("Date & Time in CST : ", 
      datetime_cst.strftime('%Y:%m:%d %H:%M:%S %Z %z'))

Date & Time in CST :  2022:02:24 09:47:14 CST -0600


In [17]:
%%time

result_output = pd.DataFrame({'Candle_No':[],
                              'Current_Market_Fit':[],
                              'Current_Market':[],
                              
                              'Rec1_Close_Score':[],
                              'Rec1_High_Score':[],
                              'Rec1_Low_Score':[],
                              'Rec1_HH':[],
                              'Rec1_LL':[],
                              
                              'Rec2_Close_Score':[],
                              'Rec2_High_Score':[],
                              'Rec2_Low_Score':[],
                              'Rec2_HH':[],
                              'Rec2_LL':[],
                              
                              'Rec3_Close_Score':[],
                              'Rec3_High_Score':[],
                              'Rec3_Low_Score':[],
                              'Rec3_HH':[],
                              'Rec3_LL':[],
                              
                              'Rec4_Close_Score':[],
                              'Rec4_High_Score':[],
                              'Rec4_Low_Score':[],
                              'Rec4_HH':[],
                              'Rec4_LL':[],
                             })

for candle_no in Random_Candles:
    data = pd.read_csv(filename)
    data = data.iloc[candle_no:candle_no+candles]
    data['candleno'] = range (1, len(data) + 1)
    X = data['candleno'].values.reshape(-1, 1)
    Y = data['Close'].values.reshape(-1, 1)
    linear_regressor = LinearRegression()
    linear_regressor.fit(X, Y)
    y_pred = linear_regressor.predict(X) 
    
    Current_Market_Fit = r2_score(Y, y_pred)*100
    #print(Current_Market_Fit)
    coeficient = (linear_regressor.coef_)

    if coeficient > 0:
        Current_Market = 1  ## Bullish / Buy ##
    else:
        Current_Market = 0  ## Bearish / Sell ##
    
    data = pd.read_csv(filename)
    data = data[[feature_1,
                 feature_2,
                 feature_3,
                 feature_15,
#                 feature_8,
                ]]

    indices, distances = find_k_similar_candles (candle_no,data)
    indices = indices[0:1][0]
    
    predicted_output_1 = []
    predicted_output_2 = []
    predicted_output_3 = []

    for indice in indices[1:5]:
             
        Predicted_Market_Fit =0
        Predicted_Trade=''
    
        data = pd.read_csv(filename) 
        data = data.iloc[indice:indice+candles]
        
        HH = data.iloc[0]['Close'] - data['High'].max()
        LL = data.iloc[0]['Close'] - data['Low'].min()
#        print("Close:", data.iloc[0]['Close'] , "High: ", data['High'].max(), 'Low: ', data['Low'].min())

        data['candleno'] = range (1, len(data) + 1)
        X = data['candleno'].values.reshape(-1, 1)
        
        Y = data['Close'].values.reshape(-1, 1)
        linear_regressor = LinearRegression()
        linear_regressor.fit(X, Y)
        y_pred = linear_regressor.predict(X)
        Predicted_Market_Fit= r2_score(Y, y_pred)*100
        coeficient = (linear_regressor.coef_)
        if coeficient > 0:
            Predicted_Trade = 1    ## Buy ##
        else:
            Predicted_Trade = -1   ## Sell ##
            
        predicted_output_1.append([Predicted_Market_Fit * Predicted_Trade, HH.round(4), LL.round(4)])
        
        Y = data['High'].values.reshape(-1, 1)
        linear_regressor = LinearRegression()
        linear_regressor.fit(X, Y)
        y_pred = linear_regressor.predict(X)
        Predicted_Market_Fit= r2_score(Y, y_pred)*100
        coeficient = (linear_regressor.coef_)
        if coeficient > 0:
            Predicted_Trade = 1    ## Buy ##
        else:
            Predicted_Trade = -1   ## Sell ##            
        
#        print (int(Predicted_Market_Fit) * Predicted_Trade)
        predicted_output_2.append([Predicted_Market_Fit * Predicted_Trade])
        
        Y = data['Low'].values.reshape(-1, 1)
        linear_regressor = LinearRegression()
        linear_regressor.fit(X, Y)
        y_pred = linear_regressor.predict(X)
        Predicted_Market_Fit= r2_score(Y, y_pred)*100
        coeficient = (linear_regressor.coef_)
        if coeficient > 0:
            Predicted_Trade = 1    ## Buy ##
        else:
            Predicted_Trade = -1   ## Sell ##
        
#        print (int(Predicted_Market_Fit) * Predicted_Trade)
        predicted_output_3.append([Predicted_Market_Fit * Predicted_Trade])
        
    
    result = {'Candle_No': candle_no,
              'Current_Market_Fit': Current_Market_Fit,
              'Current_Market': Current_Market,
              
              'Rec1_Close_Score': predicted_output_1[0][0],
              'Rec1_High_Score': predicted_output_2[0][0],
              'Rec1_Low_Score': predicted_output_3[0][0],
              'Rec1_HH': predicted_output_1[0][1],
              'Rec1_LL': predicted_output_1[0][2],
              
              'Rec2_Close_Score': predicted_output_1[0][0],
              'Rec2_High_Score': predicted_output_2[0][0],
              'Rec2_Low_Score': predicted_output_3[0][0],
              'Rec2_HH': predicted_output_1[1][1],
              'Rec2_LL': predicted_output_1[1][2],
              
              'Rec3_Close_Score': predicted_output_1[0][0],
              'Rec3_High_Score': predicted_output_2[0][0],
              'Rec3_Low_Score': predicted_output_3[0][0],
              'Rec3_HH': predicted_output_1[2][1],
              'Rec3_LL': predicted_output_1[2][2],
              
              'Rec4_Close_Score': predicted_output_1[0][0],
              'Rec4_High_Score': predicted_output_2[0][0],
              'Rec4_Low_Score': predicted_output_3[0][0],
              'Rec4_HH': predicted_output_1[3][1],
              'Rec4_LL': predicted_output_1[3][2],
             }
    
    result_output = result_output.append(result, ignore_index = True)

CPU times: user 2h 54min 4s, sys: 4h 22min 40s, total: 7h 16min 44s
Wall time: 23min 30s


In [18]:
now = datetime.now()
today = now.strftime("%d-%m-%Y_%I-%M_%p")

In [19]:
result_output.to_csv('01_Back_Test_Data.csv', header = True, index = False)
result_output.to_csv(today + "_" + "Back_Test_Data_" + pair + '.csv', header = True, index = False)
result_output = pd.read_csv(today + "_" + "Back_Test_Data_" + pair + '.csv')

In [20]:
result_output.head(10)

Unnamed: 0,Candle_No,Current_Market_Fit,Current_Market,Rec1_Close_Score,Rec1_High_Score,Rec1_Low_Score,Rec1_HH,Rec1_LL,Rec2_Close_Score,Rec2_High_Score,...,Rec3_Close_Score,Rec3_High_Score,Rec3_Low_Score,Rec3_HH,Rec3_LL,Rec4_Close_Score,Rec4_High_Score,Rec4_Low_Score,Rec4_HH,Rec4_LL
0,851.0,14.711971,0.0,-0.068499,40.680443,27.203035,-0.0022,0.0033,-0.068499,40.680443,...,-0.068499,40.680443,27.203035,-0.0026,0.0034,-0.068499,40.680443,27.203035,-0.0016,0.0035
1,6600.0,8.748818,0.0,-78.38952,-43.819763,-80.580116,-0.009,0.009,-78.38952,-43.819763,...,-78.38952,-43.819763,-80.580116,-0.004,0.0054,-78.38952,-43.819763,-80.580116,-0.0033,0.0028
2,9216.0,80.176852,0.0,-43.103848,-72.397175,-62.511593,-0.0027,0.0021,-43.103848,-72.397175,...,-43.103848,-72.397175,-62.511593,-0.0027,0.0039,-43.103848,-72.397175,-62.511593,-0.0041,0.0005
3,7297.0,77.682633,1.0,-11.726447,-24.390923,7.552211,-0.0035,0.0031,-11.726447,-24.390923,...,-11.726447,-24.390923,7.552211,-0.0079,0.0022,-11.726447,-24.390923,7.552211,-0.0044,0.0034
4,4589.0,0.152034,0.0,2.644918,17.888699,-3.529665,-0.0059,0.006,2.644918,17.888699,...,2.644918,17.888699,-3.529665,-0.0041,0.0006,2.644918,17.888699,-3.529665,-0.0036,0.0008
5,6396.0,44.072199,0.0,-1.17286,-42.856098,-4.518868,-0.0028,0.0047,-1.17286,-42.856098,...,-1.17286,-42.856098,-4.518868,-0.0018,0.0042,-1.17286,-42.856098,-4.518868,-0.003,0.0041
6,4638.0,0.195912,0.0,-92.294095,-91.159908,-85.497043,-0.0012,0.0109,-92.294095,-91.159908,...,-92.294095,-91.159908,-85.497043,-0.002,0.0022,-92.294095,-91.159908,-85.497043,-0.0016,0.0027
7,9057.0,87.123909,0.0,-24.798552,-16.127632,-28.755598,-0.002,0.0035,-24.798552,-16.127632,...,-24.798552,-16.127632,-28.755598,-0.0006,0.0042,-24.798552,-16.127632,-28.755598,-0.0039,0.0059
8,7370.0,9.18303,1.0,-7.579346,-42.879359,-0.925754,-0.0013,0.0134,-7.579346,-42.879359,...,-7.579346,-42.879359,-0.925754,-0.0022,0.0018,-7.579346,-42.879359,-0.925754,-0.0016,0.0033
9,9332.0,0.001268,0.0,46.287637,46.480485,26.259795,-0.0049,0.0046,46.287637,46.480485,...,46.287637,46.480485,26.259795,-0.0022,0.0021,46.287637,46.480485,26.259795,-0.0016,0.0061


In [21]:
result_output.shape

(4000, 23)

In [24]:
result_output.isnull().sum()

Candle_No             0
Current_Market_Fit    1
Current_Market        0
Rec1_Close_Score      0
Rec1_High_Score       0
Rec1_Low_Score        0
Rec1_HH               0
Rec1_LL               0
Rec2_Close_Score      0
Rec2_High_Score       0
Rec2_Low_Score        0
Rec2_HH               0
Rec2_LL               0
Rec3_Close_Score      0
Rec3_High_Score       0
Rec3_Low_Score        0
Rec3_HH               0
Rec3_LL               0
Rec4_Close_Score      0
Rec4_High_Score       0
Rec4_Low_Score        0
Rec4_HH               0
Rec4_LL               0
dtype: int64

#### Generating Log File

In [22]:
file = open(today + "_" + "data_generation_log_" + pair + '.txt', "w")
file.write ("Date: " + today + "\n" + \
            "Currency Pair: " + pair + "\n" + \
            "K_Number: " + str(k_number) + "\n" + \
            "KNN_Metric: " + metric + "\n" + \
            "KNN_Algorithm: " + algorithm + "\n" + \
            "Feature: " + feature_1 + "\n" + \
            "Feature: " + feature_2 + "\n" + \
            "Feature: " + feature_3 + "\n" + \
            "Feature: " + feature_7 + "\n" + \
            "Feature: " + feature_8 + "\n" + \
            "Volume Size: " + str(volume_size) + "\n" + \
            "Sample Count: " + str(sample_count) + "\n" + \
            "Candle Counts: " + str(candles) + "\n"
           )
file.close()