## Back Test Data Generation

In [1]:
import requests
import time
import calendar
import dateutil.parser as parser
from dateutil.relativedelta import relativedelta
from datetime import datetime, timezone
import pandas as pd
import numpy as np
import warnings
import yaml
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import pytz
warnings.filterwarnings('ignore')

### Settings

In [2]:
with open ('back_test_pipeline_settings.yaml') as ymlfile:
    cfg = yaml.safe_load(ymlfile)
    k_number = cfg['knn']['k_number']
    metric = cfg['knn']['metric']
    algorithm = cfg['knn']['algorithm']
    feature_1 = cfg['knn']['feature_1']
    feature_2 = cfg['knn']['feature_2']
    feature_3 = cfg['knn']['feature_3']
    feature_7 = cfg['knn']['feature_7']
    feature_8 = cfg['knn']['feature_8']
    feature_15 = cfg['knn']['feature_15']
    volume = cfg['feature']['volume']
    volume_size = cfg['sample']['volume_size']
    sample_count = cfg['sample']['count']
    candles = cfg['recommendation']['candle_count']
    pair = cfg['currency']['pair']
    instrument = cfg['currency']['instrument']

In [3]:
print('K Number:',k_number)
print('Metric:', metric)
print('Algorithm:', algorithm)
print('Candle Volume Size:', volume_size)
print('Random Sample Count:', sample_count)
print('Future Candle Count:', candles)
print('Pair:', pair)
print('Instrument:', instrument)

K Number: 5
Metric: euclidean
Algorithm: brute
Candle Volume Size: 1100
Random Sample Count: 5000
Future Candle Count: 7
Pair: USDCHF
Instrument: USD_CHF


In [4]:
def convert_date(utc_time): 
    parsed_date = parser.parse(utc_time)
    var_date=parsed_date.date()
    var_time=parsed_date.time()
    var_f_time=var_time.hour
    var_julian_date=parsed_date.timetuple().tm_yday
    var_weekday=parsed_date.weekday()
    var_weekday_name=calendar.day_name[parsed_date.weekday()]
    return var_date, var_time, var_f_time, var_julian_date, var_weekday, var_weekday_name

In [5]:
def find_k_similar_candles(candle_id, dataset, k = k_number):
    indices=[]
    distances = []
    output = []
    model_knn = NearestNeighbors(metric = metric, algorithm = algorithm) 
    model_knn.fit(dataset)
    
    #metric = 'euclidean' or 'cosine' or 'manhattan' or 'mahalanobis'
    
    distances, indices = model_knn.kneighbors(dataset.iloc[candle_id,:].values.reshape(1,-1),
                                              n_neighbors = k)

    for i in range(0,len(distances.flatten())):
        if i!=0:
            
            output.append ([dataset.index[indices.flatten()[i]],
                            distances.flatten()[i],
                            dataset.iloc[indices.flatten()[i]][feature_1],
                            dataset.iloc[indices.flatten()[i]][feature_2],
                            dataset.iloc[indices.flatten()[i]][feature_3],
                            dataset.iloc[indices.flatten()[i]][feature_7],
                            dataset.iloc[indices.flatten()[i]][feature_8],                            
                           ])
    
    output = pd.DataFrame(output)
    output.columns = ['Indice','Distance',
                      feature_1,
                      feature_2,
                      feature_3,
                      feature_7,
                      feature_8,
                     ]
   # display (output)
    
    return indices, distances

# <font color='red'>Test Configs</font>

In [6]:
filename = '{}_H4.csv'.format(instrument)
data = pd.read_csv(filename)

In [7]:
data.columns

Index(['Date', 'Time', 'f_time', 'julian_date', 'Weekday', 'Weekday_Name',
       'UTC_Time', 'Volume', 'Open', 'High', 'Low', 'Close', 'SMA_5', 'SMA_10',
       'SMA_20', 'F_SMA_5', 'F_SMA_10', 'F_SMA_20', 'O-H', 'O-L', 'O-C', 'H-L',
       'H-C', 'L-C', 'Direction', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5',
       'Trend'],
      dtype='object')

In [8]:
data.shape

(9640, 31)

In [9]:
data.head()

Unnamed: 0,Date,Time,f_time,julian_date,Weekday,Weekday_Name,UTC_Time,Volume,Open,High,...,H-L,H-C,L-C,Direction,col_1,col_2,col_3,col_4,col_5,Trend
0,2015-12-31,02:00:00,2,365,3,Thursday,2015-12-31T02:00:00.000000000Z,318,0.98828,0.98914,...,0.00105,0.0001,-0.00095,1,-0.00076,0.00086,0.00095,0.00042,0.00099,0.0
1,2015-12-31,06:00:00,6,365,3,Thursday,2015-12-31T06:00:00.000000000Z,1149,0.98904,0.9909,...,0.00233,0.00047,-0.00186,1,-0.00139,0.00186,0.00186,-0.00076,0.00042,0.0
2,2015-12-31,10:00:00,10,365,3,Thursday,2015-12-31T10:00:00.000000000Z,1922,0.99048,0.99626,...,0.0058,0.0,-0.0058,1,-0.00578,0.00578,0.0058,-0.00139,-0.00076,0.0
3,2015-12-31,14:00:00,14,365,3,Thursday,2015-12-31T14:00:00.000000000Z,3965,0.99632,1.00212,...,0.00596,0.0024,-0.00356,1,-0.0034,0.0058,0.00356,-0.00578,-0.00139,0.0
4,2015-12-31,18:00:00,18,365,3,Thursday,2015-12-31T18:00:00.000000000Z,462,0.99971,1.0004,...,0.00206,0.00066,-0.0014,1,-3e-05,0.00069,0.0014,-0.0034,-0.00578,0.0


In [10]:
data.describe()

Unnamed: 0,f_time,julian_date,Weekday,Volume,Open,High,Low,Close,SMA_5,SMA_10,...,H-L,H-C,L-C,Direction,col_1,col_2,col_3,col_4,col_5,Trend
count,9640.0,9640.0,9640.0,9640.0,9640.0,9640.0,9640.0,9640.0,9640.0,9640.0,...,9640.0,9640.0,9640.0,9640.0,9640.0,9640.0,9640.0,9640.0,9640.0,9640.0
mean,11.374481,178.716805,2.070539,2650.555083,0.964456,0.965659,0.963224,0.964452,0.964464,0.964481,...,0.002435,0.001206,-0.001229,0.502075,4e-06,0.001203,0.001229,4e-06,4e-06,0.046058
std,6.852061,106.324657,1.545722,3326.227199,0.035423,0.03546,0.035375,0.035425,0.035391,0.035355,...,0.001584,0.001213,0.001217,0.500022,0.0017,0.001241,0.001217,0.0017,0.0017,56.035601
min,1.0,1.0,0.0,1.0,0.87682,0.87856,0.87578,0.87678,0.8781,0.8787,...,0.0,0.0,-0.02111,0.0,-0.02071,0.0,0.0,-0.02071,-0.02071,-99.0
25%,5.0,84.0,1.0,844.0,0.929715,0.930907,0.928485,0.929708,0.9295,0.9294,...,0.00136,0.0004,-0.00161,0.0,-0.0008,0.0004,0.00044,-0.0008,-0.0008,-50.0
50%,13.0,178.0,2.0,1810.5,0.97372,0.974915,0.972385,0.97367,0.9736,0.9737,...,0.00208,0.00086,-0.0009,1.0,-1e-05,0.00086,0.0009,-1e-05,-1e-05,0.0
75%,17.0,271.0,3.0,3330.0,0.99287,0.99398,0.99174,0.992865,0.9929,0.992825,...,0.00304,0.00161,-0.00044,1.0,0.000803,0.0016,0.00161,0.000803,0.00081,50.0
max,22.0,366.0,6.0,97567.0,1.03111,1.03436,1.0295,1.03112,1.0294,1.0287,...,0.02207,0.01956,0.0,1.0,0.01766,0.02167,0.02111,0.01766,0.01766,99.0


In [11]:
volume_med = data['Volume'].median()
volume_med

1810.5

## Selecting n random candles where their volume is more than 5500

In [12]:
print('Candle Volume Size:', volume_size)
print('Random Sample Count:', sample_count)

Candle Volume Size: 1100
Random Sample Count: 5000


In [13]:
random_samples = data[data[volume] > volume_size].sample(n = sample_count)

In [14]:
#Random_Candles = np.random.randint(low=1, high=len(data)-40, size=1000)
Random_Candles = list(random_samples.index.values)

In [15]:
#show the fisrt 10 random generated candle numbers
Random_Candles[0:10]

[6829, 8856, 1622, 9510, 5451, 2008, 7657, 3306, 3866, 2513]

# <font color='red'>CANDLE LOOP</font>

In [16]:
CST = pytz.timezone('America/Chicago')
datetime_cst = datetime.now(CST)
print("Date & Time in CST : ", 
      datetime_cst.strftime('%Y:%m:%d %H:%M:%S %Z %z'))

Date & Time in CST :  2022:03:10 13:00:59 CST -0600


In [17]:
%%time

result_output = pd.DataFrame({'Candle_No':[],
                              'Current_Market_Fit':[],
                              'Current_Market':[],
                              
                              'Rec1_Close_Score':[],
                              'Rec1_High_Score':[],
                              'Rec1_Low_Score':[],
                              'Rec1_HH':[],
                              'Rec1_LL':[],
                              
                              'Rec2_Close_Score':[],
                              'Rec2_High_Score':[],
                              'Rec2_Low_Score':[],
                              'Rec2_HH':[],
                              'Rec2_LL':[],
                              
                              'Rec3_Close_Score':[],
                              'Rec3_High_Score':[],
                              'Rec3_Low_Score':[],
                              'Rec3_HH':[],
                              'Rec3_LL':[],
                              
                              'Rec4_Close_Score':[],
                              'Rec4_High_Score':[],
                              'Rec4_Low_Score':[],
                              'Rec4_HH':[],
                              'Rec4_LL':[],
                             })

for candle_no in Random_Candles:
    data = pd.read_csv(filename)
    data = data.iloc[candle_no:candle_no+candles]
    data['candleno'] = range (1, len(data) + 1)
    X = data['candleno'].values.reshape(-1, 1)
    Y = data['Close'].values.reshape(-1, 1)
    linear_regressor = LinearRegression()
    linear_regressor.fit(X, Y)
    y_pred = linear_regressor.predict(X) 
    
    Current_Market_Fit = r2_score(Y, y_pred)*100
    #print(Current_Market_Fit)
    coeficient = (linear_regressor.coef_)

    if coeficient > 0:
        Current_Market = 1  ## Bullish / Buy ##
    else:
        Current_Market = 0  ## Bearish / Sell ##
    
    data = pd.read_csv(filename)
    data = data[[feature_1,
                 feature_2,
                 feature_3,
                 feature_7,
                 feature_8,
                ]]

    indices, distances = find_k_similar_candles (candle_no,data)
    indices = indices[0:1][0]
    
    L_L = []
    H_H = []
    predicted_output_1 = []
    predicted_output_2 = []
    predicted_output_3 = []

    for indice in indices[1:5]:
        #print (indice)
        
        data = pd.read_csv(filename) 
        data = data.iloc[indice:indice+candles]
        
        HH = data.iloc[0]['Close'] - data['High'].max()
        LL = data.iloc[0]['Close'] - data['Low'].min()
        
        L_L.append([LL])
        H_H.append([HH])

        #print (HH.round(4), LL.round(4))

        data['candleno'] = range (1, len(data) + 1)
        X = data['candleno'].values.reshape(-1, 1)
        
        Y_Close = data['Close'].values.reshape(-1, 1)
        linear_regressor = LinearRegression()
        linear_regressor.fit(X, Y_Close)
        y_pred = linear_regressor.predict(X)
        Predicted_Market_Fit_Close = r2_score(Y_Close, y_pred)*100
        coeficient_close = (linear_regressor.coef_)
        if coeficient_close > 0:
            Predicted_Trade_Close = 1    ## Buy ##
        else:
            Predicted_Trade_Close = -1   ## Sell ##
        
        predicted_output_1.append([Predicted_Market_Fit_Close * Predicted_Trade_Close, HH.round(4), LL.round(4)])
        #print ('****Close****', Predicted_Market_Fit_Close * Predicted_Trade_Close)
        
        
        Y_High = data['High'].values.reshape(-1, 1)
        linear_regressor.fit(X, Y_High)
        y_pred = linear_regressor.predict(X)
        Predicted_Market_Fit_High= r2_score(Y_High, y_pred)*100
        coeficient_high = (linear_regressor.coef_)
        
        if coeficient_high > 0:
            Predicted_Trade_High = 1    ## Buy ##
        else:
            Predicted_Trade_High = -1   ## Sell ##            
        
        predicted_output_2.append([Predicted_Market_Fit_High * Predicted_Trade_High])
        #print ('****High****', Predicted_Market_Fit_High * Predicted_Trade_High)
        
        
        Y_Low = data['Low'].values.reshape(-1, 1)
        linear_regressor.fit(X, Y_Low)
        y_pred = linear_regressor.predict(X)
        Predicted_Market_Fit_Low= r2_score(Y_Low, y_pred)*100
        coeficient_low = (linear_regressor.coef_)
        if coeficient_low > 0:
            Predicted_Trade_Low = 1    ## Buy ##
        else:
            Predicted_Trade_Low = -1   ## Sell ##
        
        predicted_output_3.append([Predicted_Market_Fit_Low * Predicted_Trade_Low])
        #print ('****Low****', Predicted_Market_Fit_Low * Predicted_Trade_Low)
        
    
    result = {'Candle_No': candle_no,
              'Current_Market_Fit': Current_Market_Fit,
              'Current_Market': Current_Market,
              
              'Rec1_Close_Score': predicted_output_1[0][0],
              'Rec1_High_Score': predicted_output_2[0][0],
              'Rec1_Low_Score': predicted_output_3[0][0],
              'Rec1_HH': H_H[0][0],
              'Rec1_LL': L_L[0][0],
              
              'Rec2_Close_Score': predicted_output_1[1][0],
              'Rec2_High_Score': predicted_output_2[1][0],
              'Rec2_Low_Score': predicted_output_3[1][0],
              'Rec2_HH': H_H[1][0],
              'Rec2_LL': L_L[1][0],
              
              'Rec3_Close_Score': predicted_output_1[2][0],
              'Rec3_High_Score': predicted_output_2[2][0],
              'Rec3_Low_Score': predicted_output_3[2][0],
              'Rec3_HH': H_H[2][0],
              'Rec3_LL': L_L[2][0],
              
              'Rec4_Close_Score': predicted_output_1[3][0],
              'Rec4_High_Score': predicted_output_2[3][0],
              'Rec4_Low_Score': predicted_output_3[3][0],
              'Rec4_HH': H_H[3][0],
              'Rec4_LL': L_L[3][0],
             }
    
    result_output = result_output.append(result, ignore_index = True)

65
-0.0034 0.0033
****Close**** 73.47413936470136
****High**** 21.0092898220261
****Low**** 71.67210080974031
179
-0.0048 0.0028
****Close**** 89.5639837451968
****High**** 75.54545852971262
****Low**** 93.21230995032404
3678
-0.0037 0.0025
****Close**** 10.899560645905893
****High**** 72.22310212574301
****Low**** 15.593642329113688
5625
-0.0048 0.0062
****Close**** 45.96935372983585
****High**** 69.96794629504276
****Low**** 45.91030841222842
8771
-0.0021 0.0008
****Close**** -9.757946930117855
****High**** -48.13723214502993
****Low**** 1.1418325941210283
6771
-0.0014 0.0051
****Close**** -81.99561869671024
****High**** -62.3120669290386
****Low**** -76.1083290015967
4594
-0.0054 0.0009
****Close**** 59.68888362535729
****High**** 35.22397491457044
****Low**** 91.67655534836328
946
-0.0015 0.0068
****Close**** -90.05380599809361
****High**** -97.3472617317302
****Low**** -94.2852159436978
1871
-0.0022 0.0042
****Close**** -66.38841875617672
****High**** -91.15778136779636
****Low***

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
now = datetime.now()
today = now.strftime("%d-%m-%Y_%I-%M_%p")

In [None]:
result_output.to_csv('01_Back_Test_Data.csv', header = True, index = False)
result_output.to_csv(today + "_" + "Back_Test_Data_" + pair + '.csv', header = True, index = False)
result_output = pd.read_csv(today + "_" + "Back_Test_Data_" + pair + '.csv')

In [None]:
result_output.head(10)

In [None]:
result_output.shape

In [None]:
result_output.isnull().sum()

#### Generating Log File

In [None]:
file = open(today + "_" + "data_generation_log_" + pair + '.txt', "w")
file.write ("Date: " + today + "\n" + \
            "Currency Pair: " + pair + "\n" + \
            "K_Number: " + str(k_number) + "\n" + \
            "KNN_Metric: " + metric + "\n" + \
            "KNN_Algorithm: " + algorithm + "\n" + \
            "Feature: " + feature_1 + "\n" + \
            "Feature: " + feature_2 + "\n" + \
            "Feature: " + feature_3 + "\n" + \
            "Feature: " + feature_7 + "\n" + \
            "Feature: " + feature_8 + "\n" + \
            "Volume Size: " + str(volume_size) + "\n" + \
            "Sample Count: " + str(sample_count) + "\n" + \
            "Candle Counts: " + str(candles) + "\n"
           )
file.close()