<a href="https://colab.research.google.com/github/DatainSociety/test/blob/master/DTW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sparkmagic
!pip install pyspark
# !pip install tensorflow
from functools import reduce
from pyspark import StorageLevel
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime, timedelta
from pyspark.sql import Window, Row, DataFrame

from pyspark.sql.types import StringType, FloatType
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
#from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer, MinMaxScaler
from itertools import chain
# generate random floating point values
from random import seed, random, randint, uniform
import pandas as pd

# tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
# tf.config.experimental_connect_to_cluster(tpu)
# tf.tpu.experimental.initialize_tpu_system(tpu)

# # instantiate a distribution strategy
# tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# Create (or reuse) the Spark session; the master is set to "local[*]".
spark = SparkSession.builder.master("local[*]").getOrCreate()
import os
# Input data files are available in the read-only "../input/" directory.
# Walk /kaggle/input recursively and print the full path of every file found,
# so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Helper functions (the Spark session itself is created above)

def UnionAll(dfs):
    """Combine the Spark DataFrames in ``dfs`` into a single DataFrame.

    The frames are folded left to right with ``DataFrame.unionAll``.
    ``dfs`` must contain at least one frame, otherwise ``reduce`` raises
    ``TypeError``.
    """
    return reduce(DataFrame.unionAll, dfs)

def create_subsequence_data(feature_data, feature_selection_list, seq_len,
                            step_size=3, min_len=5):
    """Cut every sequence of ``feature_data`` into sliding-window subsequences.

    Rows are grouped by ``sequence_id`` and ordered by ``trade_date``.

    * A sequence with more than ``seq_len`` rows yields one window of exactly
      ``seq_len`` rows for every start offset ``0, step_size, 2*step_size, ...``
      that still fits inside the sequence.
    * A sequence with at most ``seq_len`` rows is kept whole, but only when it
      has more than ``min_len`` rows; shorter sequences are discarded.

    Every emitted subsequence gets a fresh, consecutive ``sequence_id``
    (starting at 0) and a 0-based row index.

    Args:
        feature_data: pandas DataFrame with ``sequence_id``, ``trade_date`` and
            the feature columns. It is not modified.
        feature_selection_list: names of the feature columns to keep.
        seq_len: number of rows in each sliding window.
        step_size: rows to advance between consecutive windows (default 3).
        min_len: a short sequence is kept only if it has more rows than this
            (default 5).

    Returns:
        A DataFrame with columns ``trade_date``, ``sequence_id`` and the
        selected features. When subsequences exist, the row index is a
        MultiIndex of (new sequence id, row position), as produced by
        concatenating a dict of frames. With no subsequences an empty frame
        with the same columns is returned.

    Raises:
        ValueError: if ``step_size`` is smaller than 1.
    """
    if step_size < 1:
        raise ValueError("step_size must be at least 1")

    columns = ['trade_date', 'sequence_id'] + list(feature_selection_list)
    # Sort a copy so the caller's frame keeps its original row order.
    ordered = feature_data.sort_values(by=['sequence_id', 'trade_date'])

    subsequence_dict = {}
    index_reference = 0
    for _, chunk in ordered.groupby('sequence_id'):
        if len(chunk) > seq_len:
            # Every window spans [start, start + seq_len); the last start is
            # the largest multiple of step_size that still leaves a full window.
            starts = range(0, len(chunk) - seq_len + 1, step_size)
            pieces = [chunk.iloc[start:start + seq_len] for start in starts]
        elif len(chunk) > min_len:
            pieces = [chunk]
        else:
            pieces = []

        for piece in pieces:
            # Work on a copy: assigning into an iloc slice would otherwise
            # write into (or warn about) the parent frame.
            piece = piece.copy()
            piece['sequence_id'] = index_reference
            piece = piece.sort_values(by=['trade_date']).reset_index(drop=True)
            subsequence_dict[index_reference] = piece[columns]
            index_reference += 1

    # Concatenate once, after all groups are processed, so empty input still
    # produces a defined (empty) result.
    if subsequence_dict:
        return pd.concat(subsequence_dict, axis=0)
    return pd.DataFrame(columns=columns)

def interpolate_subsequence(subsequence_dataset,feature_selection_list,seq_len):
    """Stretch every subsequence shorter than ``seq_len`` to ``seq_len`` rows.

    Rows are grouped by ``sequence_id`` in order of first appearance. A group
    that already has at least ``seq_len`` rows is passed through unchanged,
    with all of its columns. A shorter group is replaced by a new frame of
    exactly ``seq_len`` rows that holds ``sequence_id``, the group's first
    ``trade_date`` and, for every feature, values obtained by piecewise-linear
    interpolation over the row positions of the group.

    Args:
        subsequence_dataset: pandas DataFrame with ``sequence_id``,
            ``trade_date`` and the feature columns. It is not modified.
        feature_selection_list: names of the numeric columns to interpolate.
        seq_len: target number of rows per subsequence.

    Returns:
        The concatenation of all processed groups. For input without any rows
        an empty frame with the columns ``sequence_id``, ``trade_date``,
        ``trader`` plus the features is returned (the same column list the
        function has always used for this case).
    """
    feature_cols = list(feature_selection_list)
    subsequence_dict = {}
    for subsequence_id in subsequence_dataset['sequence_id'].drop_duplicates():
        chunk = subsequence_dataset[subsequence_dataset['sequence_id'] == subsequence_id]
        if chunk.empty:
            # A NaN id never equals itself, so nothing can be selected for it.
            continue
        if len(chunk) >= seq_len:
            subsequence_dict[subsequence_id] = chunk
            continue

        chunk_interpolated = pd.DataFrame(index=range(seq_len))
        chunk_interpolated['sequence_id'] = subsequence_id
        # Positional access: the chunk keeps the row labels of the source
        # frame, so a label lookup of 0 is not guaranteed to succeed.
        chunk_interpolated['trade_date'] = chunk['trade_date'].iloc[0]

        # Original rows sit at positions 0..n-1; the new rows sample that
        # axis at k*(n-1)/seq_len for k = 0..seq_len-1.
        # NOTE(review): this grid stops short of the last observation; confirm
        # whether the end point should be included.
        source_positions = np.arange(len(chunk))
        sample_positions = np.arange(seq_len) * (len(chunk) - 1) / seq_len

        for feature in feature_cols:
            observed = chunk[feature].to_numpy(dtype=float)
            known = ~np.isnan(observed)
            if known.any():
                # Linear interpolation between the known points only; samples
                # outside their range stay NaN instead of being extrapolated.
                resampled = np.interp(sample_positions,
                                      source_positions[known],
                                      observed[known],
                                      left=np.nan, right=np.nan)
            else:
                resampled = np.full(seq_len, np.nan)
            chunk_interpolated[feature] = resampled

        subsequence_dict[subsequence_id] = chunk_interpolated

    if subsequence_dict:
        return pd.concat(subsequence_dict.values(), axis=0)
    return pd.DataFrame(columns=['sequence_id', 'trade_date', 'trader'] + feature_cols)
 

def spark_create_subseq_dataset(feature_data, feature_selection_list, seq_len):
    """Split a Spark DataFrame into sliding-window subsequences.

    Rows are grouped by ``sequence_id`` and numbered 0..n-1 inside each group
    in ``trade_date`` order (ties keep the frame's current row order).

    * A group with more than ``seq_len`` rows yields one window of exactly
      ``seq_len`` rows for every start offset ``0, 3, 6, ...`` that still
      fits. These frames carry the columns ``trade_date``,
      ``subsequence_id`` and the selected features.
    * A group with at most ``seq_len`` but more than 5 rows is kept whole.
      These frames carry ``trade_date``, ``sub_subsequence_id`` and the
      selected features.
    * Smaller groups are dropped.

    The sequence ids and sizes are taken from ``feature_data`` itself in a
    single aggregation; no temporary views are registered.

    Args:
        feature_data: Spark DataFrame with ``sequence_id``, ``trade_date`` and
            the feature columns.
        feature_selection_list: names of the feature columns to keep.
        seq_len: number of rows in each sliding window.

    Returns:
        dict mapping a running subsequence index (starting at 0) to the lazy
        Spark DataFrame holding that subsequence. Empty when nothing
        qualifies.
    """
    # Number of rows the window advances between consecutive subsequences.
    step_size = 3
    feature_cols = list(feature_selection_list)
    subsequence_dict = {}
    index_reference = 0

    print("starting sequencing")
    # monotonically_increasing_id() is neither consecutive nor restarted per
    # sequence, so it only serves as a tie-breaker here; the usable 0-based
    # position comes from row_number() over the per-sequence window.
    arrival_order = Window.partitionBy('sequence_id').orderBy('trade_date', '_arrival_id')
    positioned = (
        feature_data
        .withColumn('_arrival_id', monotonically_increasing_id())
        .withColumn('_row_pos', row_number().over(arrival_order) - 1)
        # Every returned frame is derived from this one; cache it so the
        # window is not recomputed (and renumbered) for each subsequence.
        .cache()
    )

    # One aggregation job replaces a count() per sequence and per window.
    sequence_sizes = [
        (row['sequence_id'], row['count'])
        for row in positioned.groupBy('sequence_id').count().collect()
    ]

    for sid, n_rows in sequence_sizes:
        chunk = positioned.filter(positioned['sequence_id'].eqNullSafe(sid))
        if n_rows > seq_len:
            for start in range(0, n_rows - seq_len + 1, step_size):
                print(f"creating sequence number: {start // step_size}")
                # Half-open range [start, start + seq_len) -> exactly seq_len rows.
                in_window = (chunk['_row_pos'] >= start) & (chunk['_row_pos'] < start + seq_len)
                subsequence_dict[index_reference] = (
                    chunk.filter(in_window)
                    .withColumn('subsequence_id', lit(index_reference))
                    .orderBy('trade_date', '_row_pos')
                    .select(['trade_date', 'subsequence_id'] + feature_cols)
                )
                index_reference += 1
        elif n_rows > 5:
            print("sequence is smaller so create small seq")
            # DataFrames are immutable: the results of withColumn/orderBy must
            # be kept, otherwise the select below cannot find the id column.
            subsequence_dict[index_reference] = (
                chunk.withColumn('sub_subsequence_id', lit(index_reference))
                .orderBy('trade_date', '_row_pos')
                .select(['trade_date', 'sub_subsequence_id'] + feature_cols)
            )
            index_reference += 1

    return subsequence_dict


# df = spark.read.csv('/kaggle/input/forex-eurusd-dataset/dataset01_eurusd4h.csv', inferSchema=True,header=True)

#Write this data to kaggle

Collecting sparkmagic
  Downloading https://files.pythonhosted.org/packages/09/d4/468941ef9a48c35c7a870d868996e84f89ad0ef9178f46abcba111a25fad/sparkmagic-0.17.0.tar.gz
Collecting hdijupyterutils>=0.6
  Downloading https://files.pythonhosted.org/packages/f2/eb/4afdfdfcc1c53b2fec43126b13cf4e41a8ab60d80995626a75b8a94787a2/hdijupyterutils-0.17.0.tar.gz
Collecting autovizwidget>=0.6
  Downloading https://files.pythonhosted.org/packages/1b/2b/ca743774da5ac18061dac161c252e2021a02dd0aec26ea388e96be0d66d7/autovizwidget-0.17.0.tar.gz
Collecting nose
[?25l  Downloading https://files.pythonhosted.org/packages/15/d8/dd071918c040f50fa1cf80da16423af51ff8ce4a0f2399b7bf8de45ac3d9/nose-1.3.7-py3-none-any.whl (154kB)
[K     |████████████████████████████████| 163kB 4.9MB/s 
[?25hCollecting mock
  Downloading https://files.pythonhosted.org/packages/cd/74/d72daf8dff5b6566db857cfd088907bb0355f5dd2914c4b3ef065c790735/mock-4.0.2-py3-none-any.whl
Collecting requests_kerberos>=0.8.0
  Downloading https://file

In [None]:
import string
import random
#Create random_string for tradeId
def str_gen(length):
    """Return a random trade id of the form ``<letters>-<digits>``.

    ``length`` ASCII letters are drawn, and alongside each letter one decimal
    digit. The id is all the letters, a dash, and the first two digits (fewer
    when ``length`` is below 2).
    """
    alphabet = string.ascii_letters
    # Draw letter and digit together so the random stream is consumed in
    # letter, digit, letter, digit, ... order.
    draws = [(random.choice(alphabet), str(randint(0, 9))) for _ in range(length)]
    letters_part = ''.join(letter for letter, _ in draws)
    digits_part = ''.join(digit for _, digit in draws)
    return letters_part + '-' + digits_part[:2]

# --- Build a synthetic trade tape -------------------------------------------
# Timestamps run from 17912 hours before 2020-10-01 up to 2020-10-01, spaced
# by one fixed step of 30-150 seconds.
# NOTE(review): the step is drawn before seed(1) is called below, so it is
# not covered by that seed.
s_dt = datetime(2020, 10, 1)-timedelta(hours=17912)
end_dt = datetime(2020,10,1)
step = timedelta(seconds=randint(30,150))
dts = []
while s_dt<=end_dt:
    dts.append(s_dt.strftime('%Y-%m-%d %H:%M:%S'))
    s_dt+=step
# volume = [randint(1000,10000000) for num in range(df.count())]
# Seed the random number generator before the row values are drawn.
seed(1)
# Generate the example dataset: one dict of random field values per timestamp,
# keyed by row number. The range starts at 1, so dts[0] is not used and the
# row numbers start at 1.
base_dict = {}
for num in range(1,len(dts)):
    base_dict[num] = {'participation_rate':random.random(),
                      'Date':dts[num],
                      'Volume':randint(1000,10000000)*(randint(0,9)+randint(0,9)),
                     'Strike':uniform(0.9875,1.12345),
                     'tradeId': str_gen(7)
                     }
    # Progress message every 10000 rows.
    if num % 10000==0:
        print(f"{num} rows created")
    else:
        continue

# Turn every dict into a Row, adding the dict key as 'row_id'.
data_as_rows = [Row(**{'row_id': k, **v}) for k,v in base_dict.items()]

from pyspark.sql.types import StructType, StructField, StringType
# Define the schema explicitly to prevent a ValueError.
# NOTE(review): participation_rate is declared STRING although the generator
# above produces a float; confirm this is intended.
# schema = StructType([StructField("foo", StringType(), True), StructFie])
schema = 'row_id INTEGER, participation_rate STRING, Date STRING, Volume INTEGER, Strike DOUBLE, tradeId STRING'
b = spark.createDataFrame(data_as_rows,schema=schema)
# b.to_csv('testdata.csv')
# b = spark.read.csv('./testdata.csv/*.csv',header=True)
#Create monotonically increasing ids
# df = df.withColumn('row_id',row_number().over(Window.orderBy(monotonically_increasing_id())))
# Join dfs
# Add a constant currency pair and the calendar date parsed from 'Date'.
b = b.withColumn('CCYPAIR',lit('EURUSD'))
b = b.withColumn('trade_date',to_date(col('Date')))
# Create a window per currency pair ordered by trade date, then dense-rank it:
# all rows that share a trade_date receive the same sequence_id.
# We are assuming we work with one trader's worth of data.
subspec = Window.partitionBy('CCYPAIR').orderBy('trade_date')
b = b.withColumn('sequence_id',dense_rank().over(subspec))
# Scale the dataset: one VectorAssembler + MinMaxScaler pair per column.
# 'features' is every column of b, including the string/date columns and the
# row_id / sequence_id identifiers.
features = b.columns
assemblers = [VectorAssembler(inputCols=[col], outputCol=col+"_vec") for col in features]
scalers = [MinMaxScaler(inputCol=col+"_vec",outputCol=col+"_scaled") for col in features]
# Cast every column to float and replace the resulting nulls with 0
# (presumably the non-numeric columns become null in the cast; verify).
float_df = b.select(*(col(c).cast("float").alias(c) for c in b.columns)).fillna(0)

pipeline = Pipeline(stages=assemblers+scalers)
model = pipeline.fit(float_df)
scaled_df =model.transform(float_df)
# Keep only the '<name>_scaled' columns and unwrap the first vector element
# of each of them with a UDF.
scaledData = scaled_df.select([col for col in scaled_df.columns if '_scaled' in col])
first_element=udf((lambda x:x[0]),FloatType())
scaledData=scaledData.select([first_element(col) for col in scaledData.columns])
# Rename columns back to the original feature names.
# NOTE(review): the reduce is seeded with float_df, and the lambda parameter
# merely shadows the name scaledData. The renames are therefore applied to the
# unscaled float_df, and the scaled frame built above is not used again in
# this cell. Confirm whether r_scaled_df is meant to hold the scaled values.
r_scaled_df = reduce(lambda scaledData,idx: scaledData.withColumnRenamed(scaledData.columns[idx],features[idx]),range(len(scaledData.columns)),float_df)

#Merge datasets to bring back non float data
drop_cols = ['Date','trade_date','CCYPAIR']
# Drop the string/date columns listed above from r_scaled_df.
r_scaled_df = reduce(DataFrame.drop,drop_cols,r_scaled_df)


10000 rows created
20000 rows created
30000 rows created
40000 rows created
50000 rows created
60000 rows created
70000 rows created
80000 rows created
90000 rows created
100000 rows created
110000 rows created
120000 rows created
130000 rows created
140000 rows created
150000 rows created
160000 rows created
170000 rows created
180000 rows created
190000 rows created
200000 rows created
210000 rows created
220000 rows created
230000 rows created
240000 rows created
250000 rows created
260000 rows created
270000 rows created
280000 rows created
290000 rows created
300000 rows created
310000 rows created
320000 rows created
330000 rows created
340000 rows created
350000 rows created
360000 rows created
370000 rows created
380000 rows created
390000 rows created
400000 rows created
410000 rows created
420000 rows created
430000 rows created
440000 rows created


In [None]:
def interpolate_subsequence(subsequence_dataset,feature_selection_list,seq_len):
    """Stretch every subsequence shorter than ``seq_len`` to ``seq_len`` rows.

    Rows are grouped by ``sequence_id`` in order of first appearance. A group
    that already has at least ``seq_len`` rows is passed through unchanged,
    with all of its columns. A shorter group is replaced by a new frame of
    exactly ``seq_len`` rows that holds ``sequence_id``, the group's first
    ``trade_date`` and, for every feature, values obtained by piecewise-linear
    interpolation over the row positions of the group.

    Args:
        subsequence_dataset: pandas DataFrame with ``sequence_id``,
            ``trade_date`` and the feature columns. It is not modified.
        feature_selection_list: names of the numeric columns to interpolate.
        seq_len: target number of rows per subsequence.

    Returns:
        The concatenation of all processed groups. For input without any rows
        an empty frame with the columns ``sequence_id``, ``trade_date``,
        ``trader`` plus the features is returned (the same column list the
        function has always used for this case).
    """
    feature_cols = list(feature_selection_list)
    subsequence_dict = {}
    for subsequence_id in subsequence_dataset['sequence_id'].drop_duplicates():
        chunk = subsequence_dataset[subsequence_dataset['sequence_id'] == subsequence_id]
        if chunk.empty:
            # A NaN id never equals itself, so nothing can be selected for it.
            continue
        if len(chunk) >= seq_len:
            subsequence_dict[subsequence_id] = chunk
            continue

        chunk_interpolated = pd.DataFrame(index=range(seq_len))
        chunk_interpolated['sequence_id'] = subsequence_id
        # Broadcast one scalar date: assigning the Series itself would align
        # on row labels and leave most of the new rows without a date.
        chunk_interpolated['trade_date'] = chunk['trade_date'].iloc[0]

        # Original rows sit at positions 0..n-1; the new rows sample that
        # axis at k*(n-1)/seq_len for k = 0..seq_len-1.
        # NOTE(review): this grid stops short of the last observation; confirm
        # whether the end point should be included.
        source_positions = np.arange(len(chunk))
        sample_positions = np.arange(seq_len) * (len(chunk) - 1) / seq_len

        for feature in feature_cols:
            observed = chunk[feature].to_numpy(dtype=float)
            known = ~np.isnan(observed)
            if known.any():
                # Linear interpolation between the known points only; samples
                # outside their range stay NaN instead of being extrapolated.
                resampled = np.interp(sample_positions,
                                      source_positions[known],
                                      observed[known],
                                      left=np.nan, right=np.nan)
            else:
                resampled = np.full(seq_len, np.nan)
            chunk_interpolated[feature] = resampled

        subsequence_dict[subsequence_id] = chunk_interpolated

    if subsequence_dict:
        return pd.concat(subsequence_dict.values(), axis=0)
    return pd.DataFrame(columns=['sequence_id', 'trade_date', 'trader'] + feature_cols)

# Re-attach the descriptive columns of b to r_scaled_df via an inner join on
# row_id, then drop the tradeId column that came from r_scaled_df so only
# b's tradeId remains.
final_df = r_scaled_df.join(b.select('row_id','CCYPAIR','trade_date','Date','tradeId'),'row_id',how='inner').drop(r_scaled_df.tradeId)
#Run subsequencing on scaled_Df
#Generate subsequence data
# dense_rank over row_id inside each sequence_id gives every row its rank
# within its sequence, stored as subseq_id.
subseq_window = Window.partitionBy('sequence_id').orderBy('row_id')
final_df = final_df.withColumn('subseq_id',dense_rank().over(subseq_window))
# Collect the whole Spark frame to the driver as a pandas DataFrame.
p_df = final_df.toPandas()
# Interpolate sequences shorter than 100 rows up to 100 rows.
# NOTE(review): 'features' comes from an earlier cell, where it is set to
# b.columns; confirm that every name in it is a numeric column of p_df.
interp_df = interpolate_subsequence(p_df,features,100)


In [None]:
# Installing dependencies for rpy2: run a conda install of r-base from the
# conda-forge channel through the shell.
# NOTE(review): the return code is not checked. The output recorded below
# this cell shows returncode=127, so the install did not succeed in that run.
import subprocess
subprocess.run('conda install -c conda-forge r-base', shell=True)

CompletedProcess(args='conda install -c conda-forge r-base', returncode=127)

In [None]:
# using numba to speed up function
# from numba import jit
!pip3 install rpy2
!pip install tslearn
# os.environ
import time
def trade_dict_gen(df):
    """Map every sequence id in ``df`` to that sequence's rows as a NumPy array.

    The identifier and descriptive columns (``row_id``, ``CCYPAIR``,
    ``trade_date``, ``Date``, ``tradeId``, ``subseq_id``) are removed first.
    The remaining columns, ``sequence_id`` included, form the array columns.
    Keys appear in order of first occurrence. The elapsed time is printed.
    """
    timer_start = time.perf_counter()
    # Distinct sequence ids, in order of first appearance.
    sequence_ids = list(df['sequence_id'].drop_duplicates())
    # Keep only the continuous variables (plus sequence_id for filtering).
    excluded_columns = ['row_id', 'CCYPAIR', 'trade_date', 'Date', 'tradeId', 'subseq_id']
    continuous = df.drop(columns=excluded_columns)
    trade_dict = {
        seq_id: continuous[continuous.sequence_id == seq_id].values
        for seq_id in sequence_ids
    }
    print(f"Time elapsed is {time.perf_counter() - timer_start}")
    return trade_dict

# Convert the interpolated frame into {sequence_id: array of rows}.
trade_dict = trade_dict_gen(interp_df)

# !python -m rpy2.situation
import numpy as np
import rpy2.robjects as robjects
import rpy2.robjects.numpy2ri
# Turn on rpy2's NumPy conversion layer.
rpy2.robjects.numpy2ri.activate()
from rpy2.robjects.packages import importr
# Install the R package "dtw" through R's utils package.
utils = robjects.packages.importr("utils")
package_name = "dtw"
utils.install_packages(package_name)

# Handle to the embedded R session.
R = robjects.r

# Import the R package "dtw".
# NOTE(review): the name DTW is rebound by the function definition that
# follows, so this package handle is not reachable afterwards.
DTW = importr('dtw')

def DTW(s1,s2,window):
    """Return the distance reported by R's ``dtw`` for two 2-D sequences.

    The alignment is computed with ``rabinerJuangStepPattern(4, "c")`` and
    with both ``open_begin`` and ``open_end`` enabled.

    Args:
        s1: 2-D NumPy array passed to R as the first matrix (rows x columns).
        s2: 2-D NumPy array passed to R as the second matrix.
        window: currently unused; kept so existing calls keep working.
            NOTE(review): presumably meant to become the ``window_size``
            argument -- confirm before wiring it in.

    Returns:
        The first element of the alignment's ``distance`` entry.
    """
    # Imported here because this name is not imported anywhere in the
    # visible file.
    from rpy2.robjects.functions import SignatureTranslatedFunction

    R = rpy2.robjects.r
    # Imported for its side effect (presumably so that the package's R
    # functions resolve below); the returned handle itself is not needed.
    importr('dtw')
    # Wrap in a local instead of assigning to R.dtw, so repeated calls do not
    # keep re-wrapping a previously wrapped function.
    dtw_fn = SignatureTranslatedFunction(
        R.dtw, init_prm_translate={'window_size': 'window.size'})

    template_rows, template_cols = s1.shape
    query_rows, query_cols = s2.shape
    templateR = R.matrix(s1, nrow=template_rows, ncol=template_cols)
    queryR = R.matrix(s2, nrow=query_rows, ncol=query_cols)
    alignment = dtw_fn(templateR, queryR, keep=True,
                       step_pattern=R.rabinerJuangStepPattern(4, "c"),
                       open_begin=True, open_end=True)
    return alignment.rx('distance')[0][0]

from tslearn import metrics
# from numba import jit
# @jit
def compute_manipulation_scoresv2(t_dict, distance_fn=None):
    """Score every sequence by its mean distance to all other sequences.

    Args:
        t_dict: mapping of sequence id -> array holding that sequence. It is
            only read, never modified.
        distance_fn: symmetric callable ``f(a, b) -> number`` used to compare
            two sequences. Defaults to ``metrics.dtw``.

    Returns:
        A new dict with the same keys as ``t_dict``. Each value is the mean
        of the distances between that sequence and every other sequence, or
        NaN when there is no other sequence to compare with.
    """
    if distance_fn is None:
        distance_fn = metrics.dtw

    seq_ids = list(t_dict)
    print('Function started')
    start_time = time.perf_counter()

    # Accumulate into a separate dict: writing scores back into t_dict while
    # the comparison is running would feed scalars into later distance calls.
    totals = dict.fromkeys(seq_ids, 0.0)
    for position, left_id in enumerate(seq_ids):
        print(f"Comparing sequence number: {position}")
        # The distance is symmetric, so each unordered pair is evaluated once
        # and credited to both sequences.
        for right_id in seq_ids[position + 1:]:
            distance = distance_fn(t_dict[left_id], t_dict[right_id])
            totals[left_id] += distance
            totals[right_id] += distance

    n_others = len(seq_ids) - 1
    scores = {
        seq_id: (total / n_others if n_others > 0 else float('nan'))
        for seq_id, total in totals.items()
    }
    print(f"Time elapsed is {time.perf_counter() - start_time}")
    return scores

# Run the pairwise comparison over every sequence array in trade_dict.
dist_scores = compute_manipulation_scoresv2(trade_dict)

# Map values to dataframe

Collecting tslearn
[?25l  Downloading https://files.pythonhosted.org/packages/a7/67/aa3149fdfef2582d881ce4a5117c9e6a465d5082dd57866904ca508a157c/tslearn-0.4.1-cp36-cp36m-manylinux2010_x86_64.whl (770kB)
[K     |████████████████████████████████| 778kB 3.7MB/s 
Installing collected packages: tslearn
Successfully installed tslearn-0.4.1
Time elapsed is 1.3175112900000272


R[write to console]: Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

R[write to console]: also installing the dependency ‘proxy’


R[write to console]: trying URL 'https://cran.rstudio.com/src/contrib/proxy_0.4-24.tar.gz'

R[write to console]: Content type 'application/x-gzip'
R[write to console]:  length 115932 bytes (113 KB)

R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to con

Function started
Comparing sequence number: 1
Comparing sequence number: 2
Comparing sequence number: 3
Comparing sequence number: 4
Comparing sequence number: 5
Comparing sequence number: 6
Comparing sequence number: 7
Comparing sequence number: 8
Comparing sequence number: 9
Comparing sequence number: 10
Comparing sequence number: 11
Comparing sequence number: 12
Comparing sequence number: 13
Comparing sequence number: 14
Comparing sequence number: 15
Comparing sequence number: 16
Comparing sequence number: 17
Comparing sequence number: 18
Comparing sequence number: 19
Comparing sequence number: 20
Comparing sequence number: 21
Comparing sequence number: 22
Comparing sequence number: 23
Comparing sequence number: 24
Comparing sequence number: 25
Comparing sequence number: 26
Comparing sequence number: 27
Comparing sequence number: 28
Comparing sequence number: 29
Comparing sequence number: 30
Comparing sequence number: 31
Comparing sequence number: 32
Comparing sequence number: 33
Co

In [None]:
# Reduce every entry of dist_scores to its mean and leave out the entries
# whose mean exceeds 1E308 (i.e. infinity); all other entries are kept.
dist_scorev2 = {
    key: mean_score
    for key, mean_score in ((key, np.mean(value)) for key, value in dist_scores.items())
    if not mean_score > 1E308
}

#Plot data on histogram
# import matplotlib.pyplot as plt
# dist_vals = dist_scores.values()
# interp_df = interp_df.replace([np.inf, -np.inf],np.nan)

# plt.hist(dist_scores.values())
# dist_scores.values().dropna()
# From here on dist_scores refers to the filtered mapping.
dist_scores = dist_scorev2

In [None]:
# dist_scores is keyed by sequence id (its keys originate from the
# 'sequence_id' column in trade_dict_gen), so the lookup has to go through
# sequence_id. Mapping row_id only matched rows whose row number happened to
# equal a sequence id and left the rest of the column NaN.
interp_df['dist_scores'] = interp_df['sequence_id'].map(dist_scores)
display(dist_scores)
interp_df.head()

{13.0: 326.2995735197645,
 28.0: 1.6042574310684763e+105,
 37.0: 8.374221875737821e+46,
 60.0: 1.2619348610690474e+94,
 61.0: 1.2619348610690474e+94,
 74.0: 4659012.097830499,
 75.0: 6.098214223282816e+17,
 111.0: 2.1173403709211906e+91,
 114.0: 6.743612981069505e+146,
 125.0: 6.743612981069505e+146,
 131.0: 6.743612981069505e+146,
 135.0: 6.743612981069505e+146,
 153.0: 14.352191348376829,
 155.0: 2.7622915068593546e+145,
 184.0: 2.7286695279739906e+34,
 208.0: 7.276884221650112e+53,
 212.0: 1.4551953175013842e+87,
 213.0: 1017346529681881.8,
 218.0: 1.940617705164745e+73,
 237.0: 3.185325428481723e+138,
 252.0: 1.5941995504834003e+105,
 280.0: 1.9053949144445088e+141,
 297.0: 1.9053949144445088e+141,
 323.0: 1.1158419530274029e+33,
 334.0: 38905428678645.04,
 338.0: 3.256069975108627e+70,
 352.0: 65103254851.82624,
 373.0: 1.3337382295263114e+69,
 401.0: 2.1973493803891645e+134,
 433.0: 3.431480700469133e+45,
 443.0: 9.000693764737608e+132,
 502.0: 3.550878643338005e+88,
 508.0: 3.55

Unnamed: 0,row_id,participation_rate,Volume,Strike,sequence_id,CCYPAIR,trade_date,Date,tradeId,subseq_id,dist_scores
0,424453.0,0.275216,907718.0,1.079587,714.0,EURUSD,2020-08-28,2020-08-28 00:01:25,nwfOErP-04,1,
1,424454.0,0.799262,23841916.0,1.110936,714.0,EURUSD,2020-08-28,2020-08-28 00:03:50,fDOhkMs-39,2,
2,424455.0,0.021166,35412792.0,1.04711,714.0,EURUSD,2020-08-28,2020-08-28 00:06:15,NtqaiMM-38,3,
3,424456.0,0.233372,23671732.0,0.990246,714.0,EURUSD,2020-08-28,2020-08-28 00:08:40,FgdrGIq-69,4,
4,424457.0,0.33901,60632800.0,1.031385,714.0,EURUSD,2020-08-28,2020-08-28 00:11:05,pXQWVph-56,5,


In [None]:
# dist_vals[dist_vals < 1E308]
# Plot the scores available
import plotly.express as px
# minds = min(dist_scores.values())
# maxds = np.max(np.array(dist_scores.values()))
# minds = min(dist_scores.keys(), key=(lambda k: dist_scores[k]))
# binnums = (maxds-minds)/len(dist_scores)

# Histogram of the per-sequence scores, normalised to probabilities, with a
# logarithmic y axis.
# NOTE(review): dist_scores.values() is a dict view rather than a list or
# array -- confirm plotly accepts it, or wrap it in list().
fig = px.histogram(dist_scores.values(),
#                    bins=range(maxds,minds,binnums),
                   histnorm='probability',
                   title='Histogram of DTW Distances',
                   labels={'value':'DTW distances'},
                  opacity=0.8,
                   log_y=True, # represent bars with log scale
                   color_discrete_sequence=['indianred'])
fig.show()
# dist_scorev2