# Model Validation

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import mean_squared_error
import time
import queue as Q

In [2]:
# load the data 
data1 = pd.read_csv("data/TimetxinBlock621500.csv")
data1.columns=['index','inputs','outputs','trans_version','trans_size','trans_weight','received_time','relay_node','locktime','trans_fee','confirmed_block_height','index_block_height','confirm_time','waiting_time','feerate','enter_block_height','waiting_block_num','valid_time','valid_block_height','valid_waiting','last_block_interval']
data2 = pd.read_csv("data/TimetxinBlock622000.csv")
data2.columns=['index','inputs','outputs','trans_version','trans_size','trans_weight','received_time','relay_node','locktime','trans_fee','confirmed_block_height','index_block_height','confirm_time','waiting_time','feerate','enter_block_height','waiting_block_num','valid_time','valid_block_height','valid_waiting','last_block_interval']
data3 = pd.read_csv("data/TimetxinBlock622500.csv")
data3.columns=['index','inputs','outputs','trans_version','trans_size','trans_weight','received_time','relay_node','locktime','trans_fee','confirmed_block_height','index_block_height','confirm_time','waiting_time','feerate','enter_block_height','waiting_block_num','valid_time','valid_block_height','valid_waiting','last_block_interval']
# combine the data into one dataframe
data = pd.concat([data1, data2, data3], ignore_index = True)
# data1.columns=['index','inputs','outputs','trans_version','trans_size','trans_weight','received_time','relay_node','locktime','trans_fee','confirmed_block_height','index_block_height','confirm_time','waiting_time','feerate','enter_block_height','waiting_block_num','valid_time','valid_block_height','valid_waiting','last_block_interval']
print(data.shape)
data.head(10)

(3389593, 21)


Unnamed: 0,index,inputs,outputs,trans_version,trans_size,trans_weight,received_time,relay_node,locktime,trans_fee,...,index_block_height,confirm_time,waiting_time,feerate,enter_block_height,waiting_block_num,valid_time,valid_block_height,valid_waiting,last_block_interval
0,0,1,2,1,223,892,1583791819,0,0,2486,...,0,1583794214,2395,11.147982,620998,4,1583792000.0,620998,2395.0,908.0
1,0,1,2,1,223,892,1583793885,0,0,2486,...,0,1583794214,329,11.147982,621001,1,1583794000.0,621001,329.0,119.0
2,0,1,1,1,223,892,1583793700,0,0,2486,...,0,1583794214,514,11.147982,621000,2,1583794000.0,621000,514.0,1376.0
3,0,1,2,1,223,892,1583791354,0,0,2486,...,0,1583794214,2860,11.147982,620998,4,1583791000.0,620998,2860.0,443.0
4,0,1,2,1,223,892,1583791731,0,0,2486,...,0,1583794214,2483,11.147982,620998,4,1583792000.0,620998,2483.0,820.0
5,0,2,2,1,371,1484,1583792024,0,0,4136,...,0,1583794214,2190,11.148248,620998,4,1583792000.0,620998,2190.0,1113.0
6,0,2,2,1,371,1484,1583793603,0,0,4136,...,0,1583794214,611,11.148248,621000,2,1583794000.0,621000,611.0,1279.0
7,0,2,2,1,371,1484,1583793883,0,0,4136,...,0,1583794214,331,11.148248,621001,1,1583794000.0,621001,331.0,117.0
8,0,2,2,1,371,1484,1583791921,0,0,4136,...,0,1583794214,2293,11.148248,620998,4,1583792000.0,620998,2293.0,1010.0
9,0,3,2,1,518,2072,1583793502,0,0,5775,...,0,1583794214,712,11.148649,621000,2,1583794000.0,621000,712.0,1178.0


# Model Inputs

In [21]:
# Build sample model data/output
COLS_TO_USE = [4,5,6,9,10,12,13,14,15,16]
HEADER_NAMES = ['size','weight','recieved_time','fee','confirmed_block_height','confirm_time','waiting_time','fee_rate','enter_block_height','no_block_confirm']

d1 = pd.read_csv('data/TimetxinBlock621500.csv', usecols=COLS_TO_USE, names=HEADER_NAMES)
d2 = pd.read_csv('data/TimetxinBlock622000.csv', usecols=COLS_TO_USE, names=HEADER_NAMES)
d3 = pd.read_csv('data/TimetxinBlock622500.csv', usecols=COLS_TO_USE, names=HEADER_NAMES)
dataFrame = pd.concat([d1, d2, d3])

transaction_count = len(dataFrame)

dataFrame.head(20)
P_GROUP_BOUNDS = [1,3,10]
block_confirm_max = dataFrame['no_block_confirm'].max()

df_p1 = dataFrame[dataFrame['no_block_confirm'].between(0, P_GROUP_BOUNDS[0])]
df_p2 = dataFrame[dataFrame['no_block_confirm'].between((P_GROUP_BOUNDS[0]+1), P_GROUP_BOUNDS[1])]
df_p3 = dataFrame[dataFrame['no_block_confirm'].between((P_GROUP_BOUNDS[1]+1), P_GROUP_BOUNDS[2])]
df_p4 = dataFrame[dataFrame['no_block_confirm'].between((P_GROUP_BOUNDS[2]+1), block_confirm_max)]

transaction_count_p1 = len(df_p1)
transaction_count_p2 = len(df_p2)
transaction_count_p3 = len(df_p3)
transaction_count_p4 = len(df_p4)

lambda_p1 = float(transaction_count_p1) / (float(df_p1['recieved_time'].max()) - float(df_p1['recieved_time'].min()))
#print("lambda_p1: ", lambda_p1)
lambda_p2 = float(transaction_count_p2) / (float(df_p2['recieved_time'].max()) - float(df_p2['recieved_time'].min()))
#print("lambda_p2: ", lambda_p2)
lambda_p3 = float(transaction_count_p3) / (float(df_p3['recieved_time'].max()) - float(df_p3['recieved_time'].min()))
#print("lambda_p3: ", lambda_p3)
lambda_p4 = float(transaction_count_p4) / (float(df_p4['recieved_time'].max()) - float(df_p4['recieved_time'].min()))
#print("lambda_p4: ", lambda_p4)

service_time = 600
mu = 1/service_time
#print("Mu: ", mu)

block_groups = dataFrame.groupby(['confirmed_block_height'])['confirmed_block_height'].count()
mean_block_size = float(round(block_groups.mean()))
#print("Mean Block Size: ", mean_block_size)

# Model Output/Historical Data Comparison

## Transaction confirmation times

In [23]:
# Purpose: provide an overview of the transaction's accuracy.
# Average transaction times overall

pd.set_option('display.float_format', lambda x: '%.5f' % x)
print(data["waiting_time"].describe())
# potentially strip very high values?
# Average transaction time distribution(?)
# data["waiting_time"].plot.hist()

count   3389593.00000
mean       5059.36709
std       15020.12713
min         -10.00000
25%         291.00000
50%         764.00000
75%        2262.00000
max      252015.00000
Name: waiting_time, dtype: float64


## Split by date ranges

In [5]:
# Purpose: investigate fluctuation over time, but also seasonal fluctuation, and whether or not the model can capture that.
# Averages per month

# Averages per month visualised



## Priority category

This section compares the mean wait times of each priority category of the historical data, to the output of the model.

There are four priority categories. These are configurable, and the exact details of these can be seen in the Model Inputs section.

In [20]:
# Purpose: see how accurately the model can capture the behaviour of transactions, based on transaction-priority.
# Print data split into queues (above in model loading section)
print("Historical Data: ")
print("Wait Times P1: ", df_p1["waiting_time"].mean())
print("Wait Times P2: ", df_p2["waiting_time"].mean())
print("Wait Times P3: ", df_p3["waiting_time"].mean())
print("Wait Times P4: ", df_p4["waiting_time"].mean())

# From "QueueWaitTimes.ipynb"

print("\nModel Output: ")

current_lambda = 0.0

def NewtonMethod(lam, m_u, block_size, x0, epsilon, max_iteration):

    fx = lambda x: lam*(1 - x) - m_u*x*(1 - x**block_size)
    dfx = lambda x: m_u*(block_size*x**block_size + x**block_size - 1) - lam

    xn = x0
    for n in range(0, max_iteration):
        fxn = fx(xn)
        if abs(fxn) < epsilon:
            return xn
        
        dfxn = dfx(xn)
        if dfxn == 0:
            return None

        xn = xn - fxn/dfxn

    return None

z1 = NewtonMethod(lambda_p1, mu, mean_block_size, 0, 1e-10, 500)
L1 = z1 / (1 - z1)
W1 = L1 / lambda_p1
#print("z1: ", z1)
#print("L1: ", L1)
print("Wait Times P1: ", W1)

z2 = NewtonMethod((lambda_p1+lambda_p2), mu, mean_block_size, 0, 1e-10, 500)
L2 = (z2 / (1 - z2)) - L1
W2 = L2 / lambda_p2
#print("z2: ", z2)
#print("L2: ", L2)
print("Wait Times P2: ", W2)

z3 = NewtonMethod((lambda_p1+lambda_p2+lambda_p3), mu, mean_block_size, 0, 1e-10, 500)
L3 = (z3 / (1 - z3)) - L2
W3 = L3 / lambda_p3
#print("z3: ", z3)
#print("L3: ", L3)
print("Wait Times P3: ", W3)

z4 = NewtonMethod((lambda_p1+lambda_p2+lambda_p3+lambda_p4), mu, mean_block_size, 0, 1e-10, 500)
L4 = (z4 / (1 - z4)) - L3
W4 = L4 / lambda_p4
#print("z4: ", z4)
#print("L4: ", L4)
print("Wait Times P4: ", W4)

Historical Data: 
Wait Times P1:  561.6191699181529
Wait Times P2:  1484.457933865015
Wait Times P3:  4118.161191411425
Wait Times P4:  33988.67494356094

Model Output: 
Wait Times P1:  760.9978599203225
Wait Times P2:  1791.881560837535
Wait Times P3:  7386.068569776693
Wait Times P4:  10155.210829302845


## High/low traffic load analysis

In [None]:
# Purpose: investigate how well the model can capture periods of high/low traffic.

# Model Validation Metrics

## 1. Monte-Carlo simulation analysis

## Simulation Model Inputs:

In [25]:
# lambda calculation - arrival rate 
total_trans_num = len(data.index)
print("Total number of transactions: ", total_trans_num)

data_sort_by_receive = data.sort_values(by='received_time')
first_arrive_time=float(data['received_time'].min())
last_arrive_time=float(data['received_time'].max())
q_lambda=float(total_trans_num)/(last_arrive_time-first_arrive_time)
mean_interarrival_time=1/q_lambda
print("lambda is: ", q_lambda)

# mu - service rate
service_time_mean=600
mu = 1/service_time_mean
print("Mu is: ", mu)

# total time (seconds)
simulation_time = 6000
print("Total simulation time is: ", simulation_time)

# mempool capacity
trans_size=535
mempool_size=300*1000000
capacity = mempool_size/trans_size
print("Mempool capacity is ", capacity, " transactions")

# block size 
b=1960
print("Block size is: ", b)

# store feerates in a list to feed in the model later 
feerates=data.feerate.tolist()

# initiate results 
results=[]

# This function is to calcutate the feerate bondary of each priority group 

WAIT_BLOCK_NUM_UPPER_BOUNDARY = [1,3,10]

length=len(WAIT_BLOCK_NUM_UPPER_BOUNDARY)

# this function return a list with the feerate boundaries for each priority group
def calculate_feerate_for_priority_groups(data, upper_boundary):
    feerate_range = []
    transaction_counts=0
    for i in range(length):
        transaction_counts = len(data[data['waiting_block_num']<=upper_boundary[i]])
        boundary = data.feerate.nlargest(transaction_counts).iloc[-1]
        feerate_range.append(boundary)
    return feerate_range
            

feerate_range=calculate_feerate_for_priority_groups(data, WAIT_BLOCK_NUM_UPPER_BOUNDARY)
print("one block waited ration: ",feerate_range)

Total number of transactions:  3389593
lambda is:  3.083289746694162
Mu is:  0.0016666666666666668
Total simulation time is:  6000
Mempool capacity is  560747.6635514018  transactions
Block size is:  1960
one block waited ration:  [30.17714002968827, 18.529411764705888, 5.0]


## Simulation Outputs / Comparison

In [34]:
from collections import deque
from collections import namedtuple as nt

# initiate variables 
transaction=nt('transaction',['received_time','feerate','confirmed_time','arrived_after_block','confirmed_block'])

q1=deque()
q2=deque()
q3=deque()
q4=deque()

t=round(time.time())
next_arrival=t
next_confirm=next_arrival+round(np.random.exponential(service_time_mean))
block_index=0


## place holder for checking mempool size
# allocate transaction to different priority groups and populating the arrival time  

i=0    
while next_confirm <= t + simulation_time:
    while next_arrival <= next_confirm:  # transactions come in before the next batch
        feerate = feerates[i]
        if feerate>=feerate_range[0]:  
            q1.append(transaction(next_arrival,feerate,0,block_index,0))
        elif feerate>=feerate_range[1]:
            q2.append(transaction(next_arrival,feerate,0,block_index,0))
        elif feerate>=feerate_range[1]:
            q3.append(transaction(next_arrival,feerate,0,block_index,0))
        else:
            q4.append(transaction(next_arrival,feerate,0,block_index,0))        
        next_arrival += round(np.random.exponential(1/q_lambda))
        i+=1
        
    # batch processing with size b
    j=0
    block_index = block_index + 1
    while q1 and j<=b:
        # - using popleft() to delete element from left end 
        q1[0] = q1[0]._replace(confirmed_time=next_confirm, confirmed_block=block_index)
        # q1[0] = q1[0]._replace(confirmed_block=block_index)
        confirmed = q1.popleft()
        results.append(confirmed)
        j+=1
    while q2 and j<=b:
        q2[0] = q2[0]._replace(confirmed_time=next_confirm)
        q2[0] = q2[0]._replace(confirmed_block=block_index)
        confirmed = q2.popleft()
        results.append(confirmed)
        j+=1
    while q3 and j<=b:
        q3[0] = q3[0]._replace(confirmed_time=next_confirm)
        q3[0] = q3[0]._replace(confirmed_block=block_index)
        confirmed = q3.popleft()
        results.append(confirmed)
        j+=1
    while q4 and j<=b:
        q4[0] = q4[0]._replace(confirmed_time=next_confirm)
        q4[0] = q4[0]._replace(confirmed_block=block_index)
        confirmed = q4.popleft()
        results.append(confirmed)
        j+=1
        
    # calculate next confirmation time 
    next_confirm += round(np.random.exponential(service_time_mean))

       
results_df=pd.DataFrame(results,columns=transaction._fields)
results_df=results_df.apply(pd.to_numeric)
results_df["waiting_time"] = results_df["confirmed_time"] - results_df["received_time"]
results_df.head(10)

sim_p1 = results_df[results_df['confirmed_block'].between(0, P_GROUP_BOUNDS[0])]
sim_p2 = results_df[results_df['confirmed_block'].between((P_GROUP_BOUNDS[0]+1), P_GROUP_BOUNDS[1])]
sim_p3 = results_df[results_df['confirmed_block'].between((P_GROUP_BOUNDS[1]+1), P_GROUP_BOUNDS[2])]
sim_p4 = results_df[results_df['confirmed_block'].between((P_GROUP_BOUNDS[2]+1), block_confirm_max)]

print("Historical Data: ")
print("Wait Times P1: ", df_p1["waiting_time"].mean())
print("Wait Times P2: ", df_p2["waiting_time"].mean())
print("Wait Times P3: ", df_p3["waiting_time"].mean())
print("Wait Times P4: ", df_p4["waiting_time"].mean())

print("\nSimulation Output: ")
print("Wait Times P1: ", sim_p1["waiting_time"].mean())
print("Wait Times P2: ", sim_p2["waiting_time"].mean())
print("Wait Times P3: ", sim_p3["waiting_time"].mean())
print("Wait Times P4: ", sim_p4["waiting_time"].mean())

Historical Data: 
Wait Times P1:  561.6191699181529
Wait Times P2:  1484.457933865015
Wait Times P3:  4118.161191411425
Wait Times P4:  33988.67494356094

Simulation Output: 
Wait Times P1:  263.1910529187125
Wait Times P2:  454.24315476190475
Wait Times P3:  1179.7699073486483
Wait Times P4:  804.5326135852451


## 2. Root Mean Square Error

In [None]:
# (...)

rmse = mean_squared_error(actual, predicted, squared = False)

# I need to figure out:
# 1. How to compare the output of the model to the historical data in this way? 



## 3. anything else? GoF? R-Squared?