In [1]:
import clickhouse_connect
import pandas as pd
import polars as pl

# Notebook-wide polars display options: render up to 200 characters of each
# string value and print floats using the "full" format. Both setters return
# the Config class, so they are chained into a single expression.
pl.Config.set_fmt_str_lengths(200).set_fmt_float("full")

polars.config.Config

### Direct connect to Cryptohouse
Connect to Cryptohouse with a ClickHouse client to do some initial querying and result aggregation.


### Limitations
- Can only query 1000 blocks at a time; the direct access limits are still being figured out.


```
[10:08]KemarTiti 🔮: Some of these transactions need to land sequentially

when we use Jito we put them all in the same bundle, since Jito bundles enforce the ordering, so they land in the right order

but when i looked at your data, they had were all landing (even in the same slot) but in the wrong order

you need to make sure
init vaa -> write vaa -> post price update -> close vaa lands in this order

The naive way of accomplishing this is sending the first, then wait for confirmation, then send the second etc..

Another way is Jito bundle
```

In [2]:
# Connection settings for the Cryptohouse ClickHouse server. The account uses
# an empty password, and `secure=True` is passed through to the client.
connection_settings = {
    'host': 'crypto-clickhouse.clickhouse.com',
    'username': 'crypto',
    'password': '',
    'secure': True,
}

# Open the client that the query cells below use.
client = clickhouse_connect.get_client(**connection_settings)

In [15]:
# SQL text only -- nothing is executed in this cell.
#
# Selects every column of solana.transactions, ARRAY JOINed on `accounts` so each
# row carries one `account` element, and keeps rows where:
#   * the transaction's `accounts` array contains one of the two listed addresses,
#   * block_slot lies strictly between 282899000 and 282900000, and
#   * the joined account's second tuple element is true (signer).
#
# All SQL comments use `--`, the comment syntax the rest of the query already uses.
query: str = """
SELECT
  *
FROM
  solana.transactions ARRAY
  JOIN accounts AS account
WHERE
  arrayExists(
    x -> x.1 IN [
        'Fc8bpeCMifWYv97pQ3k5xDvd98nuVg6yAaZrwmy4RRp6', -- zeta dex address
        'pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT' -- pyth price feed
    ],
    accounts
  )
  -- The block_slot range 282,300,000 to 282,900,000 is about 3 days and is a known range where problems were being experienced
  AND block_slot > 282899000
  AND block_slot < 282900000
  -- AND block_timestamp >= now() - INTERVAL 5 MINUTE
  AND account.2 = true -- Filter for signer accounts only
"""

In [16]:
# Send `query` to the server through `client` and keep the result as a pandas DataFrame.
results: pd.DataFrame = client.query_df(query)

In [18]:
# Convert the pandas result into polars; the aggregation cells below all work on `df`.
df = pl.from_pandas(results)

In [19]:
# Count rows for every distinct (status, log_messages) pair, most frequent first.
(
    df
    .group_by('status', 'log_messages')
    .agg(pl.len().alias('count'))
    .sort(by='count', descending=True)
)

status,log_messages,count
str,list[str],u32
"""0""","[""Program ComputeBudget111111111111111111111111111111 invoke [1]"", ""Program ComputeBudget111111111111111111111111111111 success"", … ""Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD failed: custom program error: 0x1819""]",702
"""1""","[""Program 11111111111111111111111111111111 invoke [1]"", ""Program 11111111111111111111111111111111 success"", … ""Program ComputeBudget111111111111111111111111111111 success""]",410
"""0""","[""Program pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT invoke [1]"", ""Program log: Instruction: UpdatePriceFeed"", … ""Program pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT failed: custom program error: 0x1777""]",134
"""0""","[""Program HDwcJBJXjL9FpJ7UBsYBtaDjsBUhuLCUYoz3zr8SWWaQ invoke [1]"", ""Program log: Instruction: WriteEncodedVaa"", … ""Program HDwcJBJXjL9FpJ7UBsYBtaDjsBUhuLCUYoz3zr8SWWaQ failed: custom program error: 0xbbb""]",82
"""1""","[""Program HDwcJBJXjL9FpJ7UBsYBtaDjsBUhuLCUYoz3zr8SWWaQ invoke [1]"", ""Program log: Instruction: CloseEncodedVaa"", … ""Program ComputeBudget111111111111111111111111111111 success""]",82
…,…,…
"""1""","[""Program pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT invoke [1]"", ""Program log: Instruction: UpdatePriceFeed"", … ""Program ComputeBudget111111111111111111111111111111 success""]",1
"""1""","[""Program ComputeBudget111111111111111111111111111111 invoke [1]"", ""Program ComputeBudget111111111111111111111111111111 success"", … ""Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD success""]",1
"""1""","[""Program pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT invoke [1]"", ""Program log: Instruction: UpdatePriceFeed"", … ""Program ComputeBudget111111111111111111111111111111 success""]",1
"""1""","[""Program pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT invoke [1]"", ""Program log: Instruction: UpdatePriceFeed"", … ""Program ComputeBudget111111111111111111111111111111 success""]",1


In [22]:
# Blocks with the highest failure counts: count rows per (block_slot, status),
# order by count, keep only status "0" and show the ten largest.
(
    df
    .group_by('block_slot', 'status')
    .agg(pl.len().alias('count'))
    .sort(by='count', descending=True)
    .filter(pl.col('status') == "0")
    .head(10)
)

block_slot,status,count
i64,str,u32
282899126,"""0""",127
282899186,"""0""",93
282899763,"""0""",71
282899519,"""0""",62
282899762,"""0""",44
282899963,"""0""",41
282899527,"""0""",37
282899038,"""0""",32
282899967,"""0""",31
282899127,"""0""",28


In [23]:
# Same per-(block_slot, status) row count as the failure view, restricted to
# status "1" and limited to the ten largest counts.
(
    df
    .group_by('block_slot', 'status')
    .agg(pl.len().alias('count'))
    .sort(by='count', descending=True)
    .filter(pl.col('status') == "1")
    .head(10)
)

block_slot,status,count
i64,str,u32
282899246,"""1""",36
282899583,"""1""",27
282899082,"""1""",27
282899123,"""1""",24
282899186,"""1""",24
282899967,"""1""",21
282899435,"""1""",20
282899417,"""1""",20
282899219,"""1""",20
282899886,"""1""",20


### Group by order of log messages and success rates per block_slot

In [32]:
# Per-status row counts for slot 282899246, which only has successes.
# The variable name (including its spelling) is kept because the next cell displays it.
high_sucess_block = (
    df
    .filter(pl.col('block_slot') == 282899246)
    .group_by('block_slot', 'status')
    .agg(pl.len().alias('count'))
    .sort(by='count', descending=True)
)

In [33]:
# Display the per-status counts computed for the success-only slot.
high_sucess_block

block_slot,status,count
i64,str,u32
282899246,"""1""",36


In [34]:
# Per-status row counts for slot 282899126, the slot with the most status "0" rows
# in the failure ranking above.
high_fail_block = (
    df
    .filter(pl.col('block_slot') == 282899126)
    .group_by('block_slot', 'status')
    .agg(pl.len().alias('count'))
    .sort(by='count', descending=True)
)
high_fail_block

block_slot,status,count
i64,str,u32
282899126,"""0""",127
282899126,"""1""",15


In [41]:
# Keep only the rows of slot 282899246 (the success-only slot inspected above).
success_df = df.filter(pl.col('block_slot') == 282899246)

In [43]:
# Count how often each exact log_messages list occurs within the success-only slot.
(
    success_df
    .group_by('log_messages')
    .agg(pl.len().alias('count'))
    .sort(by='count', descending=True)
)

log_messages,count
list[str],u32
"[""Program ComputeBudget111111111111111111111111111111 invoke [1]"", ""Program ComputeBudget111111111111111111111111111111 success"", … ""Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD success""]",1
"[""Program HDwcJBJXjL9FpJ7UBsYBtaDjsBUhuLCUYoz3zr8SWWaQ invoke [1]"", ""Program log: Instruction: WriteEncodedVaa"", … ""Program ComputeBudget111111111111111111111111111111 success""]",1
"[""Program ComputeBudget111111111111111111111111111111 invoke [1]"", ""Program ComputeBudget111111111111111111111111111111 success"", … ""Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD success""]",1
"[""Program ComputeBudget111111111111111111111111111111 invoke [1]"", ""Program ComputeBudget111111111111111111111111111111 success"", … ""Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD success""]",1
"[""Program HDwcJBJXjL9FpJ7UBsYBtaDjsBUhuLCUYoz3zr8SWWaQ invoke [1]"", ""Program log: Instruction: WriteEncodedVaa"", … ""Program ComputeBudget111111111111111111111111111111 success""]",1
…,…
"[""Program HDwcJBJXjL9FpJ7UBsYBtaDjsBUhuLCUYoz3zr8SWWaQ invoke [1]"", ""Program log: Instruction: WriteEncodedVaa"", … ""Program ComputeBudget111111111111111111111111111111 success""]",1
"[""Program pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT invoke [1]"", ""Program log: Instruction: UpdatePriceFeed"", … ""Program ComputeBudget111111111111111111111111111111 success""]",1
"[""Program pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT invoke [1]"", ""Program log: Instruction: UpdatePriceFeed"", … ""Program ComputeBudget111111111111111111111111111111 success""]",1
"[""Program ComputeBudget111111111111111111111111111111 invoke [1]"", ""Program ComputeBudget111111111111111111111111111111 success"", … ""Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD success""]",1


In [44]:
# Emit one dict per row holding the `index` column and the full `log_messages` list.
(
    success_df
    .select('index', 'log_messages')
    .to_dicts()
)

[{'index': 209,
  'log_messages': ['Program ComputeBudget111111111111111111111111111111 invoke [1]',
   'Program ComputeBudget111111111111111111111111111111 success',
   'Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD invoke [1]',
   'Program log: Instruction: UpdatePricingV2',
   'Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD consumed 20398 of 199850 compute units',
   'Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD success']},
 {'index': 810,
  'log_messages': ['Program pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT invoke [1]',
   'Program log: Instruction: UpdatePriceFeed',
   'Program rec5EKMGg6MxZYaMdyBfgwp4d5rB9T1VQH5pJv5LtFJ invoke [2]',
   'Program log: Instruction: PostUpdate',
   'Program 11111111111111111111111111111111 invoke [3]',
   'Program 11111111111111111111111111111111 success',
   'Program rec5EKMGg6MxZYaMdyBfgwp4d5rB9T1VQH5pJv5LtFJ consumed 26779 of 68950 compute units',
   'Program rec5EKMGg6MxZYaMdyBfgwp4d5rB9T1VQH5pJv5LtFJ success',
   'Program pyt

### Not sure if it makes sense to use `CountVectorizer` here...

In [48]:
from collections import defaultdict
from itertools import combinations
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Sample data: two hand-copied records, each with an `index` and its list of log lines.
data = [
    {
        'index': 209,
        'log_messages': [
            'Program ComputeBudget111111111111111111111111111111 invoke [1]',
            'Program ComputeBudget111111111111111111111111111111 success',
            'Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD invoke [1]',
            'Program log: Instruction: UpdatePricingV2',
            'Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD consumed 20398 of 199850 compute units',
            'Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD success',
        ],
    },
    {
        'index': 810,
        'log_messages': [
            'Program pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT invoke [1]',
            'Program log: Instruction: UpdatePriceFeed',
            'Program rec5EKMGg6MxZYaMdyBfgwp4d5rB9T1VQH5pJv5LtFJ invoke [2]',
            'Program log: Instruction: PostUpdate',
            'Program 11111111111111111111111111111111 invoke [3]',
            'Program 11111111111111111111111111111111 success',
            'Program rec5EKMGg6MxZYaMdyBfgwp4d5rB9T1VQH5pJv5LtFJ consumed 26779 of 68950 compute units',
            'Program rec5EKMGg6MxZYaMdyBfgwp4d5rB9T1VQH5pJv5LtFJ success',
            'Program pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT consumed 43829 of 85000 compute units',
            'Program pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT success',
            'Program HDwcJBJXjL9FpJ7UBsYBtaDjsBUhuLCUYoz3zr8SWWaQ invoke [1]',
            'Program log: Instruction: CloseEncodedVaa',
            'Program HDwcJBJXjL9FpJ7UBsYBtaDjsBUhuLCUYoz3zr8SWWaQ consumed 2559 of 41171 compute units',
            'Program HDwcJBJXjL9FpJ7UBsYBtaDjsBUhuLCUYoz3zr8SWWaQ success',
            'Program ComputeBudget111111111111111111111111111111 invoke [1]',
            'Program ComputeBudget111111111111111111111111111111 success',
        ],
    },
    # More data...
]

# Flatten every record's log lines into one space-separated document.
log_messages_strings = []
for record in data:
    log_messages_strings.append(" ".join(record['log_messages']))

# Binary bag-of-words encoding, one row per document.
# Note: despite its name, `vectorizer` holds the matrix returned by fit_transform,
# not the CountVectorizer object itself.
vectorizer = CountVectorizer(binary=True).fit_transform(log_messages_strings)
vectors = vectorizer.toarray()

# Symmetric pairwise Jaccard similarity; the diagonal is left at its initial 0.
n_logs = len(vectors)
similarity_matrix = np.zeros((n_logs, n_logs))

for row, col in combinations(range(n_logs), 2):
    pair_score = jaccard_score(vectors[row], vectors[col])
    similarity_matrix[row, col] = pair_score
    similarity_matrix[col, row] = pair_score

# Greedy grouping: a record joins the first existing group in which it is more
# similar than `threshold` to every current member; otherwise it opens a new
# group whose id is the next integer (ids start at 1).
threshold = 0.5  # Set a similarity threshold
groups = defaultdict(list)

for candidate in range(len(similarity_matrix)):
    for members in groups.values():
        if all(similarity_matrix[candidate, member] > threshold for member in members):
            members.append(candidate)
            break
    else:
        # No existing group accepted this record.
        groups[len(groups) + 1].append(candidate)

# Print each group followed by the original index and log lines of its members.
for label, member_positions in groups.items():
    print(f"Group {label}:")
    for position in member_positions:
        entry = data[position]
        print(f"Index {entry['index']}: {entry['log_messages']}")
    print()


Group 1:
Index 209: ['Program ComputeBudget111111111111111111111111111111 invoke [1]', 'Program ComputeBudget111111111111111111111111111111 success', 'Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD invoke [1]', 'Program log: Instruction: UpdatePricingV2', 'Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD consumed 20398 of 199850 compute units', 'Program ZETAxsqBRek56DhiGXrn75yj2NHU3aYUnxvHXpkf3aD success']

Group 2:
Index 810: ['Program pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT invoke [1]', 'Program log: Instruction: UpdatePriceFeed', 'Program rec5EKMGg6MxZYaMdyBfgwp4d5rB9T1VQH5pJv5LtFJ invoke [2]', 'Program log: Instruction: PostUpdate', 'Program 11111111111111111111111111111111 invoke [3]', 'Program 11111111111111111111111111111111 success', 'Program rec5EKMGg6MxZYaMdyBfgwp4d5rB9T1VQH5pJv5LtFJ consumed 26779 of 68950 compute units', 'Program rec5EKMGg6MxZYaMdyBfgwp4d5rB9T1VQH5pJv5LtFJ success', 'Program pythWSnswVUd12oZpeFP8e9CVaEqJg25g1Vtc2biRsT consumed 43829 of 85000 com