# Parsing of Bitcoin and more

BRAUX Owen and CAMBIER Elliot

## Environment Setup and Raw Data Loading

In [2]:
from pyspark.sql import SparkSession
import os

In [3]:
spark = SparkSession.builder \
    .appName("BDA - Bitcoin Block Parsing") \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/14 11:31:05 WARN Utils: Your hostname, OBPC, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/11/14 11:31:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/14 11:31:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/14 11:31:07 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# needed : RDD operations / binaryFiles
sc = spark.sparkContext

# path
blocks_path = "../data/blocks/blocks/blk*.dat"

print(f"Targeting block files at: {blocks_path}")

# Load the binary files into an RDD
raw_blocks_rdd = sc.binaryFiles(blocks_path)

# Validate the load
file_count = raw_blocks_rdd.count()
print(f"Number of 'blk' files loaded: {file_count}")

# Show the file names (keys) to verify we grabbed the right ones
file_names = raw_blocks_rdd.keys().take(5)
print("Sample file names:")
for name in file_names:
    print(name)

Targeting block files at: ../data/blocks/blocks/blk*.dat


                                                                                

Number of 'blk' files loaded: 8


[Stage 2:>                                                          (0 + 2) / 2]

Sample file names:
file:/home/owenb/big_data/Project_Big_Data_Analytics/data/blocks/blocks/blk00013.dat
file:/home/owenb/big_data/Project_Big_Data_Analytics/data/blocks/blocks/blk00014.dat
file:/home/owenb/big_data/Project_Big_Data_Analytics/data/blocks/blocks/blk00015.dat
file:/home/owenb/big_data/Project_Big_Data_Analytics/data/blocks/blocks/blk00016.dat
file:/home/owenb/big_data/Project_Big_Data_Analytics/data/blocks/blocks/blk00017.dat


                                                                                

## Bitcoin Binary Parser Class

    Helper class to decode raw Bitcoin block headers and transactions.

In [5]:
import binascii
import struct
from datetime import datetime

class BlockchainParser:
    def __init__(self, raw_data):
        self.data = raw_data
        self.offset = 0

    def read_bytes(self, n):
        out = self.data[self.offset : self.offset + n]
        self.offset += n
        return out

    def read_uint32(self):
        return struct.unpack('<I', self.read_bytes(4))[0]

    def read_int32(self):
        return struct.unpack('<i', self.read_bytes(4))[0]

    def read_uint64(self):
        return struct.unpack('<Q', self.read_bytes(8))[0]

    def read_varint(self):
        i = self.read_bytes(1)[0]
        if i < 0xfd:
            return i
        elif i == 0xfd:
            return struct.unpack('<H', self.read_bytes(2))[0]
        elif i == 0xfe:
            return struct.unpack('<I', self.read_bytes(4))[0]
        else:
            return struct.unpack('<Q', self.read_bytes(8))[0]

    def parse_block(self):
        # Verify Magic Bytes (Network identifier)
        magic = self.read_bytes(4)
        if magic != b'\xf9\xbe\xb4\xd9': # Mainnet magic bytes
            return None 
        
        size = self.read_uint32()

        version = self.read_int32()
        prev_block = self.read_bytes(32)[::-1].hex()
        merkle_root = self.read_bytes(32)[::-1].hex()
        timestamp = self.read_uint32()
        bits = self.read_uint32()
        nonce = self.read_uint32()

        tx_count = self.read_varint()
        
        transactions = []
        for _ in range(tx_count):
            transactions.append(self.parse_transaction(timestamp))
            
        return {
            "prev_block_hash": prev_block,
            "timestamp": timestamp,
            "nonce": nonce,
            "n_transactions": tx_count,
            "transactions": transactions
        }

    def parse_transaction(self, block_ts):

        start_offset = self.offset
        
        version = self.read_int32()
        
        # Inputs
        n_inputs = self.read_varint()
        inputs = []
        for _ in range(n_inputs):
            tx_hash = self.read_bytes(32)[::-1].hex()
            vout_idx = self.read_uint32()
            script_len = self.read_varint()
            script_sig = self.read_bytes(script_len)
            sequence = self.read_uint32()
            inputs.append({"prev_tx_hash": tx_hash, "prev_out_idx": vout_idx})
            
        # Outputs
        n_outputs = self.read_varint()
        outputs = []
        total_amount = 0
        for _ in range(n_outputs):
            amount = self.read_uint64() # Satoshi
            pk_script_len = self.read_varint()
            pk_script = self.read_bytes(pk_script_len)
            outputs.append({"amount": amount})
            total_amount += amount
            
        lock_time = self.read_uint32()
        
        # Calculate simplified TxID because it's too long otherwise
        return {
            "block_timestamp": block_ts,
            "n_inputs": n_inputs,
            "n_outputs": n_outputs,
            "total_amount_satoshi": total_amount,
            "total_amount_btc": total_amount / 100000000.0
        }

# Wrapper function to be used by Spark map
def parse_raw_block_file(file_data):
    filename, content = file_data
    parser = BlockchainParser(content)
    blocks = []
    
    # A .dat file can contain multiple blocks
    while parser.offset < len(content):
        try:
            block = parser.parse_block()
            if block:
                blocks.append(block)
            else:
                break # Stop if magic bytes don't match (end of file or padding)
        except Exception:
            break # Stop on parsing error (end of file)
            
    return blocks

## Test of Parsing on a Single File

In [6]:
# Take one file to test our logic
one_file = raw_blocks_rdd.take(1) 

print(f"Testing parser on file: {one_file[0][0]}")

# Apply the parser manually
parsed_blocks = parse_raw_block_file(one_file[0])

print(f"Successfully extracted {len(parsed_blocks)} blocks from this file.")

if len(parsed_blocks) > 0:
    first_block = parsed_blocks[0]
    print("\n Block Structure Sample :")
    print(f"Block Timestamp: {datetime.fromtimestamp(first_block['timestamp'])}")
    print(f"Transaction Count: {first_block['n_transactions']}")
    
    if len(first_block['transactions']) > 0:
        print("\n First Transaction Sample :")
        print(first_block['transactions'][0])

                                                                                

Testing parser on file: file:/home/owenb/big_data/Project_Big_Data_Analytics/data/blocks/blocks/blk00013.dat
Successfully extracted 1133 blocks from this file.

 Block Structure Sample :
Block Timestamp: 2012-06-15 00:35:49
Transaction Count: 413

 First Transaction Sample :
{'block_timestamp': 1339713349, 'n_inputs': 1, 'n_outputs': 1, 'total_amount_satoshi': 5025512500, 'total_amount_btc': 50.255125}
