In [63]:
import json
import os
import ijson
import pandas as pd
import time
from tqdm import tqdm

def process_single_file(file_path, max_items=None, save_output=False):
    """Stream one block/transaction JSON dump and summarise the blocks in it.

    The file is parsed incrementally with ``ijson.parse`` so it never has to
    fit in memory.  An *item* is a JSON object whose ijson prefix contains
    exactly one dot (the nesting level this parser has always keyed on).  An
    item with a ``number`` key is counted as a block, one with a
    ``blockNumber`` key as a transaction; any other item is only counted in
    the progress total.  Objects nested inside an item are part of that item
    and are neither counted nor allowed to reset its state.

    Args:
        file_path: Path of the JSON file to read.
        max_items: Stop after this many items; ``None`` (or 0) reads the
            whole file.
        save_output: When true, block summaries are written to CSV files in
            the current working directory: one ``*_blocks_<a>_to_<b>.csv``
            per 1000 blocks plus a ``*_final_blocks.csv`` for the remainder.

    Returns:
        A dict with ``file``, ``block_count``, ``transaction_count`` and
        ``processing_complete`` (true only when ``max_items`` is ``None``).
        When at least one block was seen it also carries ``avg_gas_used``,
        ``avg_transactions_per_block`` and ``block_range`` computed over
        *all* blocks of the file; otherwise it carries a ``note``.  If
        anything raises, the dict holds ``file`` and ``error`` instead.
    """
    file_name = os.path.basename(file_path)
    print(f"Processing {file_name}...")

    # Strip only a trailing ".json".  Some inputs have no extension at all;
    # replacing '.json' inside such a name is a no-op and would make every
    # save overwrite one suffix-less file.
    if file_name.endswith('.json'):
        output_stem = file_name[:-len('.json')]
    else:
        output_stem = file_name

    structural_events = ('start_map', 'end_map', 'map_key', 'start_array', 'end_array')
    flush_every = 1000
    progress_interval = 1000

    block_count = 0
    transaction_count = 0
    blocks_data = []   # every block summary of this file (kept for the statistics)
    flushed = 0        # number of leading entries of blocks_data already written to CSV

    try:
        # Binary mode: ijson consumes bytes and does the UTF-8 decoding itself,
        # independent of the platform's default text encoding.
        with open(file_path, 'rb') as f:
            item_type = None
            item_data = {}
            tx_array_prefix = None   # prefix of the elements of the open item's "transactions" array
            block_tx_count = 0
            item_count = 0
            last_progress_time = time.time()

            for prefix, event, value in ijson.parse(f):
                depth = prefix.count('.')

                if depth == 1 and event == 'start_map':
                    # A new item starts; nested maps have a deeper prefix and
                    # therefore no longer wipe the item being collected.
                    item_data = {}
                    item_type = None
                    block_tx_count = 0
                    tx_array_prefix = prefix + '.transactions.item'

                elif depth == 1 and event == 'map_key':
                    # check if this is a block or transaction
                    if value == 'number':
                        item_type = 'block'
                    elif value == 'blockNumber':
                        item_type = 'transaction'

                elif depth == 1 and event == 'end_map':
                    if item_type == 'block':
                        block_count += 1

                        # get relevant block level data
                        blocks_data.append({
                            'block_number': item_data.get('number'),
                            'timestamp': item_data.get('timestamp'),
                            'gas_used': item_data.get('gasUsed'),
                            'gas_limit': item_data.get('gasLimit'),
                            'base_fee_per_gas': item_data.get('baseFeePerGas'),
                            # Length of the block's own "transactions" array,
                            # counted while streaming (0 if there is none).
                            'transaction_count': block_tx_count,
                        })

                        if save_output and block_count - flushed >= flush_every:
                            output_file = f"{output_stem}_blocks_{flushed}_to_{block_count}.csv"
                            pd.DataFrame(blocks_data[flushed:]).to_csv(output_file, index=False)
                            print(f"Saved partial block data to {output_file}")
                            flushed = block_count

                    elif item_type == 'transaction':
                        transaction_count += 1

                    item_count += 1

                    current_time = time.time()
                    if item_count % progress_interval == 0 or current_time - last_progress_time > 30:
                        print(f"Processed {item_count} items: {block_count} blocks and {transaction_count} transactions")
                        last_progress_time = current_time

                    if max_items and item_count >= max_items:
                        break

                elif depth == 2 and event not in structural_events:
                    # Scalar field directly on the item: keep it under its key.
                    item_data[prefix.rsplit('.', 1)[-1]] = value

                elif prefix == tx_array_prefix and event not in ('map_key', 'end_map', 'end_array'):
                    # One event per array element: a scalar, or the opening of
                    # a nested map/array.  Keys and closers share the prefix
                    # but do not start a new element.
                    block_tx_count += 1

        if save_output and flushed < block_count:
            output_file = f"{output_stem}_final_blocks.csv"
            pd.DataFrame(blocks_data[flushed:]).to_csv(output_file, index=False)
            print(f"Saved final block data to {output_file}")

        if blocks_data:
            # Statistics cover the whole file, not just the rows still waiting
            # to be flushed.
            df = pd.DataFrame(blocks_data)

            return {
                'file': file_name,
                'block_count': block_count,
                'transaction_count': transaction_count,
                'avg_gas_used': df['gas_used'].mean(),
                'avg_transactions_per_block': df['transaction_count'].mean(),
                'block_range': f"{df['block_number'].min()} - {df['block_number'].max()}",
                'processing_complete': max_items is None
            }

        return {
            'file': file_name,
            'block_count': block_count,
            'transaction_count': transaction_count,
            'note': "No block data was collected",
            'processing_complete': max_items is None
        }

    except Exception as e:
        # Top-level boundary for one file: report the failure to the caller
        # instead of aborting a multi-file run.
        print(f"Error processing {file_path}: {str(e)}")
        return {
            'file': file_name,
            'error': str(e)
        }

def chunked_json_reader(file_path, chunk_size=1000):
    """Yield the JSON objects stored in an array-of-arrays file, one at a time.

    The reader skips everything up to and including the second ``[`` of the
    file, then scans the rest in ``chunk_size``-character reads.  Each
    complete top-level ``{...}`` is decoded with ``json.loads`` and yielded.
    Text between objects (commas, whitespace, array brackets) is skipped, and
    iteration ends when the outer array closes or the file ends.

    Args:
        file_path: Path of the JSON file (read as UTF-8).
        chunk_size: Number of characters requested per read.

    Yields:
        The decoded value of each object.  An object that fails to decode is
        reported on stdout and skipped.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip the header: consume up to the outer '[' and then the first
        # inner '['.  Give up silently if the file ends first.
        for _ in range(2):
            char = f.read(1)
            while char != '[':
                if not char:
                    return
                char = f.read(1)

        parts = []           # characters of the object being assembled
        brace_depth = 0      # 0 means "between objects"
        array_depth = 2      # the two brackets consumed above
        in_string = False
        escape_next = False

        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break

            # All scanning works on the chunk itself; the file handle is
            # already positioned past it, so it must not be read from here.
            for char in chunk:
                if in_string:
                    if brace_depth:
                        parts.append(char)
                    if escape_next:
                        escape_next = False
                    elif char == '\\':
                        escape_next = True
                    elif char == '"':
                        in_string = False
                    continue

                if char == '{':
                    brace_depth += 1
                if brace_depth:
                    # Only text inside an object is collected; separators
                    # between objects never reach json.loads.
                    parts.append(char)

                if char == '"':
                    in_string = True
                elif char == '}' and brace_depth:
                    brace_depth -= 1
                    if brace_depth == 0:
                        buffer = ''.join(parts)
                        parts = []
                        try:
                            yield json.loads(buffer)
                        except json.JSONDecodeError:
                            print(f"Error decoding JSON object: {buffer[:100]}...")
                elif brace_depth == 0:
                    if char == '[':
                        array_depth += 1
                    elif char == ']':
                        array_depth -= 1
                        if array_depth == 0:
                            # Outer array closed: nothing more to read.
                            return

def process_all_files(file_list, sample_only=True, max_items_per_file=100):
    """Run ``process_single_file`` over each existing path and tabulate the results.

    Args:
        file_list: Iterable of file paths; paths that do not exist are
            reported and skipped.
        sample_only: When true, each file is capped at ``max_items_per_file``
            items and no per-file output is saved; when false, files are read
            in full with ``save_output`` enabled.
        max_items_per_file: Item cap used in sample mode.

    Returns:
        A ``pandas.DataFrame`` with one row per result dict collected.  After
        every processed file the rows so far are also written to
        ``blockchain_processing_progress_<n>_files.csv``.  A
        ``KeyboardInterrupt`` stops the loop and the rows gathered so far are
        returned.
    """
    results = []

    for file_path in file_list:
        # Guard clause: nothing to do for a path that is not on disk.
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        base_name = os.path.basename(file_path)
        print(f"\nProcessing file {base_name}...")
        size_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f"File size: {size_mb:.2f} MB")

        try:
            item_cap = max_items_per_file if sample_only else None
            outcome = process_single_file(
                file_path,
                max_items=item_cap,
                save_output=not sample_only,
            )
            results.append(outcome)

            print(f"Completed processing {base_name}")
            blocks_found = outcome.get('block_count', 0)
            txs_found = outcome.get('transaction_count', 0)
            print(f"Found {blocks_found} blocks and {txs_found} transactions")

            # Checkpoint the rows gathered so far, one file per checkpoint.
            if results:
                checkpoint_name = f"blockchain_processing_progress_{len(results)}_files.csv"
                pd.DataFrame(results).to_csv(checkpoint_name, index=False)

        except KeyboardInterrupt:
            print("\n Saving partial result")
            break
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")
            results.append({'file': base_name, 'error': str(e)})

    summary_df = pd.DataFrame(results)
    print("\nSummary of processed files:")
    print(summary_df)

    return summary_df

def extract_specific_data(file_path, target_block=None, target_tx=None, max_search_items=1000):
    """Look for one block or transaction among the first items of a file.

    Items come from ``chunked_json_reader``.  An item matches as a block when
    its ``number`` equals ``target_block`` and as a transaction when its
    ``hash`` equals ``target_tx``; a target left as ``None`` is not searched
    for.  The block test is tried first.

    Args:
        file_path: Path of the JSON file to search.
        target_block: Value to compare with an item's ``number``.
        target_tx: Value to compare with an item's ``hash``.
        max_search_items: Give up after examining this many items.

    Returns:
        ``{'type': 'block' | 'transaction', 'data': item}`` for the first
        match, or ``None`` when nothing matched within the limit or an error
        occurred (the error is printed).
    """
    print(f"Searching in {os.path.basename(file_path)}...")

    want_block = target_block is not None
    want_tx = target_tx is not None

    try:
        # `examined` is the 1-based count of items inspected so far.
        for examined, item in enumerate(chunked_json_reader(file_path), start=1):
            if want_block and 'number' in item and item['number'] == target_block:
                return {'type': 'block', 'data': item}

            if want_tx and 'hash' in item and item['hash'] == target_tx:
                return {'type': 'transaction', 'data': item}

            if examined >= max_search_items:
                break
    except Exception as e:
        print(f"Error searching {file_path}: {str(e)}")
        return None

    return None


if __name__ == "__main__":
    # Block/transaction dumps to process, in the order they will be read.
    # Note that the last two entries carry no ".json" extension.
    files = [
        "C:/Users/haile/Downloads/170-171/blockTransactions17011001-17012000.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17015001-17020000.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17010001-17011000.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17012001-17015000.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17030001-17050000-004.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17020001-17030000-006.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17090001-17100000-005.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17000000-17010000-001.json",
        "C:/Users/haile/Downloads/170-171/new_blockTransactions17175001-17200000-002.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17150001-17175000-005.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17100000-17125000-004.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17125001-17150000-003.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17175001-17200000-001.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17275001-17300000-004.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17225001-17250000-003.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17200000-17225000-001.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17350001-17400000-002.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17300001-17350000-001.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17250001-17275000-002.json",
        "C:/Users/haile/Downloads/170-171/blockTransactions17400001-17450000",
        "C:/Users/haile/Downloads/170-171/blockTransactions17450001-17500000"
    ]

    # Full pass: sample_only=False reads each file completely and saves the
    # per-file block CSVs.
    summary = process_all_files(files, sample_only=False)

    # Persist the per-file summary table next to the other outputs.
    summary_csv = "blockchain_files_complete_summary.csv"
    summary.to_csv(summary_csv, index=False)
    print(f"Saved complete summary to {summary_csv}")



Processing file blockTransactions17011001-17012000.json...
File size: 360.67 MB
Processing blockTransactions17011001-17012000.json...
Processed 1000 items: 7 blocks and 965 transactions
Processed 2000 items: 13 blocks and 1947 transactions
Processed 3000 items: 19 blocks and 2907 transactions
Processed 4000 items: 26 blocks and 3849 transactions
Processed 5000 items: 32 blocks and 4800 transactions
Processed 6000 items: 40 blocks and 5752 transactions
Processed 7000 items: 46 blocks and 6675 transactions
Processed 8000 items: 52 blocks and 7594 transactions
Processed 9000 items: 59 blocks and 8543 transactions
Processed 10000 items: 65 blocks and 9513 transactions
Processed 11000 items: 70 blocks and 10450 transactions
Processed 12000 items: 75 blocks and 11429 transactions
Processed 13000 items: 81 blocks and 12382 transactions
Processed 14000 items: 88 blocks and 13339 transactions
Processed 15000 items: 94 blocks and 14251 transactions
Processed 16000 items: 100 blocks and 15220 tr

In [57]:
import pandas as pd
import os
import glob

def combine_block_data_files(folder_paths, output_file='combined_blockchain_blocks.csv', chunk_size=50):
    """Concatenate the per-range block CSVs found in several folders.

    Every ``*blocks*.csv`` in each folder is collected (in sorted order per
    folder), except files whose *name* contains ``processing_progress`` or
    ``summary`` and except ``output_file`` itself.  The files are read and
    appended to ``output_file`` ``chunk_size`` at a time so that only one
    chunk is held in memory.  Each row gains a ``source_file`` column with
    the name of the CSV it came from.

    Args:
        folder_paths: Iterable of directories to scan.
        output_file: Path of the combined CSV to (over)write.
        chunk_size: Number of input files concatenated per write; must be
            at least 1.

    Returns:
        The first 1000 rows of the combined file as a ``pandas.DataFrame``,
        or an empty ``DataFrame`` when no input could be read (in which case
        ``output_file`` is not written).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    output_abs = os.path.abspath(output_file)
    all_files = []

    for folder in folder_paths:
        # sorted() makes the row order of the result reproducible.
        for path in sorted(glob.glob(os.path.join(folder, '*blocks*.csv'))):
            name = os.path.basename(path)
            # Filter on the file name only, so a folder called e.g.
            # ".../summary/..." does not exclude everything inside it.
            if 'processing_progress' in name or 'summary' in name:
                continue
            # A previous combined output matches the glob too; never feed
            # the output back into itself.
            if os.path.abspath(path) == output_abs:
                continue
            all_files.append(path)

    print(f"Found {len(all_files)} block data files")

    columns = None  # column order of the first chunk written; None until then

    for start in range(0, len(all_files), chunk_size):
        chunk_files = all_files[start:start + chunk_size]
        print(f"Processing chunk {start // chunk_size + 1} with {len(chunk_files)} files")

        dfs = []
        for file in chunk_files:
            try:
                df = pd.read_csv(file)
            except Exception as e:
                # Best effort: an unreadable file is reported and skipped.
                print(f"Error reading {file}: {str(e)}")
                continue
            df['source_file'] = os.path.basename(file)
            dfs.append(df)

        if not dfs:
            continue

        chunk_df = pd.concat(dfs, ignore_index=True)

        if columns is None:
            # First successful write creates the file and its header, whichever
            # chunk that happens to be.
            columns = list(chunk_df.columns)
            chunk_df.to_csv(output_file, index=False)
        else:
            # Appended rows carry no header, so they must follow the header's
            # column order exactly (columns it does not list are dropped).
            chunk_df.reindex(columns=columns).to_csv(
                output_file, mode='a', header=False, index=False
            )

        print(f"Processed {len(dfs)} files in this chunk")

    if columns is None:
        print("No block data could be read; nothing was written")
        return pd.DataFrame()

    print(f"Combined block data saved to {output_file}")

    return pd.read_csv(output_file, nrows=1000)

if __name__ == "__main__":
    # Folders holding the per-file block CSVs that should be merged.
    folders = [
        "C:/Users/haile/Downloads/Eth datasets csv/1st",
        "C:/Users/haile/Downloads/Eth datasets csv/2nd",
        "C:/Users/haile/Downloads/Eth datasets csv/3rd"
    ]

    # Merge everything into the default output file; the call hands back a
    # preview of the combined data.
    sample_data = combine_block_data_files(folders)

    print("\nSample of combined data:")
    preview = sample_data.head()
    print(preview)

    # Report the size of the combined CSV in mebibytes.
    output_size_bytes = os.path.getsize('combined_blockchain_blocks.csv')
    output_size_mb = output_size_bytes / (1024 * 1024)
    print(f"Combined file size: {output_size_mb:.2f} MB")

Found 38 block data files
Processing chunk 1 with 38 files
Processed 38 files in this chunk
Combined block data saved to combined_blockchain_blocks.csv

Sample of combined data:
   block_number   timestamp  gas_used  gas_limit  base_fee_per_gas  transaction_count                                        source_file
0      17000000  1680911891   9160778   30000000       20582738913                  0  blockTransactions17000000-17010000-001_blocks_...
1      17000001  1680911903   9389175   30000000       19581179064                  0  blockTransactions17000000-17010000-001_blocks_...
2      17000002  1680911915  29993802   30000000       18665624323                  0  blockTransactions17000000-17010000-001_blocks_...
3      17000003  1680911927  11343154   30000000       20997863283                  0  blockTransactions17000000-17010000-001_blocks_...
4      17000004  1680911939   7688030   30000000       20357980347                  0  blockTransactions17000000-17010000-001_blocks_...
