#### The following notebook walks you thru the steps of Data Cleaning, Data Engneering and EDA.

___

In [1]:
# priting over 1 line of code within the same cell

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [1]:
# importing required libraries
import pandas as pd
import numpy as np
from collections import defaultdict

# Visualization
import missingno
import matplotlib.pyplot as plt
import seaborn as sns

# extra
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'missingno'

In [74]:
# reading df

df = pd.read_csv('../files/blocks_0.csv')

In [None]:
# checking if there are missing values

missingno.matrix(df, figsize = (30,5)) # no missing values

<br>

#### Let's do some data exploration and data cleaning.

___

In [6]:
# visualizing the df

df.head()

Unnamed: 0.1,Unnamed: 0,size,version,block_number,block_timestamp,input_count,output_count,is_coinbase,fee,inputs,outputs
0,0,816,1,679250,2021-04-14 23:51:48+00:00,5,2,False,62493.0,"[{'index': 0, 'spent_transaction_hash': 'b4788...","[{'index': 0, 'script_asm': 'OP_DUP OP_HASH160..."
1,1,224,1,679250,2021-04-14 23:51:48+00:00,1,2,False,13560.0,"[{'index': 0, 'spent_transaction_hash': 'ff2a7...","[{'index': 0, 'script_asm': 'OP_HASH160 ad2c3a..."
2,2,224,1,679250,2021-04-14 23:51:48+00:00,1,2,False,17920.0,"[{'index': 0, 'spent_transaction_hash': '6d452...","[{'index': 0, 'script_asm': 'OP_HASH160 d805ef..."
3,3,368,2,679250,2021-04-14 23:51:48+00:00,1,2,False,14098.0,"[{'index': 0, 'spent_transaction_hash': '8a56b...","[{'index': 0, 'script_asm': '0 a88573af66089b1..."
4,4,226,1,679250,2021-04-14 23:51:48+00:00,1,2,False,17479.0,"[{'index': 0, 'spent_transaction_hash': '45a2e...","[{'index': 0, 'script_asm': 'OP_DUP OP_HASH160..."


In [7]:
df.shape
df.info()

(361867, 11)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361867 entries, 0 to 361866
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       361867 non-null  int64  
 1   size             361867 non-null  int64  
 2   version          361867 non-null  int64  
 3   block_number     361867 non-null  int64  
 4   block_timestamp  361867 non-null  object 
 5   input_count      361867 non-null  int64  
 6   output_count     361867 non-null  int64  
 7   is_coinbase      361867 non-null  bool   
 8   fee              361867 non-null  float64
 9   inputs           361867 non-null  object 
 10  outputs          361867 non-null  object 
dtypes: bool(1), float64(1), int64(6), object(3)
memory usage: 28.0+ MB


### Columns Description:

- |__Unnamed: 0__| -> Redundant column of the index. _'int64'_
- |__Hash__| -> The hash of this transaction. _'int64'_
- |__Size__| -> The size of block data in bytes. _'int64'_
- |__Virtual size__| -> The virtual transaction size (differs from size for witness transactions). _'int64'_
- |__Version__| -> Protocol version specified in block which contained this transaction. _'object'_
- |__Lock time__| -> Earliest time that miners can include the transaction in their hashing of the Merkle root to attach it in the latest. _'object'_
- |__Block hash__| -> Hash of the block which contains this transaction. _'int64'_
- |__Block number__| -> Number of the block which contains this transaction. _'int64'_
- |__block timestamp__| -> Timestamp of the block which contains this transaction. _'object'_
- |__Block timestamp month__| -> Month of the block which contains this transaction. _'int64'_
- |__Input count__| -> The number of inputs in the transaction. _'int64'_
- |__Output count__| -> The number of outputs in the transaction. _'int64'_
- |__Input value__| -> Total value of inputs in the transaction in satoshis. _'int64'_
- |__Output value__| -> Total value of outputs in the transaction in satoshis. _'int64'_
- |__Is coinbase__| -> Max amount of sets that can be played in a match. _'bool'_
- |__Fee__| -> The fee paid by this transaction in sats. _'float64'_
- |__Inputs__| -> Transaction inputs. _'object'_
- |__Outputs__| -> Transaction outputs. _'object'_

<br>

#### Checking all columns and changing the type in case it's needed

In [8]:
# index column and Unnamed

df.index
df['Unnamed: 0']

RangeIndex(start=0, stop=361867, step=1)

0              0
1              1
2              2
3              3
4              4
           ...  
361862    361862
361863    361863
361864    361864
361865    361865
361866    361866
Name: Unnamed: 0, Length: 361867, dtype: int64

In [9]:
# dropping redundant columns

df.drop(columns= 'Unnamed: 0', inplace= True)

<br>

In [10]:
df['version'].dtype

dtype('int64')

In [11]:
# most nodes are using the version 1

df['version'].value_counts()

1    231187
2    130680
Name: version, dtype: int64

<br>

In [12]:
# blocks are for Apr 14th

df['block_number'].min()
df['block_number'].max()

679250

679422

In [None]:
df['block_number'].value_counts()

In [None]:
# amount of tx per block

sns.displot(x= 'block_number', data= df).set_xticklabels(rotation=30);

<br>

In [None]:
# converting to proper type

df['block_timestamp']= pd.to_datetime(df.block_timestamp).dt.tz_localize(None)

In [None]:
df['block_timestamp'].min()
df['block_timestamp'].max()

In [None]:
df['block_number'].value_counts()

In [None]:
sns.lineplot(x= 'block_timestamp', y= 'block_number', data= df)

<br>

<br>

In [None]:
# big majority of tx have 1 input

len(df['input_count'].value_counts())

In [None]:
len(df[df['input_count']== 1])

In [None]:
small_inp= df[df['input_count'] < 5]

In [None]:
len(small_inp)

In [None]:
small_inp['input_count'].value_counts(normalize= True)

<br>

<br>

In [None]:
# same happens with output... unless the UTXO is spent completely most outputs are 2 cuz of the change address
# prolly very big numbers of inputs/outputs come from mixers

df['output_count'].value_counts()

In [83]:
df = df[:100000]

In [206]:
def clean_trash(x, index):
    y = x[index]
    
    if y == "[]":
        return x
    chars_to_replace = "[{'}]\n( "

    words_to_replace = ["array", "dtype=object", "Decimal", ")index"]

    
    for char in chars_to_replace:
        y = y.replace(char, "")

    for word in words_to_replace:
        y = y.replace(word, "")
        
    y = y.replace(")", "")

    inputs_dict = defaultdict(list)
    cursed_text = y.split(":")
    inputs_dict[cursed_text[0]].append(cursed_text[1][0])
    for i, thing in enumerate(cursed_text):
        if i == 0:
            continue
        thing = thing.split(",")
        
        if i == 1:
            last_thing = thing[1]
            #inputs_dict[last_thing] = []
        else:
            try:
                thing[0] = int(thing[0])
            except ValueError:
                pass
            
            inputs_dict[last_thing].append(thing[0])
            try:
                last_thing = thing[1]
            except IndexError as error:
                inputs_dict["values"].append(thing[0])

    inputs_dict["index"] = list(range(len(inputs_dict["required_signatures"])))
    inputs_dict["values"] = [float(value.replace("E", "e").split(".")[0]) for value in inputs_dict["values"]]
    inputs_dict["hash"] = [x[1]] * len(inputs_dict["index"])
    [inputs_dict.pop(key) for key in list(inputs_dict.keys()) if key not in ["hash", "sequence", "spent_output_index", "spent_transaction_hash", "type", "values", "addresses"]]

    return inputs_dict

In [207]:
outputs = df.apply(clean_trash, axis=1, index=-1) # parse "outputs" columns
inputs = df.apply(clean_trash, axis=1, index=-2) # parse "inputs" columns

In [213]:
inputs

Unnamed: 0,spent_transaction_hash,spent_output_index,sequence,type,addresses,values,hash
0,b836bc9e4e2eb34520dded6a97db44b1a8edd86c017b6d...,63,4294967295,pubkeyhash,1GVU5B4LeQ33Q3VrnLhmDAfUP8EP3f9mbJ,117340.0,114e784709ebffef55d7d854d74108854c30e8d033f61f...
1,4c127c772bf484f2f9203eeac7e0e4b9bb029af6a35bb9...,60,4294967295,pubkeyhash,1CVbrdg7ZmneNXuHeFk6imRXxEba35Z31,172954.0,114e784709ebffef55d7d854d74108854c30e8d033f61f...
2,4b4f0a9e039197b218a5ef967f1e9b46ab6862291334aa...,73,4294967295,pubkeyhash,1PvpJpJRki3zMFH52dqRrfwchtovE9SGXT,133427.0,114e784709ebffef55d7d854d74108854c30e8d033f61f...
3,cac79e959c266d24f5c3f26696a4daae5a9e792a7eb930...,88,4294967295,pubkeyhash,1LV8a54q6WDVxJSqt1S1KGFJPktJMJsEva,58374.0,114e784709ebffef55d7d854d74108854c30e8d033f61f...
4,bc08c4713fd0646d356fcac988e372bc24ec0dd81056b5...,0,4294967295,pubkeyhash,1AfTy17uYsMT7iuEWiXfipm3HtRxA94jL9,166681.0,114e784709ebffef55d7d854d74108854c30e8d033f61f...
...,...,...,...,...,...,...,...
0,8bc9cd25781ab02a79c7a1acc4c4dd9270b9109b0fc961...,4,4294967295,witness_v0_keyhash,bc1qdgrltucvjezsa7vfpp939ag535nwvmvk9a8xh3,14621700.0,ec92092f644fcfae6b5558dab732bf2248549c3d050bc5...
1,a247415306b03ff166a5a99852f75a7538ac81b2a2182e...,5,4294967295,pubkeyhash,1LEt2oo7DwSvouFJ16y9skgUj8vYmeRe7K,2960000.0,ec92092f644fcfae6b5558dab732bf2248549c3d050bc5...
0,b5aecc97faed338d4d53a7ad71ab80e4a702eb3f07725e...,1,4294967295,pubkeyhash,1FubPwUDuCSTHQi1767x1ukxnxqwXVnxju,24892436.0,891b2cbd155f4fc05843efe3111ac75eb607d5437d5673...
0,8fe4c7600619a6bb84a3b2fe40853ca1bf7626bd7833f6...,0,4294967295,pubkeyhash,1HUMRaengEURyVbZZvjRFPHDFzVEFVgcU7,12838719.0,387915f77f53295ddc4375a87e912479e9bad6b75f6783...


In [209]:
import time
df_outputs = []
df_inputs = []

start = time.time()

for i in range(outputs.shape[0]):
    df_outputs.append(pd.DataFrame.from_dict(outputs.iloc[i]))

print(f"Outputs: {time.time() - start}")

start = time.time()

for i in range(inputs.shape[0]):
    
    if len(inputs.iloc[i].keys()) == 7: # remove entries without inputs (rewards for mining a block)
        df_inputs.append(pd.DataFrame.from_dict(inputs.iloc[i]))

print(f"Inputs: {time.time() - start}")


Inputs: 68.76720547676086


In [210]:
start = time.time()
outputs = pd.concat(df_outputs)
print(f"Outputs: {time.time() - start}")
start = time.time()
inputs = pd.concat(df_inputs)
print(f"Inputs: {time.time() - start}")

Inputs: 90.68980288505554


In [212]:
del(df_inputs)
del(df_outputs)

In [253]:
import numpy as np

def build_mask(df, values, key):
    masks = [df[key] == v for v in values]
    mask = np.zeros(masks[0].shape[0])
    for m in masks:
        mask = mask | m
    
    return mask

addresses = ["1GVU5B4LeQ33Q3VrnLhmDAfUP8EP3f9mbJ", "bc1qdgrltucvjezsa7vfpp939ag535nwvmvk9a8xh3"]

while True:

    input_mask = build_mask(inputs, addresses, "addresses")

    hash_values = inputs[input_mask]["hash"].values.tolist()
    output_mask = build_mask(outputs, hash_values, "hash")


    addresses = outputs[output_mask]["addresses"].values.tolist()
    print(outputs[output_mask])

         type                           addresses      values  \
0  scripthash  32Poq8NJmBKDt7eMGKDeTPBc3iw7jGZ3sf   1964880.0   
1  scripthash  3HL1QmiZaAT85uRXuNGq2y7rYR4thhBubc     50914.0   
0  pubkeyhash  1Q8FRidf61icWzBgivWVXHEyvAmv1ySmuH  17578865.0   

                                                hash  
0  114e784709ebffef55d7d854d74108854c30e8d033f61f...  
1  114e784709ebffef55d7d854d74108854c30e8d033f61f...  
0  ec92092f644fcfae6b5558dab732bf2248549c3d050bc5...  


IndexError: list index out of range