In [152]:
import pandas as pd

In [153]:
inputs = pd.read_csv('./data/inputs.csv', header=None, names=['id','tx_id','sig_id','output_id'])
transactions = pd.read_csv('./data/transactions.csv', header=None, names=['tx_id','block_id'])
outputs = pd.read_csv('./data/outputs.csv', header=None, names=['id','tx_id','pk_id','value'])

In [154]:
invalid_transactions = set()

Look for the double spending transactions:

In [155]:
double_spending_transactions = inputs[(inputs['output_id'].duplicated(keep=False)) & (inputs['output_id']!=-1)]
double_spending_transactions

Unnamed: 0,id,tx_id,sig_id,output_id
8665,8666,8231,7941,7998
12819,12820,12152,7941,7998
33112,33113,30446,21807,21928
33113,33114,30446,21807,21928
76746,76747,61843,138980,65403
76749,76750,61845,138980,65403
275613,275614,204751,163625,249860
279608,279609,207365,163625,249860


In [156]:
for _,transaction in double_spending_transactions.iterrows():
    invalid_transactions.add(transaction['tx_id'])

Now we have to check that the sum(outputs) >= sum(inputs)

First we get a view in which we have each transaction associated with its input amount

In [157]:
inputs_with_value = pd.merge(inputs, outputs, left_on='output_id', right_on='id')
inputs_with_value.drop(columns=['output_id','pk_id','sig_id','id_x','id_y','tx_id_y'], inplace=True)
inputs_with_value.rename(columns={'tx_id_x':'transaction_A'}, inplace=True)

This dataframes show the input amount of each transaction

In [158]:
inputs_with_value = inputs_with_value.groupby('transaction_A',as_index=False).sum()
inputs_with_value = inputs_with_value.rename(columns={'value':'input_value'})
inputs_with_value

Unnamed: 0,transaction_A,input_value
0,172,5000000000
1,184,4000000000
2,186,3000000000
3,188,2900000000
4,193,100000000
...,...,...
116602,216618,13836000000
116603,216622,5000000000
116604,216623,300000000
116605,216624,1000000


let's do the same for the outputs:

In [159]:
transactions_balance = pd.merge(inputs_with_value, outputs, left_on='transaction_A', right_on='tx_id')
transactions_balance = transactions_balance.drop(columns=['pk_id','transaction_A','id',])
transactions_balance = transactions_balance.rename(columns={'value':'output_value'})
transactions_balance = transactions_balance.groupby(['tx_id','input_value'], as_index=False)['output_value'].sum()

In [160]:
pd.merge(inputs_with_value, transactions, left_on='transaction_A', right_on='tx_id')

Unnamed: 0,transaction_A,input_value,tx_id,block_id
0,172,5000000000,172,170
1,184,4000000000,184,181
2,186,3000000000,186,182
3,188,2900000000,188,183
4,193,100000000,193,187
...,...,...,...,...
116602,216618,13836000000,216618,100013
116603,216622,5000000000,216622,100016
116604,216623,300000000,216623,100016
116605,216624,1000000,216624,100016


The last thing to do is to check if the output is bigger than the input

In [161]:
transactions_balance = transactions_balance[transactions_balance['input_value'] < transactions_balance['output_value']]
transactions_balance

Unnamed: 0,tx_id,input_value,output_value
28023,100929,5000000000,5000000010


In [162]:
for _,transaction in transactions_balance.iterrows():
    invalid_transactions.add(transaction['tx_id'])

Now we are going to check if there's some transaction where the output value is not >= 0:

In [163]:
negative_output = outputs[outputs['value'] < 0]
negative_output
for _,transaction in negative_output.iterrows():
    invalid_transactions.add(transaction['tx_id'])

Now we are going to see if for every input transaction there exist at least one output transaction

In [164]:
inputs_tx_id  = inputs.groupby('tx_id', as_index=False).sum()['tx_id']
outputs_tx_id = outputs.groupby('tx_id', as_index=False).sum()['tx_id']

pd.concat([inputs_tx_id,outputs_tx_id]).drop_duplicates(keep=False)

Series([], Name: tx_id, dtype: int64)

This result shows that each input transaction has at least one output transaction.

Removing Invalid value for the outputs

Checking if the sig_id of the inputs transaction correspond to the pk_id of the transaction where the output belongs

In [165]:
signature_check =  pd.merge(inputs, outputs, left_on='output_id', right_on='id')
signature_check = signature_check.drop(columns=['id_x','tx_id_y','output_id'])
signature_check[signature_check['sig_id'] != signature_check['pk_id']]

invalid_transactions.add(138278)
invalid_transactions

{8231, 12152, 30446, 61843, 61845, 100929, 105281, 138278, 204751, 207365}

There're 2 nonvalid signatures. Since sig_id=-1 means that a non-standard script has been used, the only one to be removed is the tx_id=138278

Now we are going to remove from the dataframe "transactions" all the transactions which don't have a counterpart in inputs/ouputs.
Since we've showed that for every input there's a corresponding output we are going to perform the check only on one dataframe.

In [166]:
inputs_tx_id  = inputs.groupby('tx_id', as_index=False).sum()['tx_id']
transactions_tx_id = transactions.groupby('tx_id', as_index=False).sum()['tx_id']
inputs_tx_id.shape[0]
invalid_transactions

{8231, 12152, 30446, 61843, 61845, 100929, 105281, 138278, 204751, 207365}

In [167]:
cb_transactions = inputs[inputs['output_id']==-1]
cb_transaction_output = pd.merge(cb_transactions,outputs, left_on='tx_id', right_on='tx_id').drop(columns=['id_x','sig_id','output_id','id_y','pk_id'])
invalid_transactions.update(cb_transaction_output[cb_transaction_output['value'] != 5000000000]['tx_id'].values)

In [168]:
new_invalid_transactions = pd.Series(list(invalid_transactions)).rename('tx_id')
previous_len = len(invalid_transactions)
print("Actual invalid transactions: {}".format(len(invalid_transactions)))
flag = False
while not flag:
    invalid_output_ids = pd.merge(new_invalid_transactions, outputs,left_on='tx_id', right_on='id')['id']
    new_invalid_transactions = pd.merge(inputs, invalid_output_ids, left_on='output_id', right_on='id')['tx_id']
    previous_len = len(invalid_transactions)
    invalid_transactions.update(new_invalid_transactions)
    flag = previous_len == len(invalid_transactions)
    print("Actual invalid transactions: {}".format(len(invalid_transactions)))
len(invalid_transactions)

Actual invalid transactions: 998
Actual invalid transactions: 1793
Actual invalid transactions: 2326
Actual invalid transactions: 2678
Actual invalid transactions: 2897
Actual invalid transactions: 3041
Actual invalid transactions: 3121
Actual invalid transactions: 3164
Actual invalid transactions: 3190
Actual invalid transactions: 3206
Actual invalid transactions: 3215
Actual invalid transactions: 3222
Actual invalid transactions: 3226
Actual invalid transactions: 3230
Actual invalid transactions: 3234
Actual invalid transactions: 3238
Actual invalid transactions: 3239
Actual invalid transactions: 3240
Actual invalid transactions: 3241
Actual invalid transactions: 3242
Actual invalid transactions: 3243
Actual invalid transactions: 3243


3243

In [169]:
print("Before: a= {}, b= {}, c={}".format(inputs.shape[0],transactions.shape[0],outputs.shape[0]))
print("Number of invalid transaction : {}".format(len(invalid_transactions)))
for value in invalid_transactions:
    inputs.drop(inputs[inputs['tx_id'] == value].index, inplace=True)
    transactions.drop(transactions[transactions['tx_id'] == value].index, inplace=True)
    outputs.drop(outputs[outputs['tx_id'] == value].index, inplace=True)
print("After: a= {}, b= {}, c={}".format(inputs.shape[0],transactions.shape[0],outputs.shape[0]))

Before: a= 292427, b= 216626, c=264310
Number of invalid transaction : 3243
After: a= 265944, b= 213383, c=259842


In [170]:
inputs.to_csv("./data/updated_inputs.csv")
transactions.to_csv("./data/updated_transactions.csv")
outputs.to_csv("././data/updated_outputs.csv")