## Imports

In [261]:
# Make the good imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Loading our DataFrame

In [262]:
# Load the per-actor blockchain time series; the first CSV column is used as the row index.
# No date parsing is requested here: `parse_dates=True` would only try to parse the
# (integer) index, not the `date` column. The `date` column is converted explicitly,
# with a fixed format, in the cleaning section below.
df = pd.read_csv('timeseries/blockchain_by_actor.csv', index_col=0)

# Preview the last rows as a quick sanity check of the load
df.tail()



Unnamed: 0,year,month,day,identity,received,nb_received,date,sum_fee,mean_fee_for100,nb_transactions,sent,self_spent,self_spent_estimated,nb_spent,spent
90398,0.0,0.0,0.0,BTCJam.com,0.0,0.0,2017-06-17,145053.0,8.643929,1.0,1387984.0,0.0,0.0,2.0,1387984.0
90399,0.0,0.0,0.0,7277,0.0,0.0,2017-06-18,26600900.0,49.327293,266.0,725280.0,0.0,0.0,532.0,725280.0
90400,0.0,0.0,0.0,70063369,0.0,0.0,2017-06-22,284134.0,0.301961,1.0,93527891.0,0.0,0.0,1.0,93527891.0
90401,0.0,0.0,0.0,Loanbase.com,0.0,0.0,2017-06-29,131862.0,1.415791,1.0,9049938.0,0.0,0.0,2.0,9049938.0
90402,0.0,0.0,0.0,7277,0.0,0.0,2017-06-29,39601350.0,49.327299,396.0,1079730.0,0.0,0.0,792.0,1079730.0


In the rows shown above, the year, month and day columns contain only zeros, and several columns of the raw DataFrame have unsuitable data types (for example, counts stored as floats and dates stored as strings).
Let's clean the data before analysing it.

## Cleaning our data

In [263]:
# Inspect the dtype of every column to see which ones need converting
df.dtypes

year                    float64
month                   float64
day                     float64
identity                 object
received                float64
nb_received             float64
date                     object
sum_fee                 float64
mean_fee_for100         float64
nb_transactions         float64
sent                    float64
self_spent              float64
self_spent_estimated    float64
nb_spent                float64
spent                   float64
dtype: object

In [264]:
# Parse the date strings (year-month-day) into real datetimes
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# Identities mix names and numeric ids, so store them uniformly as strings
df['identity'] = df['identity'].astype(str)

# Calendar parts and counters are whole numbers: cast them all to int
# (year, month and day are recomputed from `date` in the next cell)
for int_col in ('year', 'month', 'day', 'nb_transactions', 'nb_received', 'nb_spent'):
    df[int_col] = df[int_col].astype(int)



In [265]:
# Rebuild the calendar columns from the parsed date via the datetime accessor
date_accessor = df['date'].dt
for part in ('year', 'month', 'day'):
    df[part] = getattr(date_accessor, part)

In [266]:
# Persist the cleaned frame (index included) next to the raw file
df.to_csv(path_or_buf='timeseries/blockchain_by_actor_cleaned.csv')

# Analysis

In [267]:
# Total amount spent per identity, keeping the 100 largest totals
biggest_spenders = (
    df.groupby('identity')['spent']
    .sum()
    .sort_values(ascending=False)
    .head(100)
)

# Bar chart with one bar per identity; bar height is the total spent
fig = px.bar(
    biggest_spenders,
    x=biggest_spenders.index,
    y=biggest_spenders.values,
    title='Biggest spenders in the bitcoin blockchain',
    labels={'x': 'Identity', 'y': 'Spent'},
)
fig.show()

In [268]:
# Total amount received per identity, keeping the 100 largest totals
biggest_receivers = (
    df.groupby('identity')['received']
    .sum()
    .sort_values(ascending=False)
    .head(100)
)

# Bar chart with one bar per identity; bar height is the total received
fig = px.bar(
    biggest_receivers,
    x=biggest_receivers.index,
    y=biggest_receivers.values,
    title='Biggest receivers in the bitcoin blockchain',
    labels={'x': 'Identity', 'y': 'Received'},
)
fig.show()

In [269]:
# Two side-by-side pie charts: share of total spent (left) and of total received (right)
# among the biggest spenders / receivers computed in the cells above.
# `go` and `make_subplots` come from the notebook's import cell at the top.

# Pie traces need 'domain'-type subplot cells rather than the default xy axes
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])

fig.add_trace(
    go.Pie(labels=biggest_spenders.index, values=biggest_spenders.values, name="Biggest spenders"),
    1, 1,
)
fig.add_trace(
    go.Pie(labels=biggest_receivers.index, values=biggest_receivers.values, name="Biggest receivers"),
    1, 2,
)

# Keep the slice labels inside the pies
fig.update_traces(textposition='inside')
fig.update_layout(title_text="Biggest spenders and receivers in the bitcoin blockchain")
fig.show()


In [270]:
# Names of the three identities with the largest total spent
best_spent = (
    df.groupby('identity')['spent']
    .sum()
    .sort_values(ascending=False)
    .head(3)
    .index
    .tolist()
)

# Spent per date for those identities (one column each), smoothed with a 30-row rolling mean
top_spenders_rows = df[df['identity'].isin(best_spent)]
top_spenders_by_date = top_spenders_rows.groupby(['date', 'identity'])['spent'].sum().unstack()
fig = px.line(
    top_spenders_by_date.rolling(30).mean(),
    title='Evolution of the biggest spenders',
    labels={'value': 'spent'},
    height=600,
    width=1000,
)
fig.show()

# Etude des frais de transaction payés: Qui paye le mieux en moyenne ?

Regardons comment les frais de transaction moyens par transaction (mean_fee_for100) varient entre les acteurs.

In [271]:
# Best payers in terms of average fee per transaction:
# the three identities with the highest mean of the mean_fee_for100 column
(
    df.groupby('identity')['mean_fee_for100']
    .mean()
    .sort_values(ascending=False)
    .head(3)
)


identity
7277           15.871148
999Dice.com     5.511470
419             5.106580
Name: mean_fee_for100, dtype: float64

In [272]:
# Full ranking of identities by their mean of mean_fee_for100 over the whole dataset.
# There is a significant gap between the top 3 and the others.
best_payers = (
    df.groupby('identity')['mean_fee_for100']
    .mean()
    .sort_values(ascending=False)
)
best_payers

identity
7277             15.871148
999Dice.com       5.511470
419               5.106580
0                 4.907442
18972             4.186133
                   ...    
Paymium.com       0.011627
5162              0.009440
SlushPool.com     0.007377
69697250          0.005614
BTCC.com          0.002298
Name: mean_fee_for100, Length: 100, dtype: float64

In [273]:
# Bar chart of the (up to) 100 best payers, ranked by mean fee
fig = px.bar(
    best_payers.head(100),
    title='Best payers in terms of mean fees per transaction',
    labels={'value': 'mean fees/transaction(satoshi)'},
    height=600,
    width=1000,
)
fig.show()

## Quelle est la relation entre les frais de transaction totaux payés (sum_fee) et le nombre de transactions effectuées (nb_transactions) pour chaque acteur ?
Analysons la corrélation entre ces deux variables pour voir si les acteurs qui effectuent plus de transactions paient également plus de frais.

In [274]:
# Row-level correlation matrix between total fees paid and number of transactions
df.loc[:, ['sum_fee', 'nb_transactions']].corr()

Unnamed: 0,sum_fee,nb_transactions
sum_fee,1.0,0.409517
nb_transactions,0.409517,1.0


Une corrélation de 0,4 signifie qu'il y a une relation positive modérée entre deux variables. Lorsque la valeur d'une variable augmente/diminue, la valeur de l'autre variable a tendance à augmenter/diminuer également, mais pas de manière systématique.

On ne peut donc pas affirmer que les acteurs qui effectuent le plus grand nombre de transactions paient systématiquement le plus de frais.

## Y a-t-il une relation entre la proportion des transactions auto-émises et les frais de transaction payés par les acteurs ?
Vérifions si les acteurs qui auto-émettent une plus grande proportion de transactions ont tendance à payer des frais de transaction différents de ceux qui envoient principalement des fonds à des tiers.

In [275]:
# Ratio of self_spent to sent, expressed in percent.
# Rows where `sent` is 0 are masked to NaN first, so the division yields NaN instead of
# +/-inf; NaN rows are skipped by the per-identity mean, whereas a single inf would
# turn an identity's whole mean (and any later correlation) into inf/NaN.
sent_nonzero = df['sent'].where(df['sent'] != 0)
df['self_spent_proportion'] = df['self_spent'] / sent_nonzero * 100

# Display the top 5 identities by mean self-spent proportion
(
    df.groupby('identity')['self_spent_proportion']
    .mean()
    .sort_values(ascending=False)
    .head(5)
)



identity
Paymium.com               81.309526
ePay.info_CoinJoinMess    73.112700
Bitbond.com               68.926661
SimpleCoin.cz             68.501003
BitZillions.com           63.335812
Name: self_spent_proportion, dtype: float64

In [276]:
# Mean self-spent proportion per identity, highest first
df_self_spent_prop_sorted = (
    df.groupby('identity')['self_spent_proportion']
    .mean()
    .sort_values(ascending=False)
)

In [277]:
# Bar chart of the identities with the highest mean self-spent proportion.
# The number of bars and the number quoted in the title come from the same constant,
# so the title cannot drift from what is actually plotted (the title previously said
# "Top 50" while 100 identities were drawn).
n_top_self_spent = 50

fig = px.bar(
    df_self_spent_prop_sorted.head(n_top_self_spent),
    title=f'Top {n_top_self_spent} identities in terms of self spent proportion',
    labels={'index': 'identity', 'value': 'self spent proportion (in %)'},
    height=600,
)
fig.show()



We can observe the nature of the actors in terms of spending. As we can see here, for some actors the proportion of self_spending is very high.

It is difficult to infer the intentions of such actors. Some might operate exchange platforms, which perform internal transactions for various reasons.

Others might use mixing techniques to hide their transactions or to conceal the origin of their funds.

In [278]:
# Per-identity means of the two quantities being compared
by_identity = df.groupby('identity')
df_mean_fee_per_transaction = by_identity['mean_fee_for100'].mean()
df_self_spent_prop_raw = by_identity['self_spent_proportion'].mean()

# Correlation between mean fee and mean self-spent proportion across identities
df_mean_fee_per_transaction.corr(df_self_spent_prop_raw)



-0.049798379838131165

Il n'y a donc pas de relation linéaire claire entre ces deux variables. En d'autres termes, le fait qu'un acteur émette une proportion plus élevée ou plus faible d'auto transactions n'a pas d'impact direct et prévisible sur les frais moyens qu'il paie par transaction.