# Introduction

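In this report, we analyze a Bitcoin address dataset in which every record is labeled either `white` or with the name of a ransomware family (for example, Locky or Cerber).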

## Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Import Dataset

In [24]:
# Read the training CSV into the DataFrame used throughout the notebook.
bitcoin = pd.read_csv(filepath_or_buffer='datasets/bitcoin_train.csv')

In [4]:
# Preview the first five rows of the freshly loaded frame.
bitcoin.head(n=5)

Unnamed: 0.1,Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,0,1BpvJgUs7UprQu9z8fLsP7pFvFcCscHRCV,2011,287,2,0.25,1,0,2,300950000.0,white
1,1,1EnSeTPjMxZm9X9iQDYmMUDoLQQ3ouDN6F,2015,77,0,1.0,1,0,1,48200000.0,white
2,2,1mwkhYHeoqGBkVW84yFpYCSqRDt5TWSBQ,2011,164,52,0.000977,23,0,2,23495820000.0,white
3,3,19XUCsxgpHZGXKLgVMpdoyZqcFdeM3pGeE,2014,86,144,1e-06,1555,1152,2,95812740.0,white
4,4,14Ef6MGSYLEbigo55CpPBGEGSGYwwB7xhY,2015,261,6,0.25,1,0,2,34240240.0,white


In [5]:
# (rows, columns) of the loaded frame.
bitcoin.shape

(2333357, 11)

In [6]:
# Column dtypes and memory footprint of the loaded frame.
bitcoin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2333357 entries, 0 to 2333356
Data columns (total 11 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   address     object 
 2   year        int64  
 3   day         int64  
 4   length      int64  
 5   weight      float64
 6   count       int64  
 7   looped      int64  
 8   neighbors   int64  
 9   income      float64
 10  label       object 
dtypes: float64(2), int64(7), object(2)
memory usage: 195.8+ MB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
bitcoin.describe()

Unnamed: 0.1,Unnamed: 0,year,day,length,weight,count,looped,neighbors,income
count,2333357.0,2333357.0,2333357.0,2333357.0,2333357.0,2333357.0,2333357.0,2333357.0,2333357.0
mean,1166678.0,2014.476,181.4778,45.04554,0.5456381,722.378,238.7287,2.213161,4383004000.0
std,673582.3,2.257312,103.9893,59.00348,3.652788,1689.861,967.0131,18.8823,152021100000.0
min,0.0,2011.0,1.0,0.0,1.420108e-90,1.0,0.0,1.0,30000000.0
25%,583339.0,2013.0,92.0,2.0,0.02152083,1.0,0.0,1.0,74340000.0
50%,1166678.0,2014.0,181.0,8.0,0.25,1.0,0.0,2.0,200000000.0
75%,1750017.0,2016.0,271.0,108.0,0.8794643,57.0,0.0,2.0,994066900.0
max,2333356.0,2018.0,365.0,144.0,1943.749,14497.0,14496.0,12920.0,49824470000000.0


# Data Cleaning


In [9]:
# List the column names before deciding which ones to drop.
bitcoin.columns

Index(['Unnamed: 0', 'address', 'year', 'day', 'length', 'weight', 'count',
       'looped', 'neighbors', 'income', 'label'],
      dtype='object')

In [25]:
# Remove the leftover CSV index column and the raw address string.
# `errors='ignore'` keeps this cell re-runnable: it rebinds `bitcoin`, so a second
# execution would otherwise raise KeyError on columns that are already gone.
bitcoin = bitcoin.drop(columns=['Unnamed: 0', 'address'], errors='ignore')
bitcoin.head()

Unnamed: 0,year,day,length,weight,count,looped,neighbors,income,label
0,2011,287,2,0.25,1,0,2,300950000.0,white
1,2015,77,0,1.0,1,0,1,48200000.0,white
2,2011,164,52,0.000977,23,0,2,23495820000.0,white
3,2014,86,144,1e-06,1555,1152,2,95812740.0,white
4,2015,261,6,0.25,1,0,2,34240240.0,white


In [26]:
# Number of missing values in each remaining column.
bitcoin.isna().sum()

year         0
day          0
length       0
weight       0
count        0
looped       0
neighbors    0
income       0
label        0
dtype: int64

# Data Visualization

In [77]:
# Inspect the row index of the cleaned frame.
bitcoin.index

RangeIndex(start=0, stop=2333357, step=1)

In [92]:
# Sum of the `count` column for every non-white label, largest first.
# Selecting `count` before aggregating avoids summing all the other columns.
# NOTE(review): this adds up the `count` feature; it is not the number of rows
# per label -- confirm that this is the intended measure.
(
    bitcoin.loc[bitcoin['label'] != 'white']
    .groupby('label')['count']
    .sum()
    .sort_values(ascending=False)
)

label
Locky                  5542529
Cerber                 5402451
CryptoWall             4198413
CryptoLocker           2244009
CryptXXX               1535356
DMALockerv3             277538
DMALocker               205529
NoobCrypt               123353
WannaCry                106545
SamSam                   45609
Globev3                  38764
Globe                    31348
XTPLocker                20620
EDA2                     19376
GlobeImposter            18007
APT                      14439
Razy                      9683
XLockerv5.0               6332
CryptConsole              5822
XLocker                   4511
Jigsaw                    3617
KeRanger                  3347
Flyper                    2916
CryptoTorLocker2015       2837
ComradeCircle             1241
JigSaw                      10
VenusLocker                  6
Sam                          1
Name: count, dtype: int64

In [93]:
# The three ransomware labels with the largest summed `count` values are Locky, Cerber, and CryptoWall.

In [127]:
# Split the rows into 'white' and everything else (ransom), then tabulate the two totals.
num_white = int(bitcoin['label'].eq('white').sum())
num_ransom = len(bitcoin) - num_white
basic = pd.DataFrame(
    {
        'category': ['white', 'ransom'],
        'count': [num_white, num_ransom],
    }
)
basic

Unnamed: 0,category,count
0,white,2300268
1,ransom,33089


In [97]:
# Keep only the rows whose label is 'Locky' and display them.
locky = bitcoin.loc[bitcoin['label'].eq('Locky')]
locky

Unnamed: 0,year,day,length,weight,count,looped,neighbors,income,label
978,2016,217,144,1.981965e-05,4908,990,1,250000000.0,Locky
1165,2016,281,4,1.250000e-01,2,0,2,300000000.0,Locky
1517,2016,277,10,3.125000e-02,2,0,1,400000000.0,Locky
2039,2016,75,4,2.500000e-01,1,0,2,50000000.0,Locky
2307,2016,47,112,2.365264e-10,3,0,1,100000000.0,Locky
...,...,...,...,...,...,...,...,...,...
2331203,2016,218,8,3.125000e-01,2,0,1,250000000.0,Locky
2331727,2016,89,6,1.319444e-01,5,0,1,300000000.0,Locky
2331811,2016,228,4,1.250000e-01,1,0,1,200000000.0,Locky
2332263,2016,256,2,7.500000e-01,1,1,2,395400000.0,Locky


# Data Analysis

# Predictive Model

# Proposal

# Conclusion