In [None]:
import os
import warnings
import pickle
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import boxcox
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
import pdb
import datetime
from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, log_loss, plot_roc_curve, f1_score, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV
import matplotlib
from prettytable import PrettyTable
from pprint import pprint

warnings.filterwarnings("ignore")

## Exploratory Data Analysis

We'll use the download_data script in order to download our BitcoinHeist data.


In [None]:
!python download_data.py

### Loading the data

We have 2,916,697 examples in hand, which we split into training and testing datasets as follows:


*   The training dataset contains 75% of the original dataset. It is accessible in CSV format under the train folder.

*   The testing dataset contains 25% of the original dataset. It is accessible in CSV format under the test folder.



The next function allows us to load the training and testing datasets.

In [1]:
from glob import glob

def get_file_list_from_dir(*, path, datadir):
    """Return the sorted list of gzipped CSV files of one dataset split.

    Parameters
    ----------
    path : str
        Root directory that contains the ``data`` folder.
    datadir : str
        Name of the sub-folder of ``data`` to scan (e.g. ``"train"``).

    Returns
    -------
    list of str
        Paths matching ``<path>/data/<datadir>/*.csv.gz`` in ascending
        order; empty when nothing matches.
    """
    pattern = os.path.join(path, "data", datadir, "*.csv.gz")
    matches = glob(pattern)
    # glob makes no ordering promise, so sort for a deterministic load order.
    matches.sort()
    return matches

In [None]:
# Loading the training dataset.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each CSV shard keeps its own 0-based index and the labels repeat across shards.
train_files = get_file_list_from_dir(path=".", datadir="train")
dtrain = pd.concat((pd.read_csv(f) for f in train_files), ignore_index=True)

# Loading the testing dataset (same treatment as the training shards).
test_files = get_file_list_from_dir(path=".", datadir="test")
dtest = pd.concat((pd.read_csv(f) for f in test_files), ignore_index=True)


In [None]:
dtrain.head()

In [2]:
dtrain.shape # the training dataframe has 2 187 523 rows and 10 columns

In [None]:
dtest.head()

In [None]:
dtest.shape # the testing dataframe has 729 174 rows and 10 columns

In [None]:
print(dtrain.columns) #the names of the columns

Index(['address', 'year', 'day', 'length', 'weight', 'count', 'looped',
       'neighbors', 'income', 'label'],
      dtype='object')


In [None]:
dtrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916697 entries, 0 to 2916696
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   address    object 
 1   year       int64  
 2   day        int64  
 3   length     int64  
 4   weight     float64
 5   count      int64  
 6   looped     int64  
 7   neighbors  int64  
 8   income     float64
 9   label      object 
dtypes: float64(2), int64(6), object(2)
memory usage: 222.5+ MB


In [3]:
dtrain.describe()

In [4]:
dtrain.describe(include=object) # also show descriptive statistics for the object columns

As we've seen from the few descriptions of the dataset above, some columns are of type float, some integer and some object. 

We'll now proceed to do some exploratory data analysis to have more insight on our dataset.

### Feature exploration

In this part we'll try to better understand the features in our dataset, and mainly the different classes of the label, which is our target variable.

We'll also see if there are any missing values.

In [None]:
#checking which columns contain NaN values
dtrain.isna().any()

address      False
year         False
day          False
length       False
weight       False
count        False
looped       False
neighbors    False
income       False
label        False
dtype: bool

In [15]:
dtrain.year.unique() # distinct years present in the training data (TODO: record the first and last year once the output is available)


In [14]:
dtrain.year.value_counts() # number of rows per year; reportedly about the same for each year (confirm against the output)

In [None]:
# Number of distinct values taken by `looped` (was `df`, which is never defined in this notebook).
dtrain.looped.unique().shape


(10168,)

In [None]:
# Number of distinct values taken by `neighbors` (was `df`, which is never defined in this notebook).
dtrain.neighbors.unique().shape

(814,)

In [None]:
# Number of distinct values taken by `count` (was `df`, which is never defined in this notebook).
# Bracket access is required: `dtrain.count` is the DataFrame.count method, not the column.
dtrain['count'].unique().shape

(11572,)

We'll focus on our target variable "label": the features take so many distinct values that studying their unique values would not be helpful.

We can however see how their values change with regard to the class of the label.

In [None]:
dtrain.label.unique().shape

(29,)

In [12]:
dtrain.label.unique()

The label takes 29 different values, which we can group into two major categories: the white category, and the ransomware category containing all the values that are not white.

In [5]:
dtrain.loc[dtrain["label"]=="white", "label"].value_counts()

In [6]:
dtrain.loc[dtrain["label"]!="white", "label"].value_counts()

In [None]:
# Labels of every non-white row, selected with a single .loc call.
df1 = dtrain.loc[dtrain["label"] != "white", "label"]

In [7]:
len(df1)

We have .... white labels vs .... non-white labels, so the data is imbalanced.

This should be taken into consideration when choosing
the performance evaluation metric.

In [None]:
# Year of every non-white row, selected with a single .loc call.
df2 = dtrain.loc[dtrain["label"] != "white", "year"]

In [13]:
df2.value_counts() 

We can see that the number of ransomware examples varies greatly from one year to another, with some years far above the rest. For example, the count peaked in 2016.

In [None]:
# `neighbors` value of every non-white row; note that this rebinds df2.
df2 = dtrain.loc[dtrain["label"] != "white", "neighbors"]


In [8]:
#df2.value_counts() #most of the ransomware cases are when the value of neighbors is either 2 or 1