In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# --- Data preparation -------------------------------------------------------
# Builds a moderately balanced licit/illicit dataset from the BitcoinHeist CSV,
# derives a binary `Illicit` target and splits it into train/test arrays.

# Tunable settings for this cell
DATA_PATH = "../datasets/BitcoinHeistData.csv"
SEED = 42                    # seeds every random step so a re-run reproduces the split
N_LICIT_SAMPLES = 100000     # number of licit ('white') rows kept
MAX_ILLICIT_WEIGHT = 100     # illicit rows with weight >= this are treated as outliers

# opening the dataset
dataset = pd.read_csv(DATA_PATH)

# 'white' marks licit addresses (the majority class); any other label is illicit
is_licit = dataset['label'] == 'white'
licitAddresses = dataset.loc[is_licit]
IllicitAddresses = dataset.loc[~is_licit]

# removing an outlier from the illicit addresses
IllicitAddresses = IllicitAddresses.loc[IllicitAddresses['weight'] < MAX_ILLICIT_WEIGHT]

# taking a random subset of the licit addresses (so as to not have very imbalanced data)
licit_subset = licitAddresses.sample(N_LICIT_SAMPLES, random_state=SEED)

# merging the two classes
Full_Dataset = pd.concat([IllicitAddresses, licit_subset])

# adding the Illicit column (0 for licit addresses, 1 for illicit), shuffling the
# rows and dropping incomplete ones; drop=True avoids leftover index columns
DATA = (
    Full_Dataset
    .assign(Illicit=(Full_Dataset['label'] != 'white').astype(int))
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
    .dropna()
)

# Target vector
labels = DATA['Illicit'].to_numpy()

# dropping unnecessary columns (identifiers, dates, the raw label and the target)
feature_frame = DATA.drop(['address', 'year', 'day', 'label', 'Illicit'], axis=1)

# Saving feature names for later use
feature_list = list(feature_frame.columns)

# Convert to numpy array
features = feature_frame.to_numpy()
assert len(features) == len(labels), "features and labels must have the same number of rows"

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.25, random_state=SEED
)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)






Training Features Shape: (106059, 6)
Training Labels Shape: (106059,)
Testing Features Shape: (35353, 6)
Testing Labels Shape: (35353,)


In [2]:
# Experiment #1

from sklearn.model_selection import RandomizedSearchCV

# Random search cross validation
# this is a method to determine the best hyperparameters to use with the model:
# n_iter random combinations are drawn from the grid below and each one is
# scored with cv-fold cross validation on the training set.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers and is rejected by
# scikit-learn >= 1.3, so listing 'sqrt' alone keeps the same effective search space.
max_features = ['sqrt']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Seed the forest as well as the search so that a re-run gives the same scores
rf = RandomForestClassifier(random_state=42)

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100,
                               cv=3, verbose=2, random_state=42, n_jobs=-1)
# Train the model on training data
rf_random.fit(train_features, train_labels);

# NOTE: the search fits copies of `rf`, so `rf` itself stays unfitted; use
# rf_random.best_estimator_ for any prediction on the test set.

# printing the best hyperparameters
rf_random.best_params_


Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'n_estimators': 1577,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': True}

In [7]:
from sklearn import metrics


# Score the best model found by the random search on the held-out test set
best_rf = rf_random.best_estimator_
pred = best_rf.predict(test_features)

# Compute both metrics first, then report them (F1 first, accuracy second)
score = metrics.f1_score(y_true=test_labels, y_pred=pred)
pscore = metrics.accuracy_score(y_true=test_labels, y_pred=pred)

print("F1 score :", score)
print("accuracy : ", pscore)

F1 score : 0.6043782441886707
accuracy :  0.8016575679574577


In [3]:
# Experiment #2

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.ensemble import BalancedRandomForestClassifier
from numpy import mean

# using a balanced random forest model, that makes up for how imbalanced the dataset is

# define model (seeded, like the CV splitter below, so the reported score is reproducible)
model = BalancedRandomForestClassifier(n_estimators=1000, max_depth=20, random_state=1)
# define evaluation procedure: 10 stratified folds, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model using cross validation
# NOTE: cross_val_score fits copies of `model`; `model` itself is still unfitted
# afterwards and must be fitted explicitly before reading feature_importances_.
scores = cross_val_score(model, features, labels, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.839


In [4]:
# Feature importance of the balanced random forest.
# Cross validation only fits copies of `model`, so it has to be fitted here
# before feature_importances_ can be read (otherwise NotFittedError is raised).
model.fit(train_features, train_labels)

# feature_list holds the column names saved when the feature matrix was built
feature_imp = pd.Series(model.feature_importances_, index=feature_list).sort_values(ascending=False)

fig, ax = plt.subplots()
sns.barplot(x=feature_imp.values, y=feature_imp.index, ax=ax)
# Add labels to the graph (no legend: the bars carry no labelled artists)
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Important Features")
fig.savefig("../imgs/Random forest features importance.jpg")
plt.show()