<a href="https://colab.research.google.com/github/Blockchain-Framework/bitcoin-anomaly-analysis/blob/develop/Notebooks/Bitcoin_Binary_Hyperpareameter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold, RFECV
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [3]:
train = pd.read_csv('/content/drive/MyDrive/bitcoin_imbalance_train_set.csv')
test = pd.read_csv("/content/drive/MyDrive/bitcoin_imbalance_test.csv")

In [4]:
original_feature_names = ['input_transaction_count', 'output_transaction_count',
       'input_value_mean', 'input_value_median', 'input_value_maximum',
       'input_value_minimum', 'input_value_25th_percentile',
       'input_value_75th_percentile', 'input_value_range',
       'input_spending_value_usd_mean', 'input_spending_value_usd_median',
       'input_spending_value_usd_maximum', 'input_spending_value_usd_minimum',
       'input_spending_value_usd_25th_percentile',
       'input_spending_value_usd_75th_percentile',
       'input_spending_value_usd_range', 'input_time_diff_mean',
       'input_time_diff_median', 'input_time_diff_maximum',
       'input_time_diff_minimum', 'input_time_diff_25th_percentile',
       'input_time_diff_75th_percentile', 'input_time_diff_range',
       'output_value_usd_mean', 'output_value_usd_median',
       'output_value_usd_maximum', 'output_value_usd_minimum',
       'output_value_usd_25th_percentile', 'output_value_usd_75th_percentile',
       'output_value_usd_range', 'output_time_diff_mean',
       'output_time_diff_median', 'output_time_diff_maximum',
       'output_time_diff_minimum', 'output_time_diff_25th_percentile',
       'output_time_diff_75th_percentile', 'output_time_diff_range',
       'output_value_mean', 'output_value_median', 'output_value_maximum',
       'output_value_minimum', 'output_value_25th_percentile',
       'output_value_75th_percentile', 'output_value_range',
       'input_output_mean_ratio', 'input_output_max_ratio',
       'input_output_min_ratio', 'input_output_percentile_25_diff',
       'input_output_percentile_75_diff', 'range_mean_ratio_input',
       'range_mean_ratio_output', 'input_output_usd_mean_ratio',
       'input_output_usd_max_ratio', 'input_output_usd_min_ratio',
       'input_output_usd_percentile_25_diff',
       'input_output_usd_percentile_75_diff', 'input_range_mean_ratio',
       'output_range_mean_ratio', 'input_output_transaction_count_ratio',
       'input_output_maximum_transaction_value_diff',
       'input_output_minimum_transaction_value_diff',
       'input_output_time_diff_max_min_diff']

In [5]:
X_train = train[original_feature_names]
y_train = train['label']

In [6]:
y_train = y_train.apply(lambda x: 1 if x in [1, 2, 3] else 0)

In [7]:
X_test = test[original_feature_names]
y_test = test['label']

In [8]:
y_test = y_test.apply(lambda x: 1 if x in [1, 2, 3] else 0)

In [13]:
X = X_train.copy()

In [14]:
selector = VarianceThreshold()
X_var = selector.fit_transform(X)

# Calculate correlation matrix
corr_matrix = pd.DataFrame(X_var, columns=X.columns[selector.get_support()]).corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

# Drop highly correlated features
X_uncorr = pd.DataFrame(X_var, columns=X.columns[selector.get_support()]).drop(to_drop, axis=1)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [15]:
X_uncorr.head()

Unnamed: 0,input_transaction_count,input_value_mean,input_value_median,input_value_maximum,input_value_range,input_time_diff_mean,input_time_diff_median,input_time_diff_maximum,output_time_diff_mean,output_time_diff_median,...,input_output_min_ratio,input_output_percentile_25_diff,input_output_percentile_75_diff,range_mean_ratio_input,input_output_usd_min_ratio,input_output_usd_percentile_25_diff,input_output_transaction_count_ratio,input_output_maximum_transaction_value_diff,input_output_minimum_transaction_value_diff,input_output_time_diff_max_min_diff
0,5.0,4126654.4,3775772.0,10000000.0,9900000.0,6188034.0,899043.0,24258799.0,8059502.6,51199.0,...,1.0,0.0,0.0,2.399038,0.970265,165.8845,1.0,0.0,0.0,-15775957.0
1,1.0,139863.0,139863.0,139863.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.987312,-0.1896,1.0,0.0,0.0,0.0
2,1.0,4600000.0,4600000.0,4600000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.91157,-42.7983,1.0,0.0,0.0,0.0
3,1.0,2538940.0,2538940.0,2538940.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.395747,69.2866,1.0,0.0,0.0,0.0
4,1.0,739047.0,739047.0,739047.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0448,3.3858,1.0,0.0,0.0,0.0


In [16]:
X_uncorr.columns

Index(['input_transaction_count', 'input_value_mean', 'input_value_median',
       'input_value_maximum', 'input_value_range', 'input_time_diff_mean',
       'input_time_diff_median', 'input_time_diff_maximum',
       'output_time_diff_mean', 'output_time_diff_median',
       'output_time_diff_maximum', 'input_output_mean_ratio',
       'input_output_max_ratio', 'input_output_min_ratio',
       'input_output_percentile_25_diff', 'input_output_percentile_75_diff',
       'range_mean_ratio_input', 'input_output_usd_min_ratio',
       'input_output_usd_percentile_25_diff',
       'input_output_transaction_count_ratio',
       'input_output_maximum_transaction_value_diff',
       'input_output_minimum_transaction_value_diff',
       'input_output_time_diff_max_min_diff'],
      dtype='object')

In [17]:
X_train = X_train[X_uncorr.columns]
X_test = X_test[X_uncorr.columns]

In [19]:
train['label'].value_counts()

0    40001
2    13757
3    11228
1     7823
Name: label, dtype: int64

In [18]:
y_train.value_counts()

0    40001
1    32808
Name: label, dtype: int64

## Hyperparamter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.preprocessing import StandardScaler

pipeline_rf_up = make_pipeline_imb(
    StandardScaler(),
    RandomForestClassifier(random_state=42)
)


# Define the parameter grid
param_grid_rf = {
    'randomforestclassifier__n_estimators': [100, 200, 300, 500],
    'randomforestclassifier__max_features': ['auto', 'sqrt', 'log2'],
    'randomforestclassifier__max_depth': [10, 20, 30, 40, 50, None],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    'randomforestclassifier__bootstrap': [True, False],
    'randomforestclassifier__criterion': ['gini', 'entropy']
}

# Initialize RandomizedSearchCV
random_search_rf = RandomizedSearchCV(
    pipeline_rf_up,
    param_distributions=param_grid_rf,
    n_iter=100,  # Number of parameter settings sampled. Increase or decrease based on computational resource.
    cv=5,
    scoring='average_precision',
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use parallel computation. Set to -1 to use all available cores.
)

# Fit RandomizedSearchCV to the training data
random_search_rf.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found: ", random_search_rf.best_params_)


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.preprocessing import StandardScaler

pipeline_svm_up = make_pipeline_imb(
    StandardScaler(),
    SVC(random_state=42)
)

# Define the parameter grid
param_grid_svm = {
    'svc__C': [0.1, 1, 10, 100, 1000],
    'svc__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10, 100],
    'svc__kernel': ['rbf', 'poly', 'sigmoid'],
    'svc__degree': [2, 3, 4, 5],  # Only used for 'poly' kernel
    'svc__coef0': [0.0, 0.5, 1.0]  # Only significant for 'poly' and 'sigmoid' kernels
}


# Initialize RandomizedSearchCV
random_search_rf = RandomizedSearchCV(
    pipeline_svm_up,
    param_distributions=param_grid_svm,
    n_iter=100,  # Number of parameter settings sampled. Increase or decrease based on computational resource.
    cv=5,
    scoring='average_precision',
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use parallel computation. Set to -1 to use all available cores.
)

# Fit RandomizedSearchCV to the training data
random_search_rf.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found: ", random_search_rf.best_params_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.preprocessing import StandardScaler

pipeline_xgb_up = make_pipeline_imb(
    StandardScaler(),
    xgb.XGBClassifier(random_state=42)
)

# Define the parameter grid
param_grid_xgb = {
    'xgbclassifier__n_estimators': [100, 200, 300, 500],
    'xgbclassifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'xgbclassifier__max_depth': [3, 4, 5, 6, 7, 8],
    'xgbclassifier__min_child_weight': [1, 2, 5, 10],
    'xgbclassifier__gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'xgbclassifier__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'xgbclassifier__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'xgbclassifier__reg_alpha': [0, 0.1, 0.5, 1],
    'xgbclassifier__reg_lambda': [1, 0.1, 0.5, 0.01]
}



# Initialize RandomizedSearchCV
random_search_rf = RandomizedSearchCV(
    pipeline_xgb_up,
    param_distributions=param_grid_xgb,
    n_iter=100,  # Number of parameter settings sampled. Increase or decrease based on computational resource.
    cv=5,
    scoring='average_precision',
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use parallel computation. Set to -1 to use all available cores.
)

# Fit RandomizedSearchCV to the training data
random_search_rf.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found: ", random_search_rf.best_params_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.preprocessing import StandardScaler

pipeline_knn_up = make_pipeline_imb(
    StandardScaler(),
    KNeighborsClassifier()
)

# Define the parameter grid
param_grid_knn = {
    'kneighborsclassifier__n_neighbors': [3, 5, 7, 9, 11, 15],
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'kneighborsclassifier__leaf_size': [10, 30, 50, 70],
    'kneighborsclassifier__p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}




# Initialize RandomizedSearchCV
random_search_rf = RandomizedSearchCV(
    pipeline_knn_up,
    param_distributions=param_grid_knn,
    n_iter=100,  # Number of parameter settings sampled. Increase or decrease based on computational resource.
    cv=5,
    scoring='average_precision',
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use parallel computation. Set to -1 to use all available cores.
)

# Fit RandomizedSearchCV to the training data
random_search_rf.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found: ", random_search_rf.best_params_)