In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Importing Libraries and Datasets

In [2]:
# Data preprocessing
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from google.colab import files

# Modelling
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.model_selection import train_test_split,cross_val_predict, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score
from scipy.stats import randint

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
data = pd.read_csv('exchangedata.csv') # link to exchange data csv file

## Data Preprocessing

In [None]:
data.head()

Unnamed: 0,domain,trust_score,blacklisted,ip_bad_count_bad,ssl_valid,ip_country,anonymous_whois,dnsblock_threat_count,whois_registration_duration,whois_valid_email,avg_review_score,facebook_comments_negative_count,facebook_comments_positive_count,scamadviser_votes_legit,scamadviser_votes_scam,scamadviser_votes_fake,is_legit
0,merlinswap.org,56.0,0,0.29,1,US,1,0,0.53,0,0.0,0.0,0.0,0,0,0,1
1,fatbtc.com,1.0,0,0.03,1,US,1,0,10.51,0,1.49,0.0,0.0,0,0,0,1
2,camelot.exchange,12.0,0,0.26,1,US,1,0,1.82,0,0.0,0.0,0.0,0,0,0,1
3,fast.exchange.onetrading.com,80.0,0,0.0,1,US,1,0,20.05,1,0.0,0.0,0.0,0,0,0,1
4,dackieswap.xyz,1.0,0,0.41,1,US,1,0,1.28,0,1.0,0.0,0.0,0,0,0,1


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1084 entries, 0 to 1083
Data columns (total 17 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   domain                            1064 non-null   object 
 1   trust_score                       1084 non-null   float64
 2   blacklisted                       1084 non-null   int64  
 3   ip_bad_count_bad                  1084 non-null   float64
 4   ssl_valid                         1084 non-null   int64  
 5   ip_country                        1084 non-null   object 
 6   anonymous_whois                   1084 non-null   int64  
 7   dnsblock_threat_count             1084 non-null   int64  
 8   whois_registration_duration       1084 non-null   float64
 9   whois_valid_email                 1084 non-null   int64  
 10  avg_review_score                  1084 non-null   float64
 11  facebook_comments_negative_count  1084 non-null   float64
 12  facebo

In [None]:
data.isnull().sum()

domain                              20
trust_score                          0
blacklisted                          0
ip_bad_count_bad                     0
ssl_valid                            0
ip_country                           0
anonymous_whois                      0
dnsblock_threat_count                0
whois_registration_duration          0
whois_valid_email                    0
avg_review_score                     0
facebook_comments_negative_count     0
facebook_comments_positive_count     0
scamadviser_votes_legit              0
scamadviser_votes_scam               0
scamadviser_votes_fake               0
is_legit                             0
dtype: int64

In [5]:
# drop domain name
data.drop('domain', axis=1, inplace=True)

In [6]:
class_count = data['is_legit'].value_counts()
min_count = class_count.min()
max_count = class_count.max()
print(min_count/max_count)

# no imbalance detected

0.8466780238500852


### Feature engineering
- Converting categorical data
- Scaling and nomalising


In [7]:
encoder = OneHotEncoder(sparse=False)
encoded_data = encoder.fit_transform(data[['ip_country']])

# convert encoded data to dataframe
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['ip_country']))
e_data = pd.concat([data, encoded_df], axis=1)
e_data.drop('ip_country', axis=1, inplace=True)



In [8]:
# Scaling numerical features
scaler = StandardScaler()
numerical = ['trust_score', 'ip_bad_count_bad', 'dnsblock_threat_count', 'whois_registration_duration', 'avg_review_score',
             'facebook_comments_negative_count', 'facebook_comments_positive_count', 'scamadviser_votes_legit', 'scamadviser_votes_scam',
             'scamadviser_votes_fake']
scaler.fit_transform(e_data[numerical])

array([[ 0.31729603,  0.47005701, -0.28159548, ..., -0.22367834,
        -0.22878307, -0.22103199],
       [-1.1499027 , -0.62139849, -0.28159548, ..., -0.22367834,
        -0.22878307, -0.22103199],
       [-0.85646295,  0.34411984, -0.28159548, ..., -0.22367834,
        -0.22878307, -0.22103199],
       ...,
       [-0.4296415 , -0.74733566, -0.28159548, ..., -0.22367834,
        -0.22878307, -0.22103199],
       [ 1.49105501, -0.57941943, -0.28159548, ...,  4.80676409,
         5.77633513,  5.69802945],
       [-0.82978661,  0.3860989 , -0.28159548, ..., -0.22367834,
        -0.22878307, -0.22103199]])

## Model params and feature selection

In [9]:
rf = RandomForestClassifier(n_estimators = 10,
                            max_features = int(np.sqrt(17)),
                            max_depth = None,
                            min_samples_split = 2,
                            bootstrap = True )

lgbm = lgb.LGBMClassifier(random_state=42,
                          learning_rate = 0.1,
                          max_depth = 7,
                          n_estimators = 100,
                          num_leaves=31)

In [10]:
# building the soft ensemble
soft_ensemble = VotingClassifier(
    estimators=[('rf', rf), ('lgbm', lgbm)],
    voting='soft'
)

In [11]:
# Splitting into features (X) and target (y)
X = e_data.drop('is_legit', axis=1)
y = e_data['is_legit']

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1084 entries, 0 to 1083
Data columns (total 41 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   trust_score                       1084 non-null   float64
 1   blacklisted                       1084 non-null   int64  
 2   ip_bad_count_bad                  1084 non-null   float64
 3   ssl_valid                         1084 non-null   int64  
 4   anonymous_whois                   1084 non-null   int64  
 5   dnsblock_threat_count             1084 non-null   int64  
 6   whois_registration_duration       1084 non-null   float64
 7   whois_valid_email                 1084 non-null   int64  
 8   avg_review_score                  1084 non-null   float64
 9   facebook_comments_negative_count  1084 non-null   float64
 10  facebook_comments_positive_count  1084 non-null   float64
 11  scamadviser_votes_legit           1084 non-null   int64  
 12  scamad

In [None]:
# define 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True)

# Train & evaluate Model

## Experiment with all features

In [None]:
# Evaluating model perfomance using k-fold cross validation to test the performance
# soft ensemble
y_pred_all = cross_val_predict(soft_ensemble, X, y, cv=kf, method='predict')
y_pred_proba_all = cross_val_predict(soft_ensemble, X, y, cv=kf, method='predict_proba')[:, 1]

accuracy_scores = cross_val_score(soft_ensemble, X, y, cv=kf, scoring='accuracy')
precision_scores = cross_val_score(soft_ensemble, X, y, cv=kf, scoring='precision')
recall_scores = cross_val_score(soft_ensemble, X, y, cv=kf, scoring='recall')
f1_scores = cross_val_score(soft_ensemble, X, y, cv=kf, scoring='f1')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Number of positive: 445, number of negative: 531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000341 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 533
[LightGBM] [Info] Number of data points in the train set: 976, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.455943 -> initscore=-0.176688
[LightGBM] [Info] Start training from score -0.176688
[LightGBM] [Info] Number of positive: 448, number of negative: 528
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000353 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 536
[LightGBM] [Info] Number of data points in the train set: 

In [None]:
# Print average metrics
print("Avg accuracy: ", np.mean(accuracy_scores), "s.d: ", np.std(accuracy_scores))
print("Avg precision: ", np.mean(precision_scores), "s.d: ", np.std(precision_scores))
print("Avg recall: ", np.mean(recall_scores), "s.d: ", np.std(recall_scores))
print("Avg f1: ", np.mean(f1_scores), "s.d: ", np.std(f1_scores))

Avg accuracy:  0.9621984369690791 s.d:  0.016706399941391826
Avg precision:  0.9751722748729696 s.d:  0.007742206397486681
Avg recall:  0.9491992983111572 s.d:  0.04118944155987024
Avg f1:  0.9515301230612199 s.d:  0.034223758029103366


## Experiment with selected features

In [33]:
# Select features
features = ['whois_registration_duration','trust_score','avg_review_score','ip_bad_count_bad',
            'scamadviser_votes_scam', 'ssl_valid', 'whois_valid_email','scamadviser_votes_legit',
            'facebook_comments_negative_count','anonymous_whois','dnsblock_threat_count','scamadviser_votes_fake' ]

In [34]:
# selecting important features only
selected_X = X[features]

In [None]:
# Evaluating model perfomance using k-fold cross validation to test the performance
# soft ensemble
y_pred_all = cross_val_predict(soft_ensemble, selected_X, y, cv=kf, method='predict')
y_pred_proba_all = cross_val_predict(soft_ensemble, selected_X, y, cv=kf, method='predict_proba')[:, 1]

sel_accuracy_scores = cross_val_score(soft_ensemble, selected_X, y, cv=kf, scoring='accuracy')
sel_precision_scores = cross_val_score(soft_ensemble, selected_X, y, cv=kf, scoring='precision')
sel_recall_scores = cross_val_score(soft_ensemble, selected_X, y, cv=kf, scoring='recall')
sel_f1_scores = cross_val_score(soft_ensemble, selected_X, y, cv=kf, scoring='f1')

In [None]:
# Print average metrics
print("Avg accuracy: ", np.mean(sel_accuracy_scores), "s.d: ", np.std(sel_accuracy_scores))
print("Avg precision: ", np.mean(sel_precision_scores), "s.d: ", np.std(sel_precision_scores))
print("Avg recall: ", np.mean(sel_recall_scores), "s.d: ", np.std(sel_recall_scores))
print("Avg f1: ", np.mean(sel_f1_scores), "s.d: ", np.std(sel_f1_scores))

In [None]:
# Compare the results
print(f"Improvement in accuracy: {sel_accuracy_scores.mean() - accuracy_scores.mean():.4f}")
print(f"Improvement in precision: {sel_precision_scores.mean() - precision_scores.mean():.4f}")
print(f"Improvement in recall: {sel_recall_scores.mean() - recall_scores.mean():.4f}")
print(f"Improvement in F1-score: {sel_f1_scores.mean() - f1_scores.mean():.4f}")

In [None]:
from scipy import stats

def test(score1, score2):
  # Perform paired t-test
  t_stat, p_value = stats.ttest_rel(score1, score2)

  print(f'Paired t-test statistic: {t_stat:.4f}')
  print(f'P-value: {p_value:.4f}')

  if p_value < 0.05:
      print("The difference between the models with all features and selected features is statistically significant.")
  else:
      print("The difference between the models with all features and selected features is not statistically significant.")


In [None]:
print("Accuracy - ", end = " ")
test(accuracy_scores, sel_accuracy_scores)

print("Precision - ", end = " ")
test(precision_scores, sel_precision_scores)

print("Recall - ", end = " ")
test(recall_scores, sel_recall_scores)

print("F1 score - ", end = " ")
test(f1_scores, sel_f1_scores)

In [None]:
# roc function

def plot_roc_curve(y_true, y_scores, title):
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)

    # Plot the ROC curve
    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.show()

    data_filename = title+'.csv'
    # Save the ROC curve data
    roc_data = pd.DataFrame({'fpr': fpr, 'tpr': tpr})
    roc_data.to_csv(data_filename, index=False)

    # Download the files to local machine
    files.download(data_filename)


In [None]:
# ROC curve for all features
plot_roc_curve(y, y_pred_proba_all, 'RF ROC Curve with All Features')

# ROC curve for important features
plot_roc_curve(y, y_pred_proba_important, 'RF ROC Curve with Important Features')


# Saving model for later use

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X[selected_X.columns], y, test_size=0.2, random_state=42, stratify=y)

In [42]:
import joblib

soft_ensemble.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 398, number of negative: 469
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000273 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 498
[LightGBM] [Info] Number of data points in the train set: 867, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.459054 -> initscore=-0.164151
[LightGBM] [Info] Start training from score -0.164151


In [43]:
# saving model

joblib.dump(soft_ensemble, 'ensemble.pkl')

['/content/drive/My Drive/ensemble.pkl']