In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split

In [None]:
df_normt = pd.read_csv('transactions_norm_standardscaler.csv')

In [None]:
df_normt.head()

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Laundering_type,Is_laundering
0,-0.716449,-1.0,1.0,-0.775333,-0.285139,0.171435,0.23455,0.156459,0.226096,-1.0,-1.0,0
1,-0.7164,-1.0,-1.0,1.0,-0.107099,0.171435,-1.0,0.156459,-0.191672,0.793335,-0.678404,0
2,-0.7164,-1.0,-1.0,-0.208419,0.217274,0.171435,0.23455,0.156459,0.226096,-0.156956,1.0,0
3,-0.716352,-1.0,0.128225,1.0,0.122274,0.171435,0.23455,0.156459,0.226096,-1.0,-0.989758,0
4,-0.716352,-1.0,1.0,-0.416904,-0.337604,0.171435,0.23455,0.156459,0.226096,-1.0,-1.0,0


In [None]:
df_normt.shape

(9504852, 12)

In [None]:
# Using Conditional  List Comphersion to select the feature columns

feature_cols = [item for item in df_normt.columns if item != 'Is_laundering']
feature_cols

['Time',
 'Date',
 'Sender_account',
 'Receiver_account',
 'Amount',
 'Payment_currency',
 'Received_currency',
 'Sender_bank_location',
 'Receiver_bank_location',
 'Payment_type',
 'Laundering_type']

In [None]:
# Applying SMOTE to balance the dataset

from imblearn.over_sampling import SMOTE

X= df_normt[feature_cols]
y= df_normt['Is_laundering']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
# Spliting data into training and testing samples

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=0)

In [None]:
# Loading the RF Blended model

import joblib

rf_loaded = joblib.load('random_forest_model.pkl')

In [None]:
#fit the classifier to the data
rf_loaded_model = rf_loaded.fit(X_train, y_train)

#use the resulting model to predict value on the test set
rf_loaded_predictions = rf_loaded.predict(X_test)


#performance metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

print(confusion_matrix(y_test, rf_loaded_predictions), '\n')
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_loaded_predictions))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, rf_loaded_predictions))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rf_loaded_predictions)), '\n')

print(classification_report(y_test, rf_loaded_predictions))
print("Accuracy:", accuracy_score(y_test, rf_loaded_predictions))
print("AUC:", roc_auc_score(y_test, rf_loaded_predictions))

[[2366612    6885]
 [   1061 2372932]] 

Mean Absolute Error: 0.001673726537601975
Mean Squared Error: 0.001673726537601975
Root Mean Squared Error: 0.040911203081820696 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2373497
           1       1.00      1.00      1.00   2373993

    accuracy                           1.00   4747490
   macro avg       1.00      1.00      1.00   4747490
weighted avg       1.00      1.00      1.00   4747490

Accuracy: 0.998326273462398
AUC: 0.9983261452775092


In [None]:
#performance metrics

from sklearn.model_selection import cross_val_predict

#predicted values
y_pred = cross_val_predict(rf_loaded_model, X_resampled, y_resampled, cv=5)

print(confusion_matrix(y_resampled, y_pred), '\n')
print('Mean Absolute Error:', metrics.mean_absolute_error(y_resampled, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_resampled, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_resampled, y_pred)), '\n')

print(classification_report(y_resampled, y_pred))
print("Accuracy:", accuracy_score(y_resampled, y_pred))
print("AUC:", roc_auc_score(y_resampled, y_pred))

[[8206084 1288895]
 [   4174 9490805]] 

Mean Absolute Error: 0.0680922517048221
Mean Squared Error: 0.0680922517048221
Root Mean Squared Error: 0.26094492082587484 

              precision    recall  f1-score   support

           0       1.00      0.86      0.93   9494979
           1       0.88      1.00      0.94   9494979

    accuracy                           0.93  18989958
   macro avg       0.94      0.93      0.93  18989958
weighted avg       0.94      0.93      0.93  18989958

Accuracy: 0.9319077482951779
AUC: 0.9319077482951779


In [None]:

# Save predictions as a CSV file
pd.DataFrame(rf_loaded_predictions, columns=['Predictions- BlendingRF_M']).to_csv('Predictions- BlendingRF_M.csv', index=False)

In [None]:
rf_loaded_predictions.shape

(4747490,)

In [None]:
X_test.shape

(4747490, 11)

In [None]:
X_test.keys()

Index(['Time', 'Date', 'Sender_account', 'Receiver_account', 'Amount',
       'Payment_currency', 'Received_currency', 'Sender_bank_location',
       'Receiver_bank_location', 'Payment_type', 'Laundering_type'],
      dtype='object')

In [None]:
y_test.shape

(4747490,)

In [None]:
y_test.value_counts()

1    2373993
0    2373497
Name: Is_laundering, dtype: int64

In [None]:
y_test_series = pd.Series(y_test, name='Actual Is_laundering').reset_index(drop=True)
rf_predictions_series = pd.Series(rf_loaded_predictions, name='Predicted BlendingRF_M').reset_index(drop=True)
# Reset the index of X_test
X_test_reset = X_test.reset_index(drop=True)

combined_df = pd.concat([X_test_reset, y_test_series, rf_predictions_series], axis=1)

In [None]:
combined_df.head()

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Laundering_type,Actual Is_laundering,Predicted BlendingRF_M
0,0.22845,-0.815028,-0.932483,-1.0,-0.332644,0.171435,0.23455,0.156459,0.226096,0.318189,1.0,0,0
1,0.555261,-0.738835,-1.0,-0.769014,-0.339397,0.171435,0.23455,0.156459,0.226096,-1.0,1.0,0,0
2,0.687549,1.0,1.0,0.393095,0.295608,0.171435,0.23455,0.156459,0.226096,-0.156956,-0.989758,0,0
3,1.0,0.623569,-0.027022,-0.506008,-0.229645,0.171435,0.23455,0.156459,0.226096,0.318189,1.0,1,1
4,0.642498,-0.640871,1.0,-0.730042,0.12814,0.171435,0.23455,0.156459,0.226096,-1.0,-0.989758,0,0


In [None]:
combined_df.notnull().sum()

Time                      4747490
Date                      4747490
Sender_account            4747490
Receiver_account          4747490
Amount                    4747490
Payment_currency          4747490
Received_currency         4747490
Sender_bank_location      4747490
Receiver_bank_location    4747490
Payment_type              4747490
Laundering_type           4747490
Actual Is_laundering      4747490
Predicted BlendingRF_M    4747490
dtype: int64

In [None]:
combined_df.isnull().sum()

Time                      0
Date                      0
Sender_account            0
Receiver_account          0
Amount                    0
Payment_currency          0
Received_currency         0
Sender_bank_location      0
Receiver_bank_location    0
Payment_type              0
Laundering_type           0
Actual Is_laundering      0
Predicted BlendingRF_M    0
dtype: int64

In [None]:
combined_df.shape

(4747490, 13)

In [None]:
# Save the DataFrame to a CSV file
combined_df.to_csv('combined_data_BlendingRFpreds.csv', index=False)

In [None]:
pip install yellowbrick

Defaulting to user installation because normal site-packages is not writeable
Collecting yellowbrick
  Downloading yellowbrick-1.5-py3-none-any.whl (282 kB)
     |████████████████████████████████| 282 kB 9.5 MB/s            
  Downloading yellowbrick-1.4-py3-none-any.whl (274 kB)
     |████████████████████████████████| 274 kB 41.2 MB/s            
[?25h  Downloading yellowbrick-1.3.post1-py3-none-any.whl (271 kB)
     |████████████████████████████████| 271 kB 36.9 MB/s            
Installing collected packages: yellowbrick
Successfully installed yellowbrick-1.3.post1
Note: you may need to restart the kernel to use updated packages.
