In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import chi2_contingency

In [None]:
li_data = pd.read_csv('LI-Small_Trans.csv')

In [None]:
li_data.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:08,11,8000ECA90,11,8000ECA90,3195403.0,US Dollar,3195403.0,US Dollar,Reinvestment,0
1,2022/09/01 00:21,3402,80021DAD0,3402,80021DAD0,1858.96,US Dollar,1858.96,US Dollar,Reinvestment,0
2,2022/09/01 00:00,11,8000ECA90,1120,8006AA910,592571.0,US Dollar,592571.0,US Dollar,Cheque,0
3,2022/09/01 00:16,3814,8006AD080,3814,8006AD080,12.32,US Dollar,12.32,US Dollar,Reinvestment,0
4,2022/09/01 00:00,20,8006AD530,20,8006AD530,2941.56,US Dollar,2941.56,US Dollar,Reinvestment,0


In [None]:
li_data.shape

(6924049, 11)

In [None]:
li_data.notnull().sum()

Timestamp             6924049
From Bank             6924049
Account               6924049
To Bank               6924049
Account.1             6924049
Amount Received       6924049
Receiving Currency    6924049
Amount Paid           6924049
Payment Currency      6924049
Payment Format        6924049
Is Laundering         6924049
dtype: int64

In [None]:
li_data.isnull().sum()

Timestamp             0
From Bank             0
Account               0
To Bank               0
Account.1             0
Amount Received       0
Receiving Currency    0
Amount Paid           0
Payment Currency      0
Payment Format        0
Is Laundering         0
dtype: int64

In [None]:
li_data.keys()

Index(['Timestamp', 'From Bank', 'Account', 'To Bank', 'Account.1',
       'Amount Received', 'Receiving Currency', 'Amount Paid',
       'Payment Currency', 'Payment Format', 'Is Laundering'],
      dtype='object')

In [None]:
li_data.dtypes

Timestamp              object
From Bank               int64
Account                object
To Bank                 int64
Account.1              object
Amount Received       float64
Receiving Currency     object
Amount Paid           float64
Payment Currency       object
Payment Format         object
Is Laundering           int64
dtype: object

In [None]:
li_data['Timestamp'].describe()

count              6924049
unique               14533
top       2022/09/01 00:22
freq                 15221
Name: Timestamp, dtype: object

In [None]:
li_data['Timestamp'].value_counts()

2022/09/01 00:22    15221
2022/09/01 00:20    15070
2022/09/01 00:01    15062
2022/09/01 00:21    15061
2022/09/01 00:14    15049
                    ...  
2022/09/13 16:54        1
2022/09/13 01:45        1
2022/09/11 14:17        1
2022/09/11 11:31        1
2022/09/14 10:53        1
Name: Timestamp, Length: 14533, dtype: int64

In [None]:
# Convert the 'Timestamp' column to datetime format
li_data['Timestamp'] = pd.to_datetime(li_data['Timestamp'], format='%Y/%m/%d %H:%M:%S')

In [None]:
# Extract the date and time (including seconds) into separate columns
li_data['Date'] = li_data['Timestamp'].dt.date
li_data['Time'] = li_data['Timestamp'].dt.time

In [None]:
li_data.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,Date,Time
0,2022-09-01 00:08:00,11,8000ECA90,11,8000ECA90,3195403.0,US Dollar,3195403.0,US Dollar,Reinvestment,0,2022-09-01,00:08:00
1,2022-09-01 00:21:00,3402,80021DAD0,3402,80021DAD0,1858.96,US Dollar,1858.96,US Dollar,Reinvestment,0,2022-09-01,00:21:00
2,2022-09-01 00:00:00,11,8000ECA90,1120,8006AA910,592571.0,US Dollar,592571.0,US Dollar,Cheque,0,2022-09-01,00:00:00
3,2022-09-01 00:16:00,3814,8006AD080,3814,8006AD080,12.32,US Dollar,12.32,US Dollar,Reinvestment,0,2022-09-01,00:16:00
4,2022-09-01 00:00:00,20,8006AD530,20,8006AD530,2941.56,US Dollar,2941.56,US Dollar,Reinvestment,0,2022-09-01,00:00:00


In [None]:
# Converting From Bank and To Bank to string

bank_columns = ['From Bank', 'To Bank']

for bank in bank_columns:
    li_data[bank] = li_data[bank].astype(str)

In [None]:
li_data.dtypes

Timestamp             datetime64[ns]
From Bank                     object
Account                       object
To Bank                       object
Account.1                     object
Amount Received              float64
Receiving Currency            object
Amount Paid                  float64
Payment Currency              object
Payment Format                object
Is Laundering                  int64
Date                          object
Time                          object
dtype: object

In [None]:
li_data['Is Laundering'].describe()

count    6.924049e+06
mean     5.148722e-04
std      2.268495e-02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: Is Laundering, dtype: float64

In [None]:
li_data['Is Laundering'].value_counts()

0    6920484
1       3565
Name: Is Laundering, dtype: int64

In [None]:
#  Convert 'Date' to a numerical format
li_data['Date'] = pd.to_datetime(li_data['Date'])
li_data['Date'] = li_data['Date'].map(pd.Timestamp.timestamp).astype(int)

In [None]:
# If 'Time' is in a datetime format or string, convert to seconds since start of the day
li_data['Time'] = pd.to_datetime(li_data['Time'], format='%H:%M:%S').dt.time.apply(lambda x: x.hour * 3600 + x.minute * 60 + x.second)

In [None]:
li_data.dtypes

Timestamp             datetime64[ns]
From Bank                     object
Account                       object
To Bank                       object
Account.1                     object
Amount Received              float64
Receiving Currency            object
Amount Paid                  float64
Payment Currency              object
Payment Format                object
Is Laundering                  int64
Date                           int64
Time                           int64
dtype: object

In [None]:
li_data.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,Date,Time
0,2022-09-01 00:08:00,11,8000ECA90,11,8000ECA90,3195403.0,US Dollar,3195403.0,US Dollar,Reinvestment,0,1661990400,480
1,2022-09-01 00:21:00,3402,80021DAD0,3402,80021DAD0,1858.96,US Dollar,1858.96,US Dollar,Reinvestment,0,1661990400,1260
2,2022-09-01 00:00:00,11,8000ECA90,1120,8006AA910,592571.0,US Dollar,592571.0,US Dollar,Cheque,0,1661990400,0
3,2022-09-01 00:16:00,3814,8006AD080,3814,8006AD080,12.32,US Dollar,12.32,US Dollar,Reinvestment,0,1661990400,960
4,2022-09-01 00:00:00,20,8006AD530,20,8006AD530,2941.56,US Dollar,2941.56,US Dollar,Reinvestment,0,1661990400,0


In [None]:
li_data['Date'].describe()

count    6.924049e+06
mean     1.662322e+09
std      2.656369e+05
min      1.661990e+09
25%      1.662077e+09
50%      1.662336e+09
75%      1.662595e+09
max      1.663373e+09
Name: Date, dtype: float64

In [None]:
selected_columns = ['Time','Date','From Bank', 'Account', 'To Bank', 'Account.1',
       'Amount Received', 'Receiving Currency', 'Amount Paid',
       'Payment Currency', 'Payment Format', 'Is Laundering']

li_data[selected_columns].head()

Unnamed: 0,Time,Date,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,480,1661990400,11,8000ECA90,11,8000ECA90,3195403.0,US Dollar,3195403.0,US Dollar,Reinvestment,0
1,1260,1661990400,3402,80021DAD0,3402,80021DAD0,1858.96,US Dollar,1858.96,US Dollar,Reinvestment,0
2,0,1661990400,11,8000ECA90,1120,8006AA910,592571.0,US Dollar,592571.0,US Dollar,Cheque,0
3,960,1661990400,3814,8006AD080,3814,8006AD080,12.32,US Dollar,12.32,US Dollar,Reinvestment,0
4,0,1661990400,20,8006AD530,20,8006AD530,2941.56,US Dollar,2941.56,US Dollar,Reinvestment,0


In [None]:
li_data[selected_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6924049 entries, 0 to 6924048
Data columns (total 12 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Time                int64  
 1   Date                int64  
 2   From Bank           object 
 3   Account             object 
 4   To Bank             object 
 5   Account.1           object 
 6   Amount Received     float64
 7   Receiving Currency  object 
 8   Amount Paid         float64
 9   Payment Currency    object 
 10  Payment Format      object 
 11  Is Laundering       int64  
dtypes: float64(2), int64(3), object(7)
memory usage: 633.9+ MB


In [None]:
# Converting Categorical Data into numercial data
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()

columns_to_encode = ['From Bank','Account', 'To Bank', 'Account.1', 'Receiving Currency', 'Payment Currency','Payment Format']

for column in columns_to_encode:
    li_data[column] = LE.fit_transform(li_data[column])

In [None]:
df_t = li_data[selected_columns]
df_t.head()

Unnamed: 0,Time,Date,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,480,1661990400,12,950,12,900,3195403.0,12,3195403.0,12,5,0
1,1260,1661990400,19417,2772,10968,2446,1858.96,12,1858.96,12,5,0
2,0,1661990400,12,950,36,8407,592571.0,12,592571.0,12,3,0
3,960,1661990400,40281,9998,20531,8419,12.32,12,12.32,12,5,0
4,0,1661990400,901,10001,901,8422,2941.56,12,2941.56,12,5,0


In [None]:
# Applying Standard Scaler for Normalisation

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

columns_to_norm = ['Time','Date','From Bank', 'Account', 'To Bank', 'Account.1',
       'Amount Received', 'Receiving Currency', 'Amount Paid',
       'Payment Currency', 'Payment Format']


# Apply the StandardScaler to each column individually
for column in columns_to_norm:
    # Apply StandardScaler to each column
    df_t[column] = scaler.fit_transform(df_t[[column]])

    # Clip values to ensure they are within the range [-1, 1]
    df_t[column] = np.clip(df_t[column], -1, 1)

df_t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See t

Unnamed: 0,Time,Date,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,-1.0,-1.0,-0.642915,-1.0,-0.578747,-1.0,-0.001486,0.86201,-0.000959,0.855874,1.000000,0
1,-1.0,-1.0,0.486825,-1.0,0.849407,-1.0,-0.003003,0.86201,-0.003027,0.855874,1.000000,0
2,-1.0,-1.0,-0.642915,-1.0,-0.575619,-1.0,-0.002722,0.86201,-0.002645,0.855874,-0.009958,0
3,-1.0,-1.0,1.000000,-1.0,1.000000,-1.0,-0.003004,0.86201,-0.003028,0.855874,1.000000,0
4,-1.0,-1.0,-0.591159,-1.0,-0.462863,-1.0,-0.003002,0.86201,-0.003026,0.855874,1.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6924044,1.0,1.0,1.000000,1.0,1.000000,1.0,-0.003004,-1.00000,-0.003028,-1.000000,-1.000000,0
6924045,1.0,1.0,-0.546330,1.0,-0.473552,1.0,-0.003004,-1.00000,-0.003028,-1.000000,-1.000000,0
6924046,1.0,1.0,-0.546330,1.0,-0.473552,1.0,-0.003004,-1.00000,-0.003028,-1.000000,-1.000000,0
6924047,1.0,1.0,-0.598145,1.0,-0.355060,1.0,-0.003004,-1.00000,-0.003028,-1.000000,-1.000000,0


In [None]:
# Save the DataFrame 'df_t' to a CSV file
df_t.to_csv('LI_IBM_norm_Standard_scaler.csv', index=False)

In [None]:
df_normt = pd.read_csv('LI_IBM_norm_Standard_scaler.csv')
df_normt.head()

Unnamed: 0,Time,Date,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,-1.0,-1.0,-0.642915,-1.0,-0.578747,-1.0,-0.001486,0.86201,-0.000959,0.855874,1.0,0
1,-1.0,-1.0,0.486825,-1.0,0.849407,-1.0,-0.003003,0.86201,-0.003027,0.855874,1.0,0
2,-1.0,-1.0,-0.642915,-1.0,-0.575619,-1.0,-0.002722,0.86201,-0.002645,0.855874,-0.009958,0
3,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-0.003004,0.86201,-0.003028,0.855874,1.0,0
4,-1.0,-1.0,-0.591159,-1.0,-0.462863,-1.0,-0.003002,0.86201,-0.003026,0.855874,1.0,0


In [None]:
df_normt.shape

(6924049, 12)

In [None]:
# Using Conditional  List Comphersion to select the feature columns

feature_cols = [item for item in df_normt.columns if item != 'Is Laundering']
feature_cols

['Time',
 'Date',
 'From Bank',
 'Account',
 'To Bank',
 'Account.1',
 'Amount Received',
 'Receiving Currency',
 'Amount Paid',
 'Payment Currency',
 'Payment Format']

In [None]:
# Applying SMOTE to balance the dataset

from imblearn.over_sampling import SMOTE

X= df_normt[feature_cols]
y= df_normt['Is Laundering']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
y_resampled.value_counts()

1    6920484
0    6920484
Name: Is Laundering, dtype: int64

In [None]:
# Spliting data into training and testing samples

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=0)

In [None]:
# Loading the RF Blended model

import joblib

rf_loaded = joblib.load('random_forest_model.pkl')

In [None]:
#fit the classifier to the data
rf_loaded_model = rf_loaded.fit(X_train, y_train)

#use the resulting model to predict value on the test set
rf_loaded_predictions_IBM = rf_loaded.predict(X_test)


#performance metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

print(confusion_matrix(y_test, rf_loaded_predictions_IBM), '\n')
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_loaded_predictions_IBM))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, rf_loaded_predictions_IBM))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rf_loaded_predictions_IBM)), '\n')

print(classification_report(y_test, rf_loaded_predictions_IBM))
print("Accuracy:", accuracy_score(y_test, rf_loaded_predictions_IBM))
print("AUC:", roc_auc_score(y_test, rf_loaded_predictions_IBM))

[[1726868    3387]
 [   1308 1728679]] 

Mean Absolute Error: 0.0013568415157090168
Mean Squared Error: 0.0013568415157090168
Root Mean Squared Error: 0.03683532972173613 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1730255
           1       1.00      1.00      1.00   1729987

    accuracy                           1.00   3460242
   macro avg       1.00      1.00      1.00   3460242
weighted avg       1.00      1.00      1.00   3460242

Accuracy: 0.998643158484291
AUC: 0.9986432050107897


In [None]:
#performance metrics

from sklearn.model_selection import cross_val_predict

#predicted values
y_pred_IBM = cross_val_predict(rf_loaded_model, X_resampled, y_resampled, cv=5)

print(confusion_matrix(y_resampled, y_pred_IBM), '\n')
print('Mean Absolute Error:', metrics.mean_absolute_error(y_resampled, y_pred_IBM))
print('Mean Squared Error:', metrics.mean_squared_error(y_resampled, y_pred_IBM))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_resampled, y_pred_IBM)), '\n')

print(classification_report(y_resampled, y_pred_IBM))
print("Accuracy:", accuracy_score(y_resampled, y_pred_IBM))
print("AUC:", roc_auc_score(y_resampled, y_pred_IBM))

NameError: name 'y_pred' is not defined

In [None]:
print(confusion_matrix(y_resampled, y_pred_IBM), '\n')
print('Mean Absolute Error:', metrics.mean_absolute_error(y_resampled, y_pred_IBM))
print('Mean Squared Error:', metrics.mean_squared_error(y_resampled, y_pred_IBM))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_resampled, y_pred_IBM)), '\n')

print(classification_report(y_resampled, y_pred_IBM))
print("Accuracy:", accuracy_score(y_resampled, y_pred_IBM))
print("AUC:", roc_auc_score(y_resampled, y_pred_IBM))

[[5490507 1429977]
 [   4979 6915505]] 

Mean Absolute Error: 0.1036745406824147
Mean Squared Error: 0.1036745406824147
Root Mean Squared Error: 0.32198531128362784 

              precision    recall  f1-score   support

           0       1.00      0.79      0.88   6920484
           1       0.83      1.00      0.91   6920484

    accuracy                           0.90  13840968
   macro avg       0.91      0.90      0.90  13840968
weighted avg       0.91      0.90      0.90  13840968

Accuracy: 0.8963254593175853
AUC: 0.8963254593175852


In [None]:

# Save predictions as a CSV file
pd.DataFrame(rf_loaded_predictions_IBM, columns=['Predictions- BlendingRF_IBM']).to_csv('Predictions- BlendingRF_IBM.csv', index=False)

In [None]:

# Save predictions as a CSV file
pd.DataFrame(y_pred_IBM, columns=['Predictions- BlendingRF_IBM_CV']).to_csv('Predictions- BlendingRF_IBM_CV.csv', index=False)