In [None]:
import pandas as pd
from IPython.display import display, Markdown

In [21]:
df = pd.read_csv('../data/SAML-D.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9504852 entries, 0 to 9504851
Data columns (total 12 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Time                    object 
 1   Date                    object 
 2   Sender_account          int64  
 3   Receiver_account        int64  
 4   Amount                  float64
 5   Payment_currency        object 
 6   Received_currency       object 
 7   Sender_bank_location    object 
 8   Receiver_bank_location  object 
 9   Payment_type            object 
 10  Is_laundering           int64  
 11  Laundering_type         object 
dtypes: float64(1), int64(3), object(8)
memory usage: 870.2+ MB


In [22]:
display(Markdown("## Exploratory Data Analysis (EDA)"))

## Exploratory Data Analysis (EDA)

In [7]:
df.describe()

Unnamed: 0,Sender_account,Receiver_account,Amount,Is_laundering
count,9504852.0,9504852.0,9504852.0,9504852.0
mean,5006619000.0,5006006000.0,8762.968,0.001038733
std,2885814000.0,2884763000.0,25614.95,0.03221263
min,9018.0,9018.0,3.73,0.0
25%,2513133000.0,2513219000.0,2143.688,0.0
50%,5001017000.0,5002572000.0,6113.72,0.0
75%,7505051000.0,7502397000.0,10458.46,0.0
max,9999987000.0,9999971000.0,12618500.0,1.0


In [12]:
df['Date'] = pd.to_datetime(df['Date'])
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S').dt.time
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9504852 entries, 0 to 9504851
Data columns (total 12 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   Time                    object        
 1   Date                    datetime64[ns]
 2   Sender_account          int64         
 3   Receiver_account        int64         
 4   Amount                  float64       
 5   Payment_currency        object        
 6   Received_currency       object        
 7   Sender_bank_location    object        
 8   Receiver_bank_location  object        
 9   Payment_type            object        
 10  Is_laundering           int64         
 11  Laundering_type         object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(7)
memory usage: 870.2+ MB


In [6]:
df.head()

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type
0,10:35:19,2022-10-07,8724731955,2769355426,1459.15,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits
1,10:35:20,2022-10-07,1491989064,8401255335,6019.64,UK pounds,Dirham,UK,UAE,Cross-border,0,Normal_Fan_Out
2,10:35:20,2022-10-07,287305149,4404767002,14328.44,UK pounds,UK pounds,UK,UK,Cheque,0,Normal_Small_Fan_Out
3,10:35:21,2022-10-07,5376652437,9600420220,11895.0,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_In
4,10:35:21,2022-10-07,9614186178,3803336972,115.25,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits


In [23]:
missing_data = df.isna().sum()
print(missing_data)

Time                      0
Date                      0
Sender_account            0
Receiver_account          0
Amount                    0
Payment_currency          0
Received_currency         0
Sender_bank_location      0
Receiver_bank_location    0
Payment_type              0
Is_laundering             0
Laundering_type           0
dtype: int64


In [33]:
# number of transactions per payment type
display(Markdown("# Number of transactions per payment type"))
transactions_per_payment_type = df['Payment_type'].value_counts()
print('\n')

# number of laundering transactions per payment type
display(Markdown("# Number of laundering transactions per payment type"))
laundering_transactions_per_payment_type = df[df['Is_laundering'] == 1].groupby('Payment_type').size()
transactions_per_payment_type, laundering_transactions_per_payment_type

# Number of transactions per payment type





# Number of laundering transactions per payment type

(Payment_type
 Credit card        2012909
 Debit card         2012103
 Cheque             2011419
 ACH                2008807
 Cross-border        933931
 Cash Withdrawal     300477
 Cash Deposit        225206
 Name: count, dtype: int64,
 Payment_type
 ACH                1159
 Cash Deposit       1405
 Cash Withdrawal    1334
 Cheque             1087
 Credit card        1136
 Cross-border       2628
 Debit card         1124
 dtype: int64)

In [16]:
sender_bank_location = df['Sender_bank_location'].value_counts()
print(sender_bank_location)
print("\n")
receiver_bank_location = df['Receiver_bank_location'].value_counts()
print(receiver_bank_location)

Sender_bank_location
UK             9183088
Turkey           20902
Switzerland      20503
Pakistan         20346
UAE              20081
Nigeria          20027
Spain            19391
Germany          19259
USA              19027
Italy            18895
France           18702
Japan            18468
Morocco          18437
Austria          18050
Mexico           17662
Albania          17648
India            17596
Netherlands      16770
Name: count, dtype: int64


Receiver_bank_location
UK             8864634
Pakistan         38980
Austria          38844
Albania          38783
Japan            38729
Morocco          38704
Germany          38540
France           38512
Mexico           38385
Nigeria          38272
Netherlands      37314
Spain            37288
India            36936
USA              36850
Italy            36517
Switzerland      36081
UAE              35897
Turkey           35586
Name: count, dtype: int64


In [18]:
countries_stats = df.groupby('Sender_bank_location').size() + df.groupby('Receiver_bank_location').size()
countries = countries_stats.sort_values(ascending=False)
print(countries)

Sender_bank_location
UK             18047722
Pakistan          59326
Nigeria           58299
Germany           57799
France            57214
Japan             57197
Morocco           57141
Austria           56894
Spain             56679
Switzerland       56584
Turkey            56488
Albania           56431
Mexico            56047
UAE               55978
USA               55877
Italy             55412
India             54532
Netherlands       54084
dtype: int64
