In [None]:

import kagglehub

# Fetch the IBM AML transactions dataset through kagglehub; the call's return
# value is kept in the *_path variable below.
ealtman2019_ibm_transactions_for_anti_money_laundering_aml_path = kagglehub.dataset_download(
    'ealtman2019/ibm-transactions-for-anti-money-laundering-aml'
)

print('Data source import complete.')


# **IMPORT STATEMENTS**

In [None]:
#IMPORT STATEMENTS
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# **LOAD THE DATASET**

In [None]:
# Locate the CSV under the directory returned by kagglehub in the first cell
# rather than a hard-coded /kaggle/input path, so the notebook also runs where
# that absolute path does not exist (local machine, Colab, ...).
DATA_FILE = Path(ealtman2019_ibm_transactions_for_anti_money_laundering_aml_path) / "HI-Small_Trans.csv"
df = pd.read_csv(DATA_FILE)

In [None]:
# Preview the first rows to check that the CSV parsed as expected.
df.head()

In [None]:
# Print the frame summary (column names, dtypes, memory usage).
df.info()

# **DROP THE DUPLICATES**

In [None]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [None]:
# (rows, columns) remaining after duplicate removal.
df.shape

# **Data Distribution**

In [None]:
# Absolute number of rows for each "Is Laundering" value.
df["Is Laundering"].value_counts()

In [None]:
# Share of rows per "Is Laundering" value, expressed in percent.
df["Is Laundering"].value_counts(normalize=True).mul(100)

In [None]:
# Show the class balance of "Is Laundering" two ways: absolute counts (bar)
# and relative share (pie). Explicit axes are used instead of the implicit
# pyplot state so each panel can be titled and labelled on its own.
target_count = df["Is Laundering"].value_counts()

fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(10, 4))

sns.barplot(x=target_count.index, y=target_count.values, ax=ax_bar)
ax_bar.set(title="Transactions per class", xlabel="Is Laundering", ylabel="Count")

# autopct prints the percentage on each wedge so the minority share is readable.
ax_pie.pie(target_count.values, labels=target_count.index, autopct="%1.2f%%")
ax_pie.set_title("Share of transactions per class")

fig.tight_layout();

# **Deal with Timestamps**

In [None]:
# Split "Timestamp" into Date, Day (weekday name) and Time columns, then drop
# the original column.
# The guard makes the cell safe to re-run: once "Timestamp" has been replaced,
# running the cell again is a no-op instead of raising KeyError.
if "Timestamp" in df.columns:
    # Parse once and reuse the parsed series for all three derived columns.
    timestamps = pd.to_datetime(df["Timestamp"])

    df = df.assign(
        Date=timestamps.dt.date,
        Day=timestamps.dt.day_name(),
        Time=timestamps.dt.time,
    ).drop(columns=["Timestamp"])

In [None]:
# Spot-check the first two rows after the timestamp transformation.
df.head(2)

# **Define the X and Y**

In [None]:
# Target vector: the laundering flag. Feature matrix: every other column.
y = df['Is Laundering']
X = df.drop(columns='Is Laundering')

# **Undersampling to Deal with the Imbalanced Dataset**

In [None]:
# Define the undersampler.
from imblearn.under_sampling import RandomUnderSampler

# Seeded so the same rows are sampled on every run; without a seed the
# resampled dataset (and every metric computed from it) changes between runs.
undersampler = RandomUnderSampler(random_state=42)

In [None]:
# Resample X and y with the undersampler; the original X and y are overwritten.
# NOTE(review): this runs before the train/test split below, so the held-out
# test set is resampled as well. Metrics computed on it will not reflect the
# original class ratio of the data - consider splitting first and resampling
# only the training portion.
X,y=undersampler.fit_resample(X,y)

# **Visualizing the Balanced Dataset**

In [None]:
# Bar chart of the class counts after resampling, drawn on an explicit axes.
new_distribution = y.value_counts()
fig, ax = plt.subplots(figsize=(5, 3))
sns.barplot(x=new_distribution.index, y=new_distribution.values, ax=ax)

# **SPLITTING**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation with a fixed seed.
# stratify=y keeps the class proportions identical in the train and test
# splits instead of leaving them to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# **ONE HOT ENCODING with PIPELINE**

In [None]:
# Columns that are one-hot encoded before being passed to the estimator.
categorical = [
    'From Bank',
    'Account',
    'To Bank',
    'Receiving Currency',
    'Payment Currency',
    'Payment Format',
    'Day',
]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder wrapped in a single-step pipeline.
# handle_unknown='ignore' encodes categories unseen during fit as an all-zero
# row. `drop='first'` was removed: combined with 'ignore', the dropped
# category is also encoded as all zeros, so unseen values become
# indistinguishable from it (and older scikit-learn releases reject the
# combination outright). Dropping a level is not required here.
ohe = Pipeline([('Encoder', OneHotEncoder(handle_unknown='ignore'))])

In [None]:
from sklearn.compose import ColumnTransformer

# Apply the one-hot pipeline to the categorical columns.
# remainder='drop' is ColumnTransformer's default, spelled out here because it
# matters: every column NOT listed in `categorical` is discarded and never
# reaches the estimator.
# NOTE(review): confirm that dropping the remaining columns is intended.
transformer = ColumnTransformer(
    transformers=[('One Hot Encoding', ohe, categorical)],
    remainder='drop',
)

In [None]:
from xgboost import XGBClassifier

# Full modelling pipeline: preprocessing followed by an XGBoost classifier
# with default hyper-parameters. The undersampler is not a step of this
# pipeline.
model = Pipeline(
    steps=[
        ('Transformer', transformer),
        ('Estimator', XGBClassifier()),
    ]
)

# **Fitting the model**

In [None]:
# Fit the preprocessing step and the classifier on the training split.
model.fit(X_train,y_train)

# **PREDICTION**

In [None]:
# Predict class labels for the held-out test split.
y_pred = model.predict(X_test)

# **EVALUATION**

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of test-set predictions, drawn as an annotated heatmap.
# sns.heatmap returns the axes it drew on, which is then labelled directly.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
ax.set_title("Confusion Matrix:")
ax.set_ylabel("Actual Values")
ax.set_xlabel("Predicted Values")

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split, printed under a header.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Headline test-set metrics, each scaled to a percentage.
for label, metric in (
    ("Accuracy Score:", accuracy_score),
    ("Precision Score:", precision_score),
    ("Recall Score:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_test, y_pred) * 100)