# Import Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imblearn_Pipeline
import joblib
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [3]:
# Read the three raw CSV exports from the working directory.
# map() preserves order, so the files are loaded as transactions -> alerts -> accounts.
transactions, alerts, accounts = map(
    pd.read_csv,
    ('transactions.csv', 'alerts.csv', 'accounts.csv'),
)

# Data analysis

In [3]:
# Peek at the first rows of every raw table.
print(transactions.head())
print(alerts.head())
print(accounts.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" after each summary.
transactions.info()
accounts.info()
alerts.info()

# Per-column count of missing values in the transactions table.
print(transactions.isnull().sum())

   TX_ID  SENDER_ACCOUNT_ID  RECEIVER_ACCOUNT_ID   TX_TYPE  TX_AMOUNT  \
0      1               6456                 9069  TRANSFER     465.05   
1      2               7516                 9543  TRANSFER     564.64   
2      3               2445                 9356  TRANSFER     598.94   
3      4               2576                 4617  TRANSFER     466.07   
4      5               3524                 1773  TRANSFER     405.63   

   TIMESTAMP  IS_FRAUD  ALERT_ID  
0          0     False        -1  
1          0     False        -1  
2          0     False        -1  
3          0     False        -1  
4          0     False        -1  
   ALERT_ID ALERT_TYPE  IS_FRAUD  TX_ID  SENDER_ACCOUNT_ID  \
0       193     fan_in      True     82               6976   
1       377      cycle      True    949               5776   
2       189     fan_in      True   6280               9999   
3       377      cycle      True   7999               1089   
4       130     fan_in      True  12975  

In [4]:
def min_merge(transactions, accounts):
    """Attach sender and receiver account attributes to each transaction.

    The accounts table is reduced to ACCOUNT_ID, COUNTRY and ACCOUNT_TYPE, then
    left-joined onto the transactions twice: once on SENDER_ACCOUNT_ID and once on
    RECEIVER_ACCOUNT_ID. Transactions whose account id has no match keep their row
    and get missing values in the corresponding account columns.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must contain SENDER_ACCOUNT_ID, RECEIVER_ACCOUNT_ID, TX_AMOUNT, TIMESTAMP
        and IS_FRAUD.
    accounts : pandas.DataFrame
        Must contain ACCOUNT_ID, COUNTRY and ACCOUNT_TYPE.

    Returns
    -------
    pandas.DataFrame
        An independent frame with the columns TX_AMOUNT, TIMESTAMP, SENDER_COUNTRY,
        RECEIVER_COUNTRY, SENDER_ACCOUNT_TYPE, RECEIVER_ACCOUNT_TYPE and IS_FRAUD.
        Neither input frame is modified.
    """
    accounts_min = accounts[['ACCOUNT_ID', 'COUNTRY', 'ACCOUNT_TYPE']]

    # Prefixing renames ACCOUNT_ID to SENDER_ACCOUNT_ID / RECEIVER_ACCOUNT_ID, which
    # are exactly the join keys present on the transactions side.
    sender = accounts_min.add_prefix('SENDER_')
    receiver = accounts_min.add_prefix('RECEIVER_')

    merged = (
        transactions
        .merge(sender, on='SENDER_ACCOUNT_ID', how='left')
        .merge(receiver, on='RECEIVER_ACCOUNT_ID', how='left')
    )

    keep_cols = ['TX_AMOUNT', 'TIMESTAMP', 'SENDER_COUNTRY', 'RECEIVER_COUNTRY', 'SENDER_ACCOUNT_TYPE', 'RECEIVER_ACCOUNT_TYPE', 'IS_FRAUD']

    # Return an explicit copy: callers add columns to the result, and assigning into a
    # bare column-subset selection can raise SettingWithCopyWarning on pandas versions
    # without copy-on-write.
    return merged[keep_cols].copy()

In [5]:
# Build the modelling frame from the raw tables.
df = min_merge(transactions, accounts)

# Relative frequency of each IS_FRAUD value, to show how imbalanced the label is.
class_balance = df['IS_FRAUD'].value_counts(normalize=True)
print(class_balance)

IS_FRAUD
False    0.998701
True     0.001299
Name: proportion, dtype: float64


# Feature Engineering

In [6]:
# Bucket edges and matching labels for TX_AMOUNT; the last bucket is open-ended.
amount_bins = [0, 100, 500, 1000, 5000, np.inf]
amount_labels = ['0-100', '100-500', '500-1000', '1000-5000', '5000+']

# Derive HOUR and AMOUNT_BIN, then drop the raw TIMESTAMP column.
# NOTE(review): pd.to_datetime is called without `unit`/`format`. If TIMESTAMP is an
# integer step counter (the preview rows show 0), pandas reads the values as nanoseconds
# since the epoch and HOUR would come out as 0 for every row - confirm the column's unit.
# NOTE(review): this cell is not re-runnable on its own, because TIMESTAMP is removed
# from `df` at the end.
df = (
    df.assign(
        HOUR=pd.to_datetime(df['TIMESTAMP']).dt.hour,
        AMOUNT_BIN=pd.cut(
            df['TX_AMOUNT'],
            bins=amount_bins,
            labels=amount_labels,
            include_lowest=True,
        ),
    )
    .drop(columns=['TIMESTAMP'])
)


# Target 

In [7]:
# X holds every column except the label; y is the IS_FRAUD label cast to int.
X, y = (
    df.drop(columns=['IS_FRAUD']),
    df['IS_FRAUD'].astype(int),
)

# Train Test Split

In [8]:
# Hold out 30% of the rows for evaluation. Stratifying on y keeps the fraud rate the
# same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Preprocessing

In [9]:
# Feature columns grouped by how they are encoded.
# 'LOG_TX_AMOUNT' used to be listed here but no cell ever creates that column, and the
# list itself was unused; it now names only existing columns and is wired into the
# transformer below.
numeric_cols = ['TX_AMOUNT', 'HOUR']
categorical_cols = ['SENDER_COUNTRY', 'RECEIVER_COUNTRY', 'SENDER_ACCOUNT_TYPE', 'RECEIVER_ACCOUNT_TYPE', 'AMOUNT_BIN']

# One-hot encode the categoricals (categories unseen during fit are encoded as all
# zeros) and forward the numeric columns unchanged. Listing the numeric columns
# explicitly yields the same output columns, in the same order, as the previous
# remainder='passthrough' did for this feature set.
# NOTE(review): the numeric columns are not scaled; SMOTE, KNN and SVC work on
# distances, so consider standardising them - confirm before changing results.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', 'passthrough', numeric_cols),
])

# Model Pipeline

In [10]:
# Candidate classifiers, keyed by the display name printed during evaluation.
# - Every estimator that accepts a seed is seeded with 42. CatBoost and LightGBM were
#   previously the only seedable models left unseeded.
# - `use_label_encoder` was dropped from XGBClassifier: the option is deprecated and
#   ignored by current XGBoost releases, and the labels are already 0/1 integers.
# NOTE(review): SVC(probability=True) may be very slow on a training set of this size
# after oversampling - confirm it is feasible.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_seed=42),
    "Light GBM": LGBMClassifier(random_state=42),
    "NaiveBayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
}



In [16]:
# Fit and score every candidate with the same preprocess -> SMOTE -> classifier pipeline.
for model_name, estimator in models.items():
    # Print the heading before fitting so progress is visible during slow fits.
    print(f"\n{model_name}:")

    steps = [
        ('preprocess', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', estimator),
    ]
    pipeline = imblearn_Pipeline(steps)
    pipeline.fit(X_train, y_train)

    # Score on the held-out split.
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}", "Classification Report:", report, sep="\n")



Random Forest:
Accuracy: 0.9796
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    396455
           1       0.06      0.95      0.11       516

    accuracy                           0.98    396971
   macro avg       0.53      0.96      0.55    396971
weighted avg       1.00      0.98      0.99    396971


Logistic Regression:
Accuracy: 0.7609
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.76      0.86    396455
           1       0.01      1.00      0.01       516

    accuracy                           0.76    396971
   macro avg       0.50      0.88      0.44    396971
weighted avg       1.00      0.76      0.86    396971


Decision Tree:
Accuracy: 0.9796
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    396455
           1       0.06      0.95      0.11       516

    accu

# Save the Final Model (joblib)

In [11]:

# Same three-stage pipeline as in the comparison loop, with KNN as the classifier.
final_steps = [
    ('preprocess', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', KNeighborsClassifier()),
]
final_model = imblearn_Pipeline(final_steps)

# Train on the training split only, then persist the whole fitted pipeline
# (preprocessing included) to disk. joblib.dump stays the last expression so the cell
# still displays the list of written file names.
final_model.fit(X_train, y_train)
joblib.dump(final_model, 'knn_model.pkl')


['knn_model.pkl']