In [None]:
import pandas as pd
import numpy as np
import psycopg2
from dotenv import load_dotenv
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE



# Load DB_HOST / DB_NAME / DB_USER / DB_PASSWORD from a local .env file
# into the process environment before they are read below.
load_dotenv()

# Value written into time_since_last_txn wherever the database returned NULL.
# NOTE(review): presumably NULL means "sender has no earlier transaction";
# confirm that 999 cannot collide with a real measured value.
TIME_SINCE_LAST_TXN_FILL = 999

# One row per transaction_features record; the LEFT JOIN keeps transactions
# that have no fraud_labels row, and COALESCE turns their missing labels
# into FALSE.
query = """
SELECT
    f.sender_txn_count_24h,
    f.sender_txn_count_1h,
    f.sender_avg_amount_24h,
    f.time_since_last_txn,
    f.receiver_txn_count_24h,
    f.amount_to_sender_avg_ratio,
    f.balance_drain_ratio,
    f.amount_change_ratio,
    f.is_time_compressed,
    f.is_new_sender,
    f.is_transfer_or_cashout,

    /* Labels */
    COALESCE(l.is_flagged, FALSE) AS is_flagged,
    COALESCE(l.is_fraud, FALSE)   AS is_fraud

FROM transaction_features f
LEFT JOIN fraud_labels l
ON f.transaction_id = l.transaction_id;

"""

conn = psycopg2.connect(
    host=os.getenv("DB_HOST"),
    database=os.getenv("DB_NAME"),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD")
)

# Close the connection even when the query fails; previously an exception
# in read_sql skipped conn.close() and left the connection open.
# NOTE(review): pandas documents SQLAlchemy connectables as the supported
# input for read_sql; passing a raw psycopg2 connection is likely what
# produces the warning captured under this cell — confirm.
try:
    df = pd.read_sql(query, conn)
finally:
    conn.close()


# Handle NULLs
df["time_since_last_txn"] = df["time_since_last_txn"].fillna(TIME_SINCE_LAST_TXN_FILL)


# Predictors are every selected column except the is_fraud target.
X = df.drop(columns=["is_fraud"])
y = df["is_fraud"]


  df = pd.read_sql(query, conn)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Print the feature frame's column dtypes and memory footprint.
X.info()

# Encode the target as integer class codes.
# fit() only learns the classes and returns the encoder itself, so
# `y = le.fit(y)` rebound y to a LabelEncoder object instead of labels;
# fit_transform() learns the classes and returns the encoded label array.
le = LabelEncoder()
y = le.fit_transform(y)

<class 'pandas.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   sender_txn_count_24h        int64  
 1   sender_txn_count_1h         int64  
 2   sender_avg_amount_24h       float64
 3   time_since_last_txn         float64
 4   receiver_txn_count_24h      int64  
 5   amount_to_sender_avg_ratio  float64
 6   balance_drain_ratio         float64
 7   amount_change_ratio         float64
 8   is_time_compressed          bool   
 9   is_new_sender               bool   
 10  is_flagged                  bool   
dtypes: bool(3), float64(5), int64(3)
memory usage: 406.5 MB


In [10]:
# Rebuild the target series and the predictor frame from df: y is the
# is_fraud column, X is every other column.
# NOTE(review): is_flagged is selected from fraud_labels under the query's
# "Labels" comment yet stays in X as a feature — confirm it is available at
# scoring time and is not target leakage.
y = df["is_fraud"]
X = df.drop("is_fraud", axis="columns")

In [15]:
# Partition the feature columns by dtype family so each group can be
# preprocessed separately.
# select_dtypes("number") matches integer and float columns of any width;
# the previous exact 'int64' / 'float64' string comparison would leave e.g.
# an int32 or float32 column out of both lists.
num_col = X.select_dtypes(include="number").columns.tolist()
cat_col = X.select_dtypes(include="bool").columns.tolist()

print(num_col)
print(cat_col)
print(len(num_col))

['sender_txn_count_24h', 'sender_txn_count_1h', 'sender_avg_amount_24h', 'time_since_last_txn', 'receiver_txn_count_24h', 'amount_to_sender_avg_ratio', 'balance_drain_ratio', 'amount_change_ratio']
['is_time_compressed', 'is_new_sender', 'is_flagged']
8


In [None]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder

# Hold out 20% of the rows for testing. stratify=y keeps the class
# proportions of y the same in both splits; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply
# the same fitted scaler to both splits so no test-set statistics are used.
sc = StandardScaler().fit(X_train[num_col])

X_train[num_col] = sc.transform(X_train[num_col])
X_test[num_col] = sc.transform(X_test[num_col])


In [None]:
# Replace the columns listed in cat_col with ordinal codes. The encoder
# learns its categories from the training split only and is then applied
# unchanged to both splits.
enc = OrdinalEncoder().fit(X_train[cat_col])

X_train[cat_col] = enc.transform(X_train[cat_col])
X_test[cat_col] = enc.transform(X_test[cat_col])




In [27]:
# Oversample the training split only; X_test / y_test are left untouched.
# NOTE(review): plain SMOTE is applied to every column, including the
# ordinal-encoded columns in cat_col — presumably this synthesises
# non-binary values for them; confirm whether that is acceptable.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Logistic-regression baseline fitted on the resampled training data.
# NOTE(review): class_weight='balanced' is kept as written; it may be
# redundant if the resampled classes are already equal in size — confirm.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
).fit(X_train_res, y_train_res)

# Second probability column, i.e. the score for the second class in
# model.classes_.
y_prob = model.predict_proba(X_test)[:, 1]

# Print one classification report per candidate decision threshold.
# The comparison here is strict (>), unlike the >= used in the
# random-forest cells.
thresholds = [0.4, 0.42, 0.45, 0.48, 0.5]
for val in thresholds:
    y_pred = np.greater(y_prob, val).astype(int)
    print("Classification report for :", val, "\n" )
    print(classification_report(y_test, y_pred))

Classification report for : 0.4 

              precision    recall  f1-score   support

       False       1.00      0.77      0.87   1270881
        True       0.00      0.77      0.01      1643

    accuracy                           0.77   1272524
   macro avg       0.50      0.77      0.44   1272524
weighted avg       1.00      0.77      0.87   1272524

Classification report for : 0.42 

              precision    recall  f1-score   support

       False       1.00      0.82      0.90   1270881
        True       0.01      0.72      0.01      1643

    accuracy                           0.82   1272524
   macro avg       0.50      0.77      0.46   1272524
weighted avg       1.00      0.82      0.90   1272524

Classification report for : 0.45 

              precision    recall  f1-score   support

       False       1.00      0.87      0.93   1270881
        True       0.01      0.66      0.01      1643

    accuracy                           0.87   1272524
   macro avg       0.50 

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score

# Random forest trained on the SMOTE-resampled training data
# (X_train_res / y_train_res).
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=5,
    random_state=42,
).fit(X_train_res, y_train_res)

# Second probability column, i.e. the score for the second class in
# rf_model.classes_, evaluated on the untouched test split.
prob = rf_model.predict_proba(X_test)[:, 1]

# Print the threshold followed by its classification report; a row is
# predicted positive when its score is at or above the threshold.
for val in (0.05, 0.1, 0.2, 0.3, 0.4, 0.5):
    pred = np.greater_equal(prob, val).astype(int)
    print(val)
    print(classification_report(y_test, pred))

# Threshold-independent summary computed from the raw scores.
print("PR-AUC:", average_precision_score(y_test, prob))

0.05
              precision    recall  f1-score   support

       False       1.00      0.82      0.90   1270881
        True       0.01      1.00      0.01      1643

    accuracy                           0.82   1272524
   macro avg       0.50      0.91      0.46   1272524
weighted avg       1.00      0.82      0.90   1272524

0.1
              precision    recall  f1-score   support

       False       1.00      0.98      0.99   1270881
        True       0.06      1.00      0.12      1643

    accuracy                           0.98   1272524
   macro avg       0.53      0.99      0.56   1272524
weighted avg       1.00      0.98      0.99   1272524

0.2
              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1270881
        True       0.38      1.00      0.55      1643

    accuracy                           1.00   1272524
   macro avg       0.69      1.00      0.78   1272524
weighted avg       1.00      1.00      1.00   1272524

0.3
  

In [30]:
# Same forest configuration as the SMOTE run, but fitted on the original
# (non-resampled) training split. This rebinds rf_model and prob, so the
# SMOTE-trained forest and its scores are no longer reachable by name.
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=5,
    random_state=42,
).fit(X_train, y_train)

# Second probability column, i.e. the score for the second class in
# rf_model.classes_, evaluated on the test split.
prob = rf_model.predict_proba(X_test)[:, 1]

# Print the threshold followed by its classification report; a row is
# predicted positive when its score is at or above the threshold.
for val in (0.05, 0.1, 0.2, 0.3, 0.4, 0.5):
    pred = np.greater_equal(prob, val).astype(int)
    print(val)
    print(classification_report(y_test, pred))

# Threshold-independent summary computed from the raw scores.
print("PR-AUC:", average_precision_score(y_test, prob))

0.05
              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1270881
        True       0.63      1.00      0.77      1643

    accuracy                           1.00   1272524
   macro avg       0.81      1.00      0.88   1272524
weighted avg       1.00      1.00      1.00   1272524

0.1
              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1270881
        True       0.63      1.00      0.77      1643

    accuracy                           1.00   1272524
   macro avg       0.82      1.00      0.89   1272524
weighted avg       1.00      1.00      1.00   1272524

0.2
              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1270881
        True       0.64      0.99      0.78      1643

    accuracy                           1.00   1272524
   macro avg       0.82      0.99      0.89   1272524
weighted avg       1.00      1.00      1.00   1272524

0.3
  