EDA - Financial Fraud

In [1]:
import pandas as pd

from financial_fraud.io.hf import download_dataset_hf
from financial_fraud.config import REPO_ID, REVISION, TRANSACTION_LOG

In [2]:
offline = download_dataset_hf(
        repo_id=REPO_ID,
        filename=TRANSACTION_LOG,
        revision=REVISION,
    )

df = pd.read_parquet(offline)

df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0


Data types per feature

In [10]:
df.dtypes

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
dtype: object

NaN count per feature

In [9]:
print(df.isna().sum())

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64


Ranges for numeric features

In [11]:
print(df.select_dtypes(include="number").agg(["min", "max"]))


     step       amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
min     1         0.00            0.0            0.00             0.0   
max   355  92445516.64     43818855.3     43686616.33     355553416.3   

     newbalanceDest  isFraud  
min             0.0        0  
max     355553416.3        1  


Unique values in non numeric feature

In [12]:
df["type"].unique()


array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

Class imbalance

In [13]:
df["isFraud"].value_counts()


isFraud
0    5109921
1       3963
Name: count, dtype: int64

Unique entity count

In [15]:
print("unique nameOrig:", df["nameOrig"].nunique())

df["nameOrig"].value_counts().head(20)


unique nameOrig: 5107768


nameOrig
C400299098     3
C1677795071    3
C1784010646    3
C1065307291    3
C2098525306    3
C1999539787    3
C1976208114    3
C1530544995    3
C1925717042    2
C1377915835    2
C168273712     2
C409681213     2
C1160169686    2
C822433032     2
C975965020     2
C1527160232    2
C1716969671    2
C192337393     2
C575016270     2
C1643003822    2
Name: count, dtype: int64

In [16]:
print("unique nameDest:", df["nameDest"].nunique())

df["nameDest"].value_counts().head(20)

unique nameDest: 2178399


nameDest
C1286084959    110
C985934102     107
C665576141     105
C2083562754    100
C1590550415     99
C248609774      98
C451111351      98
C1789550256     97
C1360767589     96
C97730845       95
C392292416      92
C977993101      92
C1899073220     91
C1023714065     91
C306206744      89
C716083600      89
C909295153      88
C1782113663     87
C998351292      87
C1674899618     86
Name: count, dtype: int64

Entity analysis

In [4]:
def prior_seen_rate(df: pd.DataFrame, col: str, order_col: str = "step"):
    s = df[[order_col, col]].sort_values(order_col)[col]
    prior_seen = s.duplicated(keep="first")
    return prior_seen.mean(), prior_seen.sum(), len(s)

for c in ["nameOrig", "nameDest"]:
    rate, n, N = prior_seen_rate(df, c, "step")
    print(f"{c}: rows with prior history = {rate:.3%} ({n:,}/{N:,})")

nameOrig: rows with prior history = 0.120% (6,116/5,113,884)
nameDest: rows with prior history = 57.402% (2,935,485/5,113,884)


In [None]:


# df has columns: step (hour), nameDest, amount
d = df.sort_values(["nameDest", "step"]).copy()
d["gap_hours"] = d.groupby("nameDest")["step"].diff()

# How often are repeat dest hits within 24h?
within_24 = (d["gap_hours"] <= 24).mean()

# Per-dest: fraction of its repeats that happen within 24h
per_dest_within_24 = d.groupby("nameDest")["gap_hours"].apply(lambda s: (s <= 24).mean())
summary = per_dest_within_24.describe()

within_24, summary


(np.float64(0.4283061563383135),
 count    2.178399e+06
 mean     8.542994e-02
 std      2.179248e-01
 min      0.000000e+00
 25%      0.000000e+00
 50%      0.000000e+00
 75%      0.000000e+00
 max      9.726027e-01
 Name: gap_hours, dtype: float64)