In [28]:
import pandas as pd

In [29]:
import pandas as pd

# Load the transaction dataset into a DataFrame.
df = pd.read_csv("./realistic_fraud_dataset.csv")

# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None".
df.info()

# Relative frequency of each class in the target column.
print(df["is_fraud"].value_counts(normalize=True))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1508706 entries, 0 to 1508705
Data columns (total 15 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   sender_id            1508706 non-null  int64  
 1   recipient_id         1508706 non-null  int64  
 2   amount               1508706 non-null  float64
 3   time_of_day          1508706 non-null  float64
 4   day_of_week          1508706 non-null  int64  
 5   transaction_type     1508706 non-null  object 
 6   avg_amount_last_10   1508706 non-null  float64
 7   txn_count_last_24h   1508706 non-null  float64
 8   recipient_diversity  1508706 non-null  int64  
 9   is_known_recipient   1508706 non-null  int64  
 10  device_id            1508706 non-null  object 
 11  geo_distance_km      1508706 non-null  float64
 12  ip_change_flag       1508706 non-null  int64  
 13  is_fraud             1508706 non-null  int64  
 14  timestamp            1508706 non-null  object 
dty

In [30]:

# Peek at the first five rows to eyeball the column layout
print(df.head(n=5))



   sender_id  recipient_id  amount  time_of_day  day_of_week transaction_type  \
0          1          4658   44.63     1.233333            3         transfer   
1          1          1434   32.13     5.633333            5         purchase   
2          1          1434   18.57    14.916667            5         purchase   
3          1          1434   48.65    22.183333            5           refund   
4          1          3293   34.33     8.316667            6           refund   

   avg_amount_last_10  txn_count_last_24h  recipient_diversity  \
0           44.630000                 1.0                    1   
1           38.380000                 1.0                    2   
2           31.776667                 2.0                    2   
3           35.995000                 3.0                    2   
4           35.662000                 3.0                    3   

   is_known_recipient                             device_id  geo_distance_km  \
0                   0  c5114889-9184

In [32]:
# Summary statistics for every column, numeric and non-numeric alike
print(df.describe(include="all"))



           sender_id  recipient_id        amount   time_of_day   day_of_week  \
count   1.508706e+06  1.508706e+06  1.508706e+06  1.508706e+06  1.508706e+06   
unique           NaN           NaN           NaN           NaN           NaN   
top              NaN           NaN           NaN           NaN           NaN   
freq             NaN           NaN           NaN           NaN           NaN   
mean    2.483942e+03  2.497221e+03  1.196031e+02  1.200069e+01  2.999914e+00   
std     1.442511e+03  1.434675e+03  9.442770e+01  6.927429e+00  1.997293e+00   
min     1.000000e+00  1.000000e+00  1.000000e+00  0.000000e+00  0.000000e+00   
25%     1.233000e+03  1.271000e+03  5.501000e+01  6.000000e+00  1.000000e+00   
50%     2.477000e+03  2.479000e+03  1.006300e+02  1.200000e+01  3.000000e+00   
75%     3.729000e+03  3.731000e+03  1.601100e+02  1.801667e+01  5.000000e+00   
max     5.000000e+03  5.000000e+03  1.190580e+03  2.398333e+01  6.000000e+00   

       transaction_type  avg_amount_las

In [33]:
# Class distribution of the target.
# sep="" stops print() from inserting a space after the newline, which
# would otherwise indent the first line of the Series output.
print("Fraud distribution:\n", df["is_fraud"].value_counts(normalize=True), sep="")



Fraud distribution:
 is_fraud
0    0.901485
1    0.098515
Name: proportion, dtype: float64


In [34]:
# Count missing values per column.
# sep="" keeps the first row flush-left; print()'s default separator would
# indent it by one space and break the column alignment.
print("Missing values:\n", df.isnull().sum(), sep="")

Missing values:
 sender_id              0
recipient_id           0
amount                 0
time_of_day            0
day_of_week            0
transaction_type       0
avg_amount_last_10     0
txn_count_last_24h     0
recipient_diversity    0
is_known_recipient     0
device_id              0
geo_distance_km        0
ip_change_flag         0
is_fraud               0
timestamp              0
dtype: int64


In [31]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [35]:
# Drop identifier columns and the raw timestamp before modelling.
# errors="ignore" keeps this cell re-runnable: `df` is reassigned in place,
# so a second execution would otherwise raise KeyError because the columns
# are already gone.
df = df.drop(
    columns=["sender_id", "recipient_id", "device_id", "timestamp"],
    errors="ignore",
)

In [36]:
# Separate the target column from the predictor columns
y = df["is_fraud"]
X = df.drop(columns=["is_fraud"])


In [37]:
# Define categorical and numerical columns.
# time_of_day is a continuous float (values such as 1.233333, ranging from
# 0 to just under 24), not a category: one-hot encoding it would create one
# column per distinct value and make the encoder fail on any value not seen
# during fit. It is therefore left out of `categorical` and is picked up by
# `numerical` below.
categorical = ["transaction_type"]
numerical = [col for col in X.columns if col not in categorical]


In [38]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (drop="first" omits the first level of each feature)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical),
        ("cat", OneHotEncoder(drop="first"), categorical),
    ]
)

In [39]:
# Build model pipeline: preprocessing followed by a random forest.
# n_jobs=-1 grows the trees on all available cores to cut training time on
# the large training set; random_state still fixes the fitted result.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
])

In [40]:
# Hold out 20% of the rows for testing, stratified on the label so both
# partitions keep the same fraud rate; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Fit the preprocessing steps and the classifier on the training partition
pipeline.fit(X=X_train, y=y_train)


In [24]:
# Evaluate on the held-out test set.
# NOTE(review): the output recorded below this cell appears to come from an
# earlier run (its execution count precedes the data-load cell, and the
# reported support of 69,132 is not 20% of the 1,508,706 rows loaded above).
# Re-run after fitting, and treat a perfect score with suspicion (possible
# label leakage) — confirm before relying on these numbers.
y_pred = pipeline.predict(X_test)

# sep="" avoids print()'s default space after "\n", which would shift the
# first line of the report / matrix and break its alignment.
print("Classification Report:\n", classification_report(y_test, y_pred), sep="")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), sep="")

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     62306
           1       1.00      1.00      1.00      6826

    accuracy                           1.00     69132
   macro avg       1.00      1.00      1.00     69132
weighted avg       1.00      1.00      1.00     69132

Confusion Matrix:
 [[62306     0]
 [    0  6826]]


In [25]:
# Persist the whole pipeline (preprocessing + classifier) to disk
joblib.dump(value=pipeline, filename="fraud_model.pkl")
print("✅ Model saved as fraud_model.pkl")

✅ Model saved as fraud_model.pkl
