In [1]:
# Load libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load data

df = pd.read_csv('Fraud_data.csv')
df.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,date
0,2,2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,2021-11-27
1,3,3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,2021-01-01
2,251,251,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,1,0,2021-03-28


In [3]:
# Split X and y

features = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']

X = df[features]
y = df['isFraud']

In [4]:
# Method 1: Train-Test Split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_preds = model.predict(X_test)

print(average_precision_score(y_preds, y_test))

0.937078068417773


In [5]:
# Method 2: K-Fold Cross Validation

from sklearn.model_selection import KFold

kf = KFold(n_splits=2, shuffle = True, random_state = 42)
kf. get_n_splits(X)

folds = {}
scores = []

for train, test in kf.split(X):
    
    # Fold
    fold_number = 1
    
    # Store fold number
    folds [fold_number] = (df.iloc[train], df.iloc[test])
    
    # Create training and testing sets
    X_train = df.iloc[train][features]
    y_train = df.iloc[train]['isFraud']
    X_test = df.iloc[test][features]
    y_test = df.iloc[test]['isFraud']

    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    y_preds = model.predict (X_test)
    
    pr_auc = average_precision_score(y_preds, y_test)
    scores.append(pr_auc)
    
    fold_number += 1

np.mean(scores)

0.9220700066942599

In [6]:
# Also we can calculate score via cross_val_score

from sklearn.model_selection import cross_val_score

model = RandomForestClassifier()
scores = cross_val_score(model, X, y, scoring='accuracy', cv=kf, n_jobs=-1)

print(np.mean(scores))

0.9936698935802835


In [7]:
# Method 3: Leave One Out Cross Validation (LOOCV)

from sklearn.model_selection import LeaveOneOut 
from sklearn.metrics import accuracy_score 
from sklearn.utils import shuffle

df = shuffle(df)
X = df[features]
y = df['isFraud']

loo = LeaveOneOut()
loo.get_n_splits(X)

all_preds = []

for train_index, test_index in loo.split(X[8000:8300]) :
#    print ("TRAIN:", train_index, "TEST:", test_index)
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                                                  
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    y_preds = model.predict(X_test)
    
    correct = y_preds[0] == y_test.values[0]
    
    all_preds.append(correct)

np.mean(all_preds)

0.9766666666666667

In [8]:
# Method 4: Date/Time Train-Test Split 

DATE = '2021-12-31'

train_df = df[df['date'] < DATE].copy()
test_df = df[df['date'] >= DATE].copy()

X_train = train_df[features]
X_test = test_df[features]

y_train = train_df['isFraud']
y_test = test_df['isFraud']


model = RandomForestClassifier()

model.fit(X_train, y_train)
y_preds = model.predict(X_test)

print(average_precision_score(y_preds, y_test))

0.9266947146761165


In [9]:
# Method 5: Sliding Window/Time Series K-Fold

## Jan 1 to Jan 30 2022 - Training Set, Feb 1 to Feb 30 - Testing Set
## Jan 2 to Feb 2 - Training, Feb 2 to Feb 30 - Testing Set 
## Jan 3 to Feb 3 - Training, Feb 3 to Feb 30 - Testing Set
## Window of 30 days -> Sliding it across time

from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit()

all_scores = []

for train_index, test_index in tscv.split(X):
#     print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = RandomForestClassifier()

    model.fit(X_train, y_train)
    y_preds = model.predict(X_test)

    pr_auc = average_precision_score(y_preds, y_test)
    
    all_scores.append(pr_auc)
    
    
print(all_scores)

[0.8640978789556163, 0.9078894606032781, 0.929579424475134, 0.9293334032300601, 0.9307670955133693]


In [10]:
# Method 6: Expanding Window

## Jan 1 to Jan 30 2022 - Training Set, Feb 1 to Feb 30 - Testing Set - 30 days of training data
## Jan 1 to Feb 2 - Training, Feb 2 to Feb 30 - Testing Set - 31 days of training data
## Jan 1 to Feb 3 - Training, Feb 3 to Feb 30 - Testing Set - 31 days of training data
## Training data increases over time

from sklearn.model_selection import TimeSeriesSplit

class ExpandingWindowCV:
    def fit(self, date_col, date_range = None, custom_range = None):
        self.date_col = date_col
        self.date_range = date_range
        self.custom_range = custom_range
        
        if date_range is not None and custom_range is not None:
            raise ValueError("Date Range and Custom Range both cannot be None.")
    
    def split(self, df):
        if self.date_range is None:         
            dates = list(set(df[self.date_col].astype(str).values))
        
        if self.date_range is not None:
            dates = pd.date_range(start=self.date_range[0], end=self.date_range[1])
            dates = [str(d.date()) for d in dates]
        
        if self.custom_range is not None:
            dates = self.custom_range
            
        for d in dates:
            df_train = df[df[self.date_col].astype(str) <= d].copy()
            df_test = df[df[self.date_col].astype(str) > d].copy()
            yield df_train, df_test
            
ew = ExpandingWindowCV()
ew.fit(date_col = 'date', date_range = ['2022-01-02','2022-01-08'])
ew.split(df)

<generator object ExpandingWindowCV.split at 0x7fc380b29c80>

In [11]:
all_scores = []

for train_df, test_df in ew.split(df):
    X_train = train_df[features]
    X_test = test_df[features]

    y_train = train_df['isFraud']
    y_test = test_df['isFraud']


    model = RandomForestClassifier()

    model.fit(X_train, y_train)
    y_preds = model.predict(X_test)

    pr_auc = average_precision_score(y_preds, y_test)
    
    all_scores.append(pr_auc)
    
all_scores

[0.9252896540778948,
 0.9257400752205791,
 0.9241408931790137,
 0.9263279581758166,
 0.9261284824305607,
 0.9240985945903226,
 0.9203492956477464]

In [12]:
# Method 7: Monte Carlo Cross Validation

from sklearn.model_selection import ShuffleSplit

rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
rs.get_n_splits(df)

all_scores = []
for train_index, test_index in rs.split(df):
#     print("TRAIN:", train_index, "TEST:", test_index)

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = RandomForestClassifier()

    model.fit(X_train, y_train)
    y_preds = model.predict(X_test)

    pr_auc = average_precision_score(y_preds, y_test)
    
    all_scores.append(pr_auc)

all_scores

[0.9379626251061677,
 0.9261918463573187,
 0.9337765077816722,
 0.9240854814387397,
 0.9341527463142527]