# Train a simple model for fraud detection

In this notebook, we train a simple fraud detection model that we will feed into our fraud detection app.

## Set up

#### User-specified parameters

In [1]:
# Name of the course material folder; it is interpolated into the Google Drive
# path when the notebook runs in Colab.
python_material_folder_name = "python-material"

### Import libraries

In [2]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Check if in Google Colab environment and resolve the Python material parent folder.
# Only the import is guarded: a missing `google.colab` package means we are running
# locally, whereas any other failure (e.g. the Drive mount) is allowed to surface
# instead of being silently mistaken for a local run.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    # If working locally on Jupyter Notebook, parent folder is one folder up (assuming you are using the folder structure shared at the beginning of the course)
    path_python_material = ".."
else:
    # Mount drive
    drive.mount('/content/drive')
    # Set up path to Python material parent folder
    # If unsure, print current directory path by executing the following in a new cell:
    # !pwd
    path_python_material = rf"drive/MyDrive/{python_material_folder_name}"
    IN_COLAB = True

  from pandas.core import (


In [3]:
if IN_COLAB == True:
  !pip install fastapi uvicorn

## Data import

In [4]:
# Read the synthetic transaction data that the quick fraud model is built on
import pandas as pd

transactions_csv = f"{path_python_material}/data/1-raw/dsif11-fraud-detection/synthetic_transaction_data.csv"
df = pd.read_csv(transactions_csv)

# Preview the first rows
df.head()


Unnamed: 0,transaction_amount,transaction_date,transaction_time,customer_age,customer_balance,is_fraud
0,46.926809,2023-02-09,55817,43,9143.802446,0
1,301.012143,2023-01-28,9356,60,3126.627558,0
2,131.674569,2023-11-13,33099,33,4316.836831,0
3,91.294255,2023-03-26,3190,18,4235.945356,0
4,16.962487,2023-12-07,13332,49,5491.237144,0


In [5]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(100000, 6)

## Simple fraud detection model

### Model training using sklearn pipelines

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import pickle

In [7]:
# Two-stage pipeline: standardise the features, then fit a logistic regression
# with scikit-learn's default settings.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('model', LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [8]:
# Model inputs and the fraud label
features = ['transaction_amount', 'customer_age', 'customer_balance']
X, y = df[features], df['is_fraud']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y; given how rare the positive class
# is in the recorded outputs, consider stratify=y -- confirm intent.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler and the classifier together on the training portion
pipeline.fit(X_train, y_train)

#### Dumping i.e. saving model as binary file

In [9]:
model_id = "lr1"

# Serialise the whole pipeline (scaler + model) into the models folder
pipeline_path = f"{path_python_material}/models/{model_id}-pipeline.pkl"
with open(pipeline_path, "wb") as pipeline_file:
    pickle.dump(pipeline, pipeline_file)

#### Loading the previously saved model, i.e. reading the model back from the binary file

In [10]:
# Restore the pipeline from the pickle file written for this model_id
saved_pipeline_path = f"{path_python_material}/models/{model_id}-pipeline.pkl"
with open(saved_pipeline_path, "rb") as pipeline_file:
    loaded_pipeline = pickle.load(pipeline_file)

# Score the held-out set with the restored pipeline and show the predicted labels
predictions = loaded_pipeline.predict(X_test)
predictions

array([0, 0, 0, ..., 0, 0, 0])

### Model evaluation

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Second column of predict_proba, i.e. the score for the class listed second in
# the classifier's classes_ (presumably the fraud label 1 -- confirm). It feeds the
# threshold-independent ROC-AUC.
fraud_probabilities = loaded_pipeline.predict_proba(X_test)[:, 1]

# Calculate metrics
# zero_division=0 makes the degenerate case explicit: when the model predicts no
# positive samples, precision (and therefore F1) is undefined. scikit-learn would
# otherwise emit an UndefinedMetricWarning and fall back to the same value of 0.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, zero_division=0)
recall = recall_score(y_test, predictions, zero_division=0)
f1 = f1_score(y_test, predictions, zero_division=0)
roc_auc = roc_auc_score(y_test, fraud_probabilities)

# Display the confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(y_test, predictions)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(f'ROC-AUC: {roc_auc}')
print(f'Confusion Matrix:\n{cm}')


Accuracy: 0.98875
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
ROC-AUC: 0.8173338952100013
Confusion Matrix:
[[19775     0]
 [  225     0]]


  _warn_prf(average, modifier, msg_start, len(result))
