In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# display settings
pd.set_option('display.max_columns', None)


In [4]:
df = pd.read_csv("../data/raw/Fraud_Data.csv")
df.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [5]:
# create time_since_signup

# Convert signup_time and purchase_time to datetime if not already
df['signup_time'] = pd.to_datetime(df['signup_time'])
df['purchase_time'] = pd.to_datetime(df['purchase_time'])

# Create time_since_signup in hours
df['time_since_signup'] = (df['purchase_time'] - df['signup_time']).dt.total_seconds() / 3600


why it helps: 
Fraudsters may make transactions very soon after signup. Unusually short intervals could be suspicious.

In [6]:
# Time Features

# Hour of the day
df['hour_of_day'] = df['purchase_time'].dt.hour

# Day of the week (0=Monday, 6=Sunday)
df['day_of_week'] = df['purchase_time'].dt.dayofweek


Why it helps: 
Fraudulent transactions may occur at odd hours or on specific days, unlike normal user behavior.

In [7]:
# Transaction Behavior

# Number of transactions per user
user_tx_counts = df.groupby('user_id')['purchase_time'].count()
df['num_transactions'] = df['user_id'].map(user_tx_counts)

# Transaction velocity (transactions per hour)
user_tx_hours = df.groupby('user_id')['time_since_signup'].max()  # time span for each user
df['tx_velocity'] = df['num_transactions'] / df['user_id'].map(user_tx_hours)


Why it helps: 
Rapid or multiple transactions in a short time can indicate fraudulent activity.

In [8]:
# Encoding & Scaling

from sklearn.preprocessing import StandardScaler

# Columns to scale
scale_cols = ['purchase_value', 'age', 'time_since_signup', 'hour_of_day', 'day_of_week', 'num_transactions', 'tx_velocity']

# Ensure only existing columns are selected
scale_cols = [col for col in scale_cols if col in df.columns]

# Apply StandardScaler
scaler = StandardScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])




In [9]:
import os

# Create directory if it doesn't exist
os.makedirs('data/processed', exist_ok=True)

# Save the file
df.to_csv('data/processed/fraud_data_processed.csv', index=False)


In [10]:
# Handle Class Imbalance

# Show class distribution
df['class'].value_counts(normalize=True)


class
0    0.906354
1    0.093646
Name: proportion, dtype: float64

In [11]:
# Show class distribution

# Split data first
from sklearn.model_selection import train_test_split

X = df.drop('class', axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [12]:
# Apply SMOTE to training data only

# Combine X_train and y_train for convenience
train_df = pd.concat([X_train, y_train], axis=1)

# Separate majority and minority classes
df_majority = train_df[train_df['class'] == 0]  # non-fraud
df_minority = train_df[train_df['class'] == 1]  # fraud

# Downsample majority class to match minority class size
df_majority_downsampled = df_majority.sample(n=len(df_minority), random_state=42)

# Combine minority class with downsampled majority class
train_balanced = pd.concat([df_minority, df_majority_downsampled])

# Shuffle the data
train_balanced = train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate back into features and target
X_train_res = train_balanced.drop('class', axis=1)
y_train_res = train_balanced['class']

# Show new class distribution
print("Balanced class distribution:")
print(y_train_res.value_counts())


Balanced class distribution:
class
0    11321
1    11321
Name: count, dtype: int64


In [13]:
# Show class distribution after

pd.Series(y_train_res).value_counts()


class
0    11321
1    11321
Name: count, dtype: int64

Why SMOTE:

Balances the minority class (fraud) by synthetically generating new samples, improving model learning.
Never apply to test data, because it must remain realistic for evaluation.

class distribution before:
class
0    0.906354
1    0.093646
Name: proportion, dtype: float64
Shows the original imbalance (90% non-fraud, 10% fraud)

class distribution after:
class
0    11321
1    11321
Name: count, dtype: int64
Shows a perfectly balanced training set

Original training data was highly imbalanced (90% non-fraud, 10% fraud).
To improve model learning on the minority class, we applied random undersampling of the majority class.
This creates a balanced training set while keeping the test set realistic, allowing the model to detect fraud more effectively.