In [73]:

import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda x: f"{x:,.4f}")


In [74]:

data_path = "C://Users//aditi//OneDrive//Desktop//fraud_detection//fraud_data - Sheet 1.csv"
df = pd.read_csv(data_path)
df.head()


Unnamed: 0,TransactionID,Amount,Time,Location,MerchantCategory,CardHolderAge,IsFraud
0,1,375.17,47605,Houston,Travel,18.0,0
1,2,950.76,38088,Los Angeles,Electronics,28.0,0
2,3,732.26,78752,Miami,Travel,20.0,0
3,4,599.06,55284,New York,Groceries,69.0,0
4,5,156.86,57043,New York,Groceries,79.0,0


In [75]:
df = df.drop(columns=["TransactionID"], errors="ignore")
print(df.columns.tolist())

['Amount', 'Time', 'Location', 'MerchantCategory', 'CardHolderAge', 'IsFraud']


In [76]:

fraud_rate = y.mean()
fraud_rate


0.054

In [77]:
TARGET = "IsFraud"
X = df.drop(columns=[TARGET])
y = df[TARGET]

print("X shape:", X.shape)
print("Fraud rate:", y.mean())

X shape: (500, 5)
Fraud rate: 0.054


In [78]:
X["Time"] = pd.to_datetime(X["Time"], errors="coerce")
X["Hour"] = X["Time"].dt.hour
X["IsNight"] = X["Hour"].isin([0,1,2,3,4,5]).astype(int)
X = X.drop(columns=["Time"])

X.head()

Unnamed: 0,Amount,Location,MerchantCategory,CardHolderAge,Hour,IsNight
0,375.17,Houston,Travel,18.0,0,1
1,950.76,Los Angeles,Electronics,28.0,0,1
2,732.26,Miami,Travel,20.0,0,1
3,599.06,New York,Groceries,69.0,0,1
4,156.86,New York,Groceries,79.0,0,1


In [79]:
X["Amount"] = np.log1p(X["Amount"])
print("Amount after log transform:")
X["Amount"].describe()

Amount after log transform:


count   475.0000
mean      5.9481
std       1.0753
min       1.9544
25%       5.5004
50%       6.2535
75%       6.6554
max       9.1791
Name: Amount, dtype: float64

In [80]:
X["AgeGroup"] = pd.cut(
    X["CardHolderAge"],
    bins=[18,25,35,50,65,100],
    labels=["18-25","26-35","36-50","51-65","65+"]
)

X.head()

Unnamed: 0,Amount,Location,MerchantCategory,CardHolderAge,Hour,IsNight,AgeGroup
0,5.93,Houston,Travel,18.0,0,1,
1,6.8583,Los Angeles,Electronics,28.0,0,1,26-35
2,6.5975,Miami,Travel,20.0,0,1,18-25
3,6.397,New York,Groceries,69.0,0,1,65+
4,5.0617,New York,Groceries,79.0,0,1,65+


In [81]:
def bucket_rare_categories(series, min_freq=5):
    vc = series.value_counts()
    rare = vc[vc < min_freq].index
    return series.replace(rare, "Other")

X["Location"] = bucket_rare_categories(X["Location"])
X["MerchantCategory"] = bucket_rare_categories(X["MerchantCategory"])

print("Location value counts:")
print(X["Location"].value_counts())

Location value counts:
Location
Miami          104
Chicago        104
Los Angeles     97
New York        90
Houston         80
Name: count, dtype: int64


In [82]:
numeric_features = X.select_dtypes(include=["int64","float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object","category"]).columns.tolist()

numeric_features, categorical_features

(['Amount', 'CardHolderAge'], ['Location', 'MerchantCategory', 'AgeGroup'])

In [83]:
numeric_imputer = SimpleImputer(strategy="median")
categorical_imputer = SimpleImputer(strategy="most_frequent")

In [84]:
categorical_encoder = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
    min_frequency=5
)


In [85]:

preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", numeric_imputer),
            ("scaler", StandardScaler())
        ]), numeric_features),
        ("cat", Pipeline([
            ("imputer", categorical_imputer),
            ("encoder", categorical_encoder)
        ]), categorical_features)
    ]
)


In [86]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Fraud rate (train):", y_train.mean())
print("Fraud rate (test):", y_test.mean())

Train shape: (400, 7)
Test shape: (100, 7)
Fraud rate (train): 0.055
Fraud rate (test): 0.05


In [87]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("Processed train shape:", X_train_processed.shape)
print("Processed test shape:", X_test_processed.shape)

Processed train shape: (400, 17)
Processed test shape: (100, 17)


In [88]:
feature_names = preprocessor.get_feature_names_out()

X_train_df = pd.DataFrame(X_train_processed, columns=feature_names)
X_test_df = pd.DataFrame(X_test_processed, columns=feature_names)

X_train_df.head()

Unnamed: 0,num__Amount,num__CardHolderAge,cat__Location_Chicago,cat__Location_Houston,cat__Location_Los Angeles,cat__Location_Miami,cat__Location_New York,cat__MerchantCategory_Clothing,cat__MerchantCategory_Electronics,cat__MerchantCategory_Entertainment,cat__MerchantCategory_Groceries,cat__MerchantCategory_Travel,cat__AgeGroup_18-25,cat__AgeGroup_26-35,cat__AgeGroup_36-50,cat__AgeGroup_51-65,cat__AgeGroup_65+
0,-1.5235,-0.3305,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.2884,-1.4674,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,-0.0542,-0.8177,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.2595,-0.4929,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.6757,0.6981,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:

assert X_train_df.isnull().sum().sum() == 0
assert X_test_df.isnull().sum().sum() == 0
assert np.isfinite(X_train_df.values).all()
assert np.isfinite(X_test_df.values).all()
assert X_train_df.shape[1] == X_test_df.shape[1]


In [None]:
processed_dir = "C://Users//aditi//OneDrive//Desktop//fraud_detection//processed"
os.makedirs(processed_dir, exist_ok=True)

X_train_df.to_csv(f"{processed_dir}/X_train.csv", index=False)
X_test_df.to_csv(f"{processed_dir}/X_test.csv", index=False)
y_train.to_csv(f"{processed_dir}/y_train.csv", index=False)
y_test.to_csv(f"{processed_dir}/y_test.csv", index=False)

print("Saved files:")
print(os.listdir(processed_dir))

Saved files:
['X_test.csv', 'X_train.csv', 'y_test.csv', 'y_train.csv']
