In [1]:
import kagglehub
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the dataset through kagglehub; the returned value is the local directory
# that holds the downloaded files (it is passed to os.listdir below).
path = kagglehub.dataset_download("anurag629/credit-card-fraud-transaction-data")
print("Path to dataset files:", path)
# Bare last expression: list the directory so the CSV file name is visible in the output.
os.listdir(path)

Path to dataset files: /Users/sofia/.cache/kagglehub/datasets/anurag629/credit-card-fraud-transaction-data/versions/1


['CreditCardData.csv']

In [3]:
# Load the raw transactions from the downloaded dataset directory into a DataFrame.
df = pd.read_csv(os.path.join(path, "CreditCardData.csv"))


In [4]:
def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw transaction frame and encode its categorical columns.

    The steps, in order:

    1. Drop every row that contains a missing value and renumber the index.
    2. Remove the identifier/date/address columns that are not used as features.
    3. Strip the pound sign from ``Amount`` and cast the column to ``float``.
    4. One-hot encode the categorical columns, dropping the first level of each.
    5. Replace weekday names in ``Day of Week`` with integers (Monday=0 ... Sunday=6).

    Args:
        data: Raw transactions. Must contain all columns referenced above.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.
    """
    unused_columns = [
        'Transaction ID',
        'Date',
        'Shipping Address',
        'Country of Residence',
    ]
    categorical_columns = [
        'Merchant Group',
        'Type of Card',
        'Bank',
        'Gender',
        'Country of Transaction',
        'Entry Mode',
        'Type of Transaction',
    ]
    weekday_names = (
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday',
    )
    # Monday -> 0, ..., Sunday -> 6.
    weekday_codes = {name: code for code, name in enumerate(weekday_names)}

    complete_rows = data.dropna().reset_index(drop=True)
    cleaned = complete_rows.drop(columns=unused_columns)

    # Amounts arrive as text such as "£5"; remove the currency symbol before casting.
    amount_without_symbol = cleaned['Amount'].replace('£', '', regex=True)
    cleaned['Amount'] = amount_without_symbol.astype(float)

    encoded = pd.get_dummies(cleaned, columns=categorical_columns, drop_first=True)

    # Values that are not one of the seven weekday names map to NaN.
    encoded['Day of Week'] = encoded['Day of Week'].map(weekday_codes)

    return encoded


def get_target(data: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """Split a preprocessed frame into its feature matrix and its label column.

    Args:
        data: Frame containing a ``Fraud`` column alongside the features.

    Returns:
        ``(X, y)`` where ``X`` is ``data`` without the ``Fraud`` column and
        ``y`` is the ``Fraud`` column itself.
    """
    features = data.drop(columns=['Fraud'])
    labels = data['Fraud']
    return features, labels


def scaler(X: pd.DataFrame) -> pd.DataFrame:
    """Standardise the ``float64``/``int64`` columns of ``X``.

    Only columns whose dtype is exactly ``float64`` or ``int64`` are scaled;
    every other column (for example the boolean one-hot columns) is copied
    through unchanged.

    A new ``StandardScaler`` is created and fitted on each call and is not
    returned, so the fitted statistics always come from the frame passed in.

    Args:
        X: Feature frame to scale. It is not modified.

    Returns:
        A copy of ``X`` with the selected columns standardised. If ``X`` has no
        rows, or no ``float64``/``int64`` columns, the copy is returned as is.
    """
    columns_to_scale = X.select_dtypes(include=['float64', 'int64']).columns
    X_scaled = X.copy()

    # StandardScaler raises ValueError on input with zero rows or zero columns.
    # There is nothing to scale in either case, so return the copy as is.
    if X.empty or len(columns_to_scale) == 0:
        return X_scaled

    # Named differently from the enclosing function so the function name is not shadowed.
    standardizer = StandardScaler()
    X_scaled[columns_to_scale] = standardizer.fit_transform(X[columns_to_scale])

    return X_scaled

In [5]:
# Run the pipeline: clean/encode the raw frame, split off the label, then standardise.
data = preprocess_data(df)
X, y = get_target(data)
# NOTE(review): scaler() fits its StandardScaler on every row of X. If a train/test
# split follows, fit on the training rows only so test statistics do not leak — confirm.
X_scaled = scaler(X)
# Bare last expression so the notebook renders the scaled frame.
X_scaled

Unnamed: 0,Day of Week,Time,Amount,Age,Merchant Group_Electronics,Merchant Group_Entertainment,Merchant Group_Fashion,Merchant Group_Food,Merchant Group_Gaming,Merchant Group_Products,...,Bank_RBS,Gender_M,Country of Transaction_India,Country of Transaction_Russia,Country of Transaction_USA,Country of Transaction_United Kingdom,Entry Mode_PIN,Entry Mode_Tap,Type of Transaction_Online,Type of Transaction_POS
0,0.992998,0.835862,-0.871551,-1.989692,False,True,False,False,False,False,...,True,True,False,False,False,True,False,True,False,True
1,0.992998,0.459084,1.421153,0.463045,False,False,False,False,False,False,...,False,False,False,False,True,False,True,False,False,True
2,0.992998,-0.106082,-0.871551,-0.280818,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,False,True
3,-1.002719,-0.106082,-0.685218,0.603776,False,True,False,False,False,False,...,False,False,False,False,False,True,False,True,False,True
4,-1.002719,1.589416,-0.174828,-0.703010,True,False,False,False,False,False,...,False,True,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99972,-1.002719,1.401028,-0.790537,0.885237,True,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,True
99973,0.992998,1.589416,-0.855348,0.000644,False,False,False,False,False,False,...,False,True,False,True,False,False,True,False,False,False
99974,-1.002719,-0.671248,-0.741928,0.151427,False,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
99975,0.992998,1.401028,-0.709523,0.322314,False,False,False,False,False,True,...,False,True,False,False,False,True,False,True,False,True
