In [1]:
# Cell 1: Imports and Setup (Resampled Dataset)
!pip install pandas numpy matplotlib seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

# Load the resampled dataset
resampled_file = 'C:/projects/Credit-Card-Fraud-Detection/data/processed/creditcard_2023_balanced.csv'
if not os.path.exists(resampled_file):
    print(f"Resampled dataset not found at {resampled_file}. Please ensure the resampling step was completed.")
    exit()
df = pd.read_csv(resampled_file)

# Save to the raw directory (for consistency with your pipeline)
project_raw_dir = "C:/projects/Credit-Card-Fraud-Detection/data/raw"
os.makedirs(project_raw_dir, exist_ok=True)
df.to_csv(os.path.join(project_raw_dir, 'creditcard_2023_balanced.csv'), index=False)

# Verify the dataset
print("Resampled Dataset Shape:", df.shape)
print("Resampled Class Distribution:")
print(df['Class'].value_counts(normalize=True))


[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Resampled Dataset Shape: (285743, 31)
Resampled Class Distribution:
Class
0    0.995003
1    0.004997
Name: proportion, dtype: float64


In [2]:
# Cell 2: Exploratory Data Analysis (EDA)
# Load the resampled dataset (the copy written to the raw directory by Cell 1)
df = pd.read_csv('C:/projects/Credit-Card-Fraud-Detection/data/raw/creditcard_2023_balanced.csv')

# Basic statistics
print("Basic Statistics:")
print(df.describe())

# Check for missing values (count of nulls per column)
print("\nMissing Values:")
print(df.isnull().sum())

# savefig does not create missing directories, so make sure the output
# directory exists before any figure is written.
project_figures_dir = 'C:/projects/Credit-Card-Fraud-Detection/figures'
os.makedirs(project_figures_dir, exist_ok=True)

# Visualize class distribution
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Class', data=df, ax=ax)
ax.set_title('Class Distribution (0: Non-Fraud, 1: Fraud)')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.set_yscale('log')  # Use log scale due to imbalance
fig.savefig(os.path.join(project_figures_dir, 'class_distribution.png'))
plt.close(fig)  # The figure is only written to disk, not shown inline

# Visualize Amount distribution (50-bin histogram with a KDE overlay)
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['Amount'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Transaction Amount')
ax.set_xlabel('Amount')
ax.set_ylabel('Frequency')
fig.savefig(os.path.join(project_figures_dir, 'amount_distribution.png'))
plt.close(fig)

Basic Statistics:
                  id             V1             V2             V3  \
count  285743.000000  285743.000000  285743.000000  285743.000000   
mean   143854.877134       0.500620      -0.486943       0.675185   
std     84596.380478       0.827309       0.705080       0.709931   
min         0.000000      -3.495584     -49.966572      -3.183760   
25%     71611.500000      -0.140181      -0.658753       0.144421   
50%    143141.000000       0.208816      -0.421286       0.562612   
75%    214702.500000       1.088226      -0.185745       1.102254   
max    568448.000000       2.229046       4.227420      14.125834   

                  V4             V5             V6             V7  \
count  285743.000000  285743.000000  285743.000000  285743.000000   
mean       -0.728683       0.335210       0.430497       0.486273   
std         0.678638       0.664407       0.729695       0.685666   
min        -4.951222      -9.952786     -21.111108      -4.185660   
25%        -1.0

In [3]:
# Cell 3: Feature Engineering
# Start again from the copy of the resampled data kept in the raw directory.
df = pd.read_csv('C:/projects/Credit-Card-Fraud-Detection/data/raw/creditcard_2023_balanced.csv')

# Append two columns derived from Amount:
#   Amount_log     -- log(1 + Amount), to reduce skewness
#   Amount_squared -- Amount ** 2, a polynomial term for non-linear relationships
df = df.assign(
    Amount_log=lambda frame: np.log1p(frame['Amount']),
    Amount_squared=lambda frame: frame['Amount'] ** 2,
)

# Remove the 'id' column when it exists (not useful for modeling);
# errors='ignore' makes this a no-op when the column is absent.
df = df.drop(columns=['id'], errors='ignore')

# Write the engineered dataset to the processed directory and report its shape.
project_processed_dir = "C:/projects/Credit-Card-Fraud-Detection/data/processed"
os.makedirs(project_processed_dir, exist_ok=True)
df.to_csv(
    os.path.join(project_processed_dir, 'creditcard_2023_processed.csv'),
    index=False,
)
print("Processed Dataset Shape:", df.shape)

Processed Dataset Shape: (285743, 32)


In [4]:
# Cell 4: Train-Test Split
# Load the processed dataset
df = pd.read_csv('C:/projects/Credit-Card-Fraud-Detection/data/processed/creditcard_2023_processed.csv')

# Split into train and test sets: a seeded 80% random sample for training,
# and every remaining row (by index label) for testing.
# NOTE(review): this split is not stratified on 'Class'; with a rare positive
# class the fraud rate can differ between train and test -- consider a
# per-class split if that matters for evaluation.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

# Shuffle each partition. frac=1 always requests exactly the rows that are
# present; requesting a fixed n computed as int(len(df) * 0.8) can disagree
# with the rounded size used by sample(frac=0.8) and then asks the test
# partition for more rows than it holds, which raises ValueError.
train_df = train_df.sample(frac=1, random_state=42)
test_df = test_df.sample(frac=1, random_state=42)

# Partition sizes, taken from the actual partitions so they can never drift.
train_size = len(train_df)
test_size = len(test_df)

# The two partitions must cover every row exactly once.
assert train_size + test_size == len(df), "train/test partitions do not cover the dataset"
assert train_df.index.intersection(test_df.index).empty, "train/test partitions overlap"

# Separate features and target
X_train = train_df.drop(columns=['Class'])
y_train = train_df['Class']
X_test = test_df.drop(columns=['Class'])
y_test = test_df['Class']

# Save the splits
os.makedirs('C:/projects/Credit-Card-Fraud-Detection/data/processed', exist_ok=True)
X_train.to_csv('C:/projects/Credit-Card-Fraud-Detection/data/processed/X_train.csv', index=False)
X_test.to_csv('C:/projects/Credit-Card-Fraud-Detection/data/processed/X_test.csv', index=False)
y_train.to_csv('C:/projects/Credit-Card-Fraud-Detection/data/processed/y_train.csv', index=False)
y_test.to_csv('C:/projects/Credit-Card-Fraud-Detection/data/processed/y_test.csv', index=False)

# Report the shape and class proportions of each split.
print("Training Set Shape:", X_train.shape)
print("Test Set Shape:", X_test.shape)
print("Training Class Distribution:")
print(y_train.value_counts(normalize=True))
print("Test Class Distribution:")
print(y_test.value_counts(normalize=True))

Training Set Shape: (228594, 31)
Test Set Shape: (57149, 31)
Training Class Distribution:
Class
0    0.99507
1    0.00493
Name: proportion, dtype: float64
Test Class Distribution:
Class
0    0.994733
1    0.005267
Name: proportion, dtype: float64
