In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt

# One seed reused by the train/test split, both SVMs, PCA and the plot sampler
# so that a fresh Restart & Run All reproduces the same results.
RANDOM_STATE = 42


In [5]:
# Read the raw CSV once; `df_raw` stays untouched and every later step
# works on the independent copy `df_eda`.
df_path = '../data/raw/creditcard.csv'

df_raw = pd.read_csv(filepath_or_buffer=df_path)
df_eda = df_raw.copy(deep=True)


In [6]:

# Count missing values per column and report only the columns that have any
null_summary = df_eda.isna().sum()
columns_with_nulls = null_summary[null_summary.gt(0)]
print("Null counts per column:\n", columns_with_nulls)

Null counts per column:
 Series([], dtype: int64)


In [7]:
# Target is the 'Class' column (cast to int so labels are discrete);
# the feature matrix is every remaining column.
y = df_eda['Class'].astype(int)
X = df_eda.drop('Class', axis=1)

In [8]:

# Hold out 20% of the rows for testing. The split is stratified on y because
# the dataset is imbalanced, and seeded so it is repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [9]:

# Median-impute, then standardize. Each transformer learns its statistics from
# the training split only and is merely applied to the test split.
imputer = SimpleImputer(strategy='median')
X_train_imp = imputer.fit(X_train).transform(X_train)
X_test_imp = imputer.transform(X_test)

scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train_imp).transform(X_train_imp)
X_test_scaled = scaler.transform(X_test_imp)

In [None]:

# Fit an RBF-kernel SVM on the full (imputed + scaled) training features
svm_params = {
    'kernel': 'rbf',
    'C': 1.0,
    'probability': True,          # enables predict_proba on the fitted model
    'class_weight': 'balanced',
    'random_state': RANDOM_STATE,
}
svm_clf = SVC(**svm_params)
svm_clf.fit(X_train_scaled, y_train)

In [None]:
# Score the held-out split: hard labels feed the report and confusion matrix,
# column 1 of predict_proba feeds ROC-AUC.
y_pred = svm_clf.predict(X_test_scaled)
test_probabilities = svm_clf.predict_proba(X_test_scaled)
y_proba = test_probabilities[:, 1]

report_text = classification_report(y_test, y_pred, digits=4)
conf_mat = confusion_matrix(y_test, y_pred)
auc_value = roc_auc_score(y_test, y_proba)

print("Classification report (test):")
print(report_text)
print("Confusion matrix:")
print(conf_mat)
print("ROC-AUC:", auc_value)

In [None]:

# For visualization only: project the scaled features onto two principal
# components (PCA is fitted on the training split, then applied to the test
# split) and fit a second SVM in that 2D space so its regions can be drawn.
pca = PCA(n_components=2, random_state=RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

svm_2d = SVC(kernel='rbf', C=1.0, probability=True, class_weight='balanced', random_state=RANDOM_STATE)
svm_2d.fit(X_train_pca, y_train)

# Mesh extent: bounding box of the train + test projections, padded by 1 on
# every side. The two splits have different row counts, so they are stacked
# row-wise as (n, 2) arrays and reduced per column; stacking their 1-D columns
# with np.vstack fails with a shape-mismatch ValueError.
pca_points = np.vstack([X_train_pca, X_test_pca])
x_min, y_min = pca_points.min(axis=0) - 1
x_max, y_max = pca_points.max(axis=0) + 1

# 500 x 500 grid over the bounding box; each grid node is classified by the
# 2D SVM and the labels are reshaped back to the grid for contour plotting.
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 500), np.linspace(y_min, y_max, 500))
grid = np.c_[xx.ravel(), yy.ravel()]
Z = svm_2d.predict(grid).reshape(xx.shape)

In [None]:
# Draw the 2D decision regions with a subset of test points on top:
# every class-1 point is kept so positives stay visible, while class-0 points
# are capped at 2000 random draws to keep plotting fast.
test_labels = y_test.to_numpy()
neg_idx = np.flatnonzero(test_labels == 0)
pos_idx = np.flatnonzero(test_labels == 1)

n_neg_sample = min(2000, len(neg_idx))
sampler = np.random.RandomState(RANDOM_STATE)  # seeded so the same negatives are drawn each run
neg_sample_idx = sampler.choice(neg_idx, size=n_neg_sample, replace=False)
plot_idx = np.concatenate([pos_idx, neg_sample_idx])

# Positional lookups into the test projection and its labels
plot_points = X_test_pca[plot_idx]
plot_labels = test_labels[plot_idx]

plt.figure(figsize=(10, 7))
plt.contourf(xx, yy, Z, alpha=0.2, cmap='coolwarm')
for class_value, color, legend_label in ((0, 'C0', 'Class 0'), (1, 'C1', 'Class 1')):
    members = plot_labels == class_value
    plt.scatter(plot_points[members, 0], plot_points[members, 1], c=color, alpha=0.7, s=40, label=legend_label)
plt.title('SVM decision boundary (PCA-projected features)')
plt.xlabel('PCA component 1')
plt.ylabel('PCA component 2')
plt.legend(title='Class')
plt.show()