# 1. Data Import

In [1]:
import numpy as np
import pandas as pd

# Load the raw transactions; the relative path means the CSV is looked up
# in the notebook's working directory.
DATA_PATH = 'creditcard.csv'
df = pd.read_csv(DATA_PATH)

# Bare last expression so the notebook renders a preview of the frame.
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [2]:
# Sanity check: count missing values per column and ask whether any count is
# non-zero. A falsy result means the frame contains no nulls.
df.isna().sum().any()

np.False_

In [3]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# FEATURE SCALING EXPLANATION:
#   - Amount -> RobustScaler:
#     1. Transaction amounts often contain extreme outliers (e.g., very large purchases)
#     2. RobustScaler uses median and IQR (Interquartile Range), making it resistant to outliers
#   - Time -> StandardScaler: centres to zero mean and scales to unit variance.
#
# NOTE(review): both scalers are fit on the full dataset, i.e. before any
# train/test split, so held-out rows contribute to the scaling statistics.
# Fit on the training split only if a strictly leak-free evaluation is needed.
#
# The guard keeps the cell idempotent: once 'Time' and 'Amount' have been
# replaced by their scaled versions, re-running the cell is a no-op instead of
# raising a KeyError on the missing raw columns.
if {'Time', 'Amount'}.issubset(df.columns):
    df = df.assign(
        scaled_amount=RobustScaler()
        .fit_transform(df['Amount'].to_numpy().reshape(-1, 1))
        .ravel(),
        scaled_time=StandardScaler()
        .fit_transform(df['Time'].to_numpy().reshape(-1, 1))
        .ravel(),
    ).drop(columns=['Time', 'Amount'])

In [4]:
# The data is heavily imbalanced: fraud rows are a tiny share of the total.
# Counting with a boolean mask returns 0 (instead of raising KeyError) when no
# fraud row is present, and keeping the subscript out of the f-string avoids
# nesting same-type quotes, which is a SyntaxError before Python 3.12.
fraud_count = int((df["Class"] == 1).sum())
print(f"Fraudulent numbers:{fraud_count} / {len(df)}")

Fraudulent numbers:492 / 284807


# 2. Undersampling and Logistic Regression

In [None]:
# Undersample the majority class to build a 1:1 balanced dataset.
# Training on balanced data exposes the model to proportionally more fraud
# examples, which tends to raise recall, at the cost of flagging more
# legitimate transactions as fraud by mistake (lower precision).
is_fraud = df['Class'].eq(1)
df_fraud = df.loc[is_fraud]
df_normal = df.loc[df['Class'].eq(0)]

# Draw exactly as many normal rows as there are fraud rows (seeded).
df_normal_sampled = df_normal.sample(n=len(df_fraud), random_state=42)

# Stack both classes, shuffle the rows (seeded) and renumber the index.
balanced_df = (
    pd.concat([df_normal_sampled, df_fraud])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Feature matrix and target vector.
y = balanced_df['Class']
X = balanced_df.drop(columns=['Class'])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Hold out 20% of the balanced data for evaluation.
# stratify=y keeps the class ratio identical in the train and test splits, so
# the small test set is not skewed towards one class by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# L1-penalised logistic regression; 'liblinear' is a solver that supports
# penalty='l1'.
model = LogisticRegression(
    max_iter=1000,
    solver='liblinear',
    random_state=42,
    penalty='l1',
)

# Last expression: fits the model and displays the fitted estimator.
model.fit(X_train, y_train)


In [None]:
# Peek at the first rows of the held-out features rather than displaying the whole frame.
X_test.head()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve

# Evaluation on the held-out split.
# y_pred is the model's default class prediction; y_pred_proba keeps the
# predicted probability of the positive class (column 1, i.e. Class == 1).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

print('\nClassification Report:')
print(classification_report(y_test, y_pred, target_names=['Normal', 'Fraud']))

# labels=[0, 1] pins the matrix to 2x2 so the four-way unpack below always
# receives exactly four cells, even if one class is absent.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("\nConfusion Matrix:")
print(conf_matrix)

TN, FP, FN, TP = conf_matrix.ravel()

# Every ratio is guarded against a zero denominator, using the same convention
# as the threshold sweep later in this notebook (fall back to 0).
fraud_recall = TP / (TP + FN) if (TP + FN) > 0 else 0  # True Positive Rate (Sensitivity)
fraud_precision = TP / (TP + FP) if (TP + FP) > 0 else 0  # Positive Predictive Value
# NOTE: the name `f1_score` is kept for compatibility with the rest of the
# notebook; it would shadow sklearn.metrics.f1_score if that were imported.
f1_score = (
    2 * (fraud_precision * fraud_recall) / (fraud_precision + fraud_recall)
    if (fraud_precision + fraud_recall) > 0
    else 0
)
false_positive_rate = FP / (FP + TN) if (FP + TN) > 0 else 0

print("\nFraud-Specific Metrics:")
print(f"Recall (Sensitivity): {fraud_recall:.4f} - Ability to detect actual frauds")
print(f"Precision: {fraud_precision:.4f} - Accuracy when predicting fraud")
print(f"F1-Score: {f1_score:.4f} - Balance between precision and recall")
print(f"False Positive Rate: {false_positive_rate:.4f} - Normal transactions misclassified as fraud")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion Matrix Heatmap
# Drawn on its own figure/axes: a 2x2 subplot slot on a 15x10 canvas would
# leave the heatmap in one quadrant of an otherwise empty figure, because this
# cell draws only this one plot.
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted Normal', 'Predicted Fraud'],
    yticklabels=['Actual Normal', 'Actual Fraud'],
    ax=ax,
)
ax.set_title('Confusion Matrix (Balanced Data)')
ax.set_ylabel('True Class')
ax.set_xlabel('Predicted Class')

plt.show()

In [None]:
# ROC curve from the predicted fraud probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Own figure/axes instead of plt.subplot(2, 2, 2): this cell draws a single
# plot, so a 2x2 grid slot would squeeze it into one quadrant of the figure.
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate (Recall)')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")

# Explicit show so the cell output is the figure, not the Legend repr.
plt.show()

In [None]:
# Precision-Recall curve from the predicted fraud probabilities.
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
# Area under the PR curve via auc() over the (recall, precision) points.
pr_auc = auc(recall, precision)

# Own figure/axes instead of plt.subplot(2, 2, 3): this cell draws a single
# plot, so a 2x2 grid slot would squeeze it into one quadrant of the figure.
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(recall, precision, color='blue', lw=2, label=f'PR curve (AUC = {pr_auc:.2f})')
ax.set_xlabel('Recall (Sensitivity)')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend(loc="upper right")

# Explicit show so the cell output is the figure, not the Legend repr.
plt.show()

In [None]:
# Pair every training feature with its fitted logistic-regression coefficient
# and sort from the largest (most positive) coefficient downwards.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.coef_[0],
}).sort_values('Importance', ascending=False)

# Keep the 10 largest coefficients. The ranking is by signed value, so
# strongly negative coefficients are not shown here.
top_features = feature_importance.head(10)

# Own figure/axes instead of plt.subplot(2, 2, 4): this cell draws a single
# plot, so a 2x2 grid slot would squeeze it into one quadrant of the figure.
fig, ax = plt.subplots(figsize=(7, 5))

# One viridis colour per bar, drawn with matplotlib directly; this avoids
# seaborn's deprecated "palette without hue" call form.
bar_colors = sns.color_palette('viridis', n_colors=len(top_features))
ax.barh(top_features['Feature'], top_features['Importance'], color=bar_colors)
ax.invert_yaxis()  # largest coefficient at the top
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Top Fraud Indicators')
fig.tight_layout()
plt.show()

In [None]:
# Sweep decision thresholds 0.1 ... 0.9 and record fraud metrics for each one.
thresholds = np.arange(0.1, 1.0, 0.1)
results = []

for thresh in thresholds:
    # Convert probabilities to class predictions using threshold
    y_pred_thresh = (y_pred_proba >= thresh).astype(int)

    # Per-threshold confusion matrix. Loop-local names (cm_thresh, tp_t, ...)
    # are used on purpose so this cell does not overwrite the notebook-level
    # conf_matrix / TN / FP / FN / TP / precision / recall / fpr that the
    # evaluation and plotting cells read. labels=[0, 1] pins the shape to 2x2.
    cm_thresh = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1])
    tn_t, fp_t, fn_t, tp_t = cm_thresh.ravel()

    # Calculate metrics; every ratio falls back to 0 on an empty denominator
    recall_t = tp_t / (tp_t + fn_t) if (tp_t + fn_t) > 0 else 0
    precision_t = tp_t / (tp_t + fp_t) if (tp_t + fp_t) > 0 else 0
    f1_t = (
        2 * (precision_t * recall_t) / (precision_t + recall_t)
        if (precision_t + recall_t) > 0
        else 0
    )
    fpr_t = fp_t / (fp_t + tn_t) if (fp_t + tn_t) > 0 else 0

    results.append({
        'Threshold': thresh,
        'Recall': recall_t,
        'Precision': precision_t,
        'F1-Score': f1_t,
        'FPR': fpr_t,
    })

# Create results dataframe
results_df = pd.DataFrame(results)
print("\nPerformance at Different Thresholds:")
print(results_df[['Threshold', 'Recall', 'Precision', 'F1-Score', 'FPR']].to_string(index=False))

# Find optimal threshold based on F1-score.
# idxmax() returns an index *label*, so the row is looked up with .loc.
# NOTE(review): the threshold is selected on the same test split it is
# reported on, so the "optimal" metrics are optimistic.
optimal_row = results_df.loc[results_df['F1-Score'].idxmax()]
print(f"\nOptimal Threshold (Max F1-Score): {optimal_row['Threshold']:.2f}")
print("At this threshold:")
print(f"- Recall: {optimal_row['Recall']:.4f}")
print(f"- Precision: {optimal_row['Precision']:.4f}")
print(f"- F1-Score: {optimal_row['F1-Score']:.4f}")
print(f"- False Positive Rate: {optimal_row['FPR']:.4f}")