# Credit Card Fraud Detection
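This notebook performs exploratory data analysis (EDA) and applies multiple machine learning models
(logistic regression, SVM, and KNN) to an imbalanced credit card transaction dataset, using random oversampling to handle the class imbalance, with SVM hyperparameter tuning and ROC curve analysis.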

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, recall_score, f1_score, roc_curve, auc
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Load the transactions file from the current working directory.
dataset = pd.read_csv('creditcard.csv')

# Sanity checks: frame dimensions first, then the count of missing values
# per column, each on its own line.
print(dataset.shape, dataset.isna().sum(), sep='\n')

# Preview the first rows; as the last expression it becomes the cell output.
dataset.head()

In [None]:
# EDA
# Class balance. The column is named through `x=` and the frame through
# `data=`: recent seaborn releases accept only `data` positionally, so a bare
# Series argument is not interpreted as the x variable.
fig, ax = plt.subplots()
sns.countplot(x='Class', data=dataset, ax=ax)
ax.set_title('Class Distribution')
plt.show()

# Pairwise correlation between all columns. The colour scale is capped at 0.8
# so that the diagonal (always 1.0) does not wash out the weaker correlations.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(dataset.corr(), vmax=0.8, square=True, ax=ax)
ax.set_title('Feature Correlation')
plt.show()

In [None]:
# Data Preparation
# Features are every column except the last one; the last column is the label.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Train-Test Split
# Split BEFORE resampling. Random oversampling duplicates minority-class rows,
# so resampling first would place copies of the same transaction in both the
# training and the test partition and inflate every test metric.
# `stratify=y` keeps the class proportions equal in both partitions, which
# matters when one class is rare.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Handle Imbalance
# Only the training partition is balanced; the test partition keeps the
# original class distribution so evaluation reflects the real prevalence.
ros = RandomOverSampler(random_state=0)
X_res, y_res = ros.fit_resample(X_train_raw, y_train_raw)
print('Class Distribution after Oversampling:', np.bincount(y_res))
print('Class Distribution in Test Set:', np.bincount(y_test))

# Downstream cells train on X_train / y_train: the balanced training data.
X_train, y_train = X_res, y_res

In [None]:
# Model Training
# Each classifier is wrapped in a pipeline that standardizes the features
# first. Logistic regression (regularized, gradient-based solver), SVM and KNN
# are all sensitive to feature scale: an unscaled large-magnitude column would
# dominate distances/margins and slow or prevent convergence.
# The scaler is fitted on the training data only (inside `fit`) and merely
# applied to the test data (inside `predict`).
models = {
    'Logistic Regression': make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
    # probability=True enables predict_proba (needed for ROC curves); it relies
    # on an internal shuffled cross-validation, so a seed is set to make the
    # probabilities reproducible across runs.
    'SVM': make_pipeline(StandardScaler(), SVC(probability=True, random_state=0)),
    'KNN': make_pipeline(StandardScaler(), KNeighborsClassifier()),
}

# Fit every model on the training data and report its confusion matrix and
# per-class precision/recall/F1 on the held-out test data.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n{name} Evaluation:")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [None]:
# Hyperparameter Tuning for SVM
# The scaler is part of the tuned estimator so it is re-fitted on the training
# portion of every CV fold (no information from the validation fold leaks into
# the scaling statistics). probability=True is kept because the ROC cell calls
# predict_proba on grid.best_estimator_; random_state makes the internal
# probability calibration reproducible.
svm_pipeline = make_pipeline(StandardScaler(), SVC(probability=True, random_state=0))

# make_pipeline names each step after its lowercased class name, hence the
# `svc__` prefix on the SVC parameters.
param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf']
}

# Candidates are ranked by recall; refit=True retrains the best candidate on
# all of X_train. n_jobs=-1 runs the candidate/fold fits in parallel on all
# cores, which does not change the result.
# NOTE: X_train contains oversampled (duplicated) minority rows, so copies of
# the same row can fall into both the training and validation part of a fold;
# treat the CV recall below as an optimistic estimate.
grid = GridSearchCV(svm_pipeline, param_grid, refit=True, verbose=1, scoring='recall', n_jobs=-1)
grid.fit(X_train, y_train)

print('Best Parameters:', grid.best_params_)
print('Best Recall Score:', grid.best_score_)

In [None]:
# ROC Curve Plotting
# One entry per curve: (legend label, fitted estimator, extra line styling).
# The baseline models come first; the grid-searched SVM is appended and drawn
# dashed so it can be told apart from the untuned SVM.
roc_entries = [(name, model, {}) for name, model in models.items()]
roc_entries.append(('SVM (Tuned)', grid.best_estimator_, {'linestyle': '--'}))

fig, ax = plt.subplots(figsize=(10, 6))

for label, estimator, line_kwargs in roc_entries:
    # Column 1 of predict_proba is the probability of the second class in
    # `classes_` (label 1, assuming the labels are 0/1).
    y_prob = estimator.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    # Same label format for every curve so the legend reads consistently.
    ax.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})', **line_kwargs)

# Diagonal reference: the ROC curve of a classifier that guesses at random.
ax.plot([0, 1], [0, 1], 'k--', label='Chance')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison')
ax.legend()
plt.show()