In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
from imblearn.over_sampling import SMOTE
from pylab import rcParams
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Notebook-wide settings: class display names (index 0 -> "Normal", index 1 -> "Fraud"),
# the shared random seed, and the default figure size used by every plot.
LABELS = ["Normal", "Fraud"]
RANDOM_SEED = 42
rcParams['figure.figsize'] = (14, 8)

In [None]:
# Load the dataset
# The CSV location can be overridden with the CREDITCARD_CSV environment variable so the
# notebook is not tied to one machine; when it is unset, the original local path is used.
DATA_PATH = os.environ.get(
    'CREDITCARD_CSV',
    'C:/Users/Dell/PycharmProjects/CreditCard Fraud Detection/creditcard.csv',
)
dataset = pd.read_csv(DATA_PATH)


In [None]:
## Handling missing values and encoding categorical values

# Count the missing values in each column
missingValues = dataset.isnull().sum()
print(missingValues)

# Remove rows with missing values.
# dropna() returns a new frame, so the result has to be assigned back; printing it
# (as was done before) left `dataset` unchanged. Re-running this cell is a no-op
# once the frame is clean.
rows_before = len(dataset)
dataset = dataset.dropna().reset_index(drop=True)
print(f'Dropped {rows_before - len(dataset)} rows with missing values; {len(dataset)} rows remain')

# The data is expected to contain only numerical inputs, so encoding categorical
# values is not necessary.

In [None]:
## Exploratory Data Analysis (EDA)

# Preview both ends of the table: the first rows, then the last rows
for preview in (dataset.head(), dataset.tail()):
    print(preview)

In [None]:
# Get summary information about the dataset
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
dataset.info()

In [None]:
# Transaction class distribution
# Count rows per class with the Series method (the top-level pd.value_counts is
# deprecated) and order the counts by class label (0 = normal, 1 = fraud) rather
# than by frequency, so each bar is guaranteed to line up with its entry in LABELS.
count_classes = dataset['Class'].value_counts().reindex([0, 1], fill_value=0)
ax = count_classes.plot(kind='bar', rot=0)
ax.set_title("Transaction Class Distribution")
ax.set_xticks(range(len(LABELS)))
ax.set_xticklabels(LABELS)
ax.set_xlabel("Class")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Split the table by label: rows with Class == 1 are fraud, rows with Class == 0 are normal
is_fraud = dataset['Class'] == 1
is_normal = dataset['Class'] == 0
fraud = dataset[is_fraud]
normal = dataset[is_normal]

# Show (rows, columns) for each subset
print(fraud.shape, normal.shape)

In [None]:
## Summary statistics of the transaction amount: fraud transactions
fraud['Amount'].describe()

In [None]:
# Summary statistics of the transaction amount: normal transactions
normal['Amount'].describe()

In [None]:
# Amount per Transaction by Class
# Two stacked histograms sharing the x axis: fraud on top, normal below.
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
f.suptitle('Amount per transaction by class')
bins = 50
for ax, amounts, title in ((ax1, fraud.Amount, 'Fraud'), (ax2, normal.Amount, 'Normal')):
    ax.hist(amounts, bins=bins)
    ax.set_title(title)
    # Set the label and the log scale on each axes explicitly: the pyplot calls
    # (plt.ylabel / plt.yscale) only touch the last subplot, which left the two
    # panels on different y scales and made them hard to compare.
    ax.set_ylabel('Number of Transactions')
    ax.set_yscale('log')
# The x axis is shared, so the label and limits only need to be set on the bottom panel
ax2.set_xlabel('Amount ($)')
ax2.set_xlim((0, 20000))
plt.show()

In [None]:
# Do fraudulent transactions occur more often during certain time frames?
# Plot amount against time for each class to find out visually.

f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Time of transaction vs Amount by class')

# One scatter panel per class: fraud on top, normal below
for axis, subset, heading in ((ax1, fraud, 'Fraud'), (ax2, normal, 'Normal')):
    axis.scatter(subset['Time'], subset['Amount'])
    axis.set_title(heading)

# Axis labels go on the bottom panel, which is the current axes after plt.subplots
ax2.set_xlabel('Time (in Seconds)')
ax2.set_ylabel('Amount')
plt.show()

In [None]:
# Examining the correlation between different features and the target variable

# Compute the full correlation matrix once; both the ranking against 'Class' and the
# heatmap are derived from it (previously the matrix was computed a second time).
corr_matrix = dataset.corr()
correlation = corr_matrix['Class'].sort_values()

# Print correlation coefficients, from most negative to most positive
print(correlation)

# Plot correlation matrix as a heatmap, with rows and columns ordered by their
# correlation with 'Class'
corrFeature = correlation.index
plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix.loc[corrFeature, corrFeature], annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
## Dimensionality Reduction by the Principal Component Analysis method
#
# NOTE(review): the projection below is fitted on every row before the train/test
# split, so test rows influence the components; the features are also not
# standardized, so columns with large variance dominate. Confirm this is acceptable.

# Separate features and target variable
X = dataset.drop('Class', axis=1)
y = dataset['Class']

# Center the data (subtract each column's mean)
X_centered = X - X.mean(axis=0)

# Calculate the covariance matrix (np.cov expects one variable per row, hence the transpose)
cov_matrix = np.cov(X_centered.T)

# Calculate the eigenvalues and eigenvectors of the covariance matrix.
# The covariance matrix is symmetric, so use eigh: it guarantees real eigenvalues and
# orthonormal eigenvectors, whereas the general-purpose eig can return complex values
# caused by floating-point round-off.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Sort the eigenvalues in descending order (eigh returns them in ascending order)
sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_indices]

# Select the top k eigenvectors based on the desired number of components
num_components = 10
selected_eigenvectors = eigenvectors[:, sorted_indices[:num_components]]

# Project the data onto the selected eigenvectors
X_reduced = np.dot(X_centered, selected_eigenvectors)
print(X_reduced)

In [None]:
# Create an instance of the DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier()

# Get the list of available parameters
params = dt_classifier.get_params().keys()

# Print the list of available parameters
print(params)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__criterion': ['gini', 'entropy']
}

# Split the dataset into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train different classifiers with hyperparameter tuning
classifiers = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier())
]

for name, classifier in classifiers:
    if name == 'Logistic Regression':
        param_grid = {
            'classifier__C': [0.1, 1, 10]
        }
    else:
        param_grid = {
            'classifier__max_depth': [None, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4],
            'classifier__criterion': ['gini', 'entropy']
        }

    # Create a pipeline for the classifier
    pipeline = Pipeline([
        ('classifier', classifier)
    ])

    # Perform hyperparameter tuning using RandomizedSearchCV
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=5, n_iter=3, random_state=42)
    random_search.fit(X_train, y_train)

    # Evaluate the best model on the validation set
    best_model = random_search.best_estimator_
    val_accuracy = best_model.score(X_val, y_val)

    # Evaluate the best model on the testing set
    test_accuracy = best_model.score(X_test, y_test)

    # Print the results
    print(f'{name}:')
    print(f'Validation Accuracy: {val_accuracy:.4f}')
    print(f'Testing Accuracy: {test_accuracy:.4f}')
    print('---')

In [None]:
# Define a list to store the results of each model
results = []

# Iterate over the trained models
for name, classifier in classifiers:
    # Predict labels on the test set
    y_pred = best_model.predict(X_test)

    # Calculate classification metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Store the results in a dictionary
    result = {
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    }

    # Append the result to the results list
    results.append(result)

# Print the results
for result in results:
    print(f"Model: {result['Model']}")
    print(f"Accuracy: {result['Accuracy']:.4f}")
    print(f"Precision: {result['Precision']:.4f}")
    print(f"Recall: {result['Recall']:.4f}")
    print(f"F1 Score: {result['F1 Score']:.4f}")
    print(f"ROC AUC: {result['ROC AUC']:.4f}")
    print("---")

# Compare the results and identify the most effective classifier
best_model_result = max(results, key=lambda x: x['Accuracy'])
print("Best Model:")
print(f"Model: {best_model_result['Model']}")
print(f"Accuracy: {best_model_result['Accuracy']:.4f}")
print(f"Precision: {best_model_result['Precision']:.4f}")
print(f"Recall: {best_model_result['Recall']:.4f}")
print(f"F1 Score: {best_model_result['F1 Score']:.4f}")
print(f"ROC AUC: {best_model_result['ROC AUC']:.4f}")

In [None]:
# Pull each metric out of `results` into its own list (one value per model, same order)
models = [entry['Model'] for entry in results]
accuracies = [entry['Accuracy'] for entry in results]
precisions = [entry['Precision'] for entry in results]
recalls = [entry['Recall'] for entry in results]
f1_scores = [entry['F1 Score'] for entry in results]
roc_aucs = [entry['ROC AUC'] for entry in results]

# Draw one line per metric across the models
plt.figure(figsize=(10, 6))
for metric_label, metric_values in (
    ('Accuracy', accuracies),
    ('Precision', precisions),
    ('Recall', recalls),
    ('F1 Score', f1_scores),
    ('ROC AUC', roc_aucs),
):
    plt.plot(models, metric_values, label=metric_label)

# Titles, labels and cosmetics
plt.title('Performance Comparison of Different Models')
plt.xlabel('Models')
plt.ylabel('Performance')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.show()

In [None]:

# Extract the performance metrics from the results
models = [result['Model'] for result in results]
accuracies = [result['Accuracy'] for result in results]
precisions = [result['Precision'] for result in results]
recalls = [result['Recall'] for result in results]
f1_scores = [result['F1 Score'] for result in results]
roc_aucs = [result['ROC AUC'] for result in results]

# Legend label -> values, in the order the bars are drawn inside each group
metric_series = {
    'Accuracy': accuracies,
    'Precision': precisions,
    'Recall': recalls,
    'F1 Score': f1_scores,
    'ROC AUC': roc_aucs,
}

# Plot the performance metrics as grouped bars.
# Each model gets one group centred on its integer position and each metric gets its
# own slot inside the group. Drawing every metric at the same x position (as before)
# stacked the bars on top of each other, so most of them were hidden.
positions = np.arange(len(models))
bar_width = 0.8 / len(metric_series)
plt.figure(figsize=(10, 6))
for slot, (metric_label, metric_values) in enumerate(metric_series.items()):
    # Offset of this metric's bar from the centre of the group
    shift = (slot - (len(metric_series) - 1) / 2) * bar_width
    plt.bar(positions + shift, metric_values, width=bar_width, label=metric_label)
plt.xlabel('Models')
plt.ylabel('Performance')
plt.title('Performance Comparison of Different Models')
plt.legend()
plt.xticks(positions, models, rotation=45)
plt.show()