# In [None]:
# Import necessary libraries for data manipulation, visualization, and machine learning
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset into a DataFrame
data = pd.read_csv('creditcard_2023.csv')

# Display basic information about the dataset (info() prints its own summary)
data.info()

# Report the number of missing values in each column.
# A bare `data.isnull().sum()` expression computes the counts and then discards
# them unless it is the last statement of a notebook cell, so print explicitly.
print(data.isnull().sum())

# Drop the 'id' column as it is not useful for the analysis
data.drop(columns=['id'], inplace=True)

# Class balance: one bar per distinct value of 'Class'
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=data, x='Class', ax=ax)
ax.set_title('Class Distribution')
plt.show()

# Histogram of transaction amounts, split into 50 bins
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data['Amount'], bins=50, ax=ax)
ax.set_title('Transaction Amount Distribution')
plt.show()

# Pairwise correlation of every column, drawn as an un-annotated heatmap
corr = data.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=False, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Scale the 'Amount' column to have a mean of 0 and standard deviation of 1.
# NOTE(review): this scaler is fitted on the full dataset, before any
# train/test split, and 'Amount' is standardised again by the model pipelines'
# ColumnTransformer further down - confirm whether this early scaling is wanted.
scaler = StandardScaler()
data['Amount'] = scaler.fit_transform(data[['Amount']])

# Several features below are per-user aggregates; check once whether the
# grouping key is available.
has_user_id = 'user_id' in data.columns

# Per-user transaction count and mean amount.
if has_user_id:
    amount_by_user = data.groupby('user_id')['Amount']
    data['Transaction_Frequency'] = amount_by_user.transform('count')
    data['Mean_Transaction_Amount'] = amount_by_user.transform('mean')
else:
    # Without 'user_id' every row is treated as its own group.
    data['Transaction_Frequency'] = 1
    data['Mean_Transaction_Amount'] = data['Amount']

# Temporal features based on 'Time' if it exists in the dataset.
if 'Time' in data.columns:
    # Assumes 'Time' is measured in seconds - TODO confirm against the data source.
    data['Hour'] = (data['Time'] / 3600).astype(int) % 24
    if has_user_id:
        # Gap to the same user's previous row; the first row per user has no
        # predecessor, so its NaN is replaced with 0.
        data['Time_Since_Last_Transaction'] = data.groupby('user_id')['Time'].diff().fillna(0)
    else:
        # Grouping by a missing 'user_id' would raise KeyError. Fall back to the
        # same "each row is its own group" default used above, i.e. no gap.
        data['Time_Since_Last_Transaction'] = 0
else:
    data['Hour'] = 0  # If 'Time' does not exist, set default value
    data['Time_Since_Last_Transaction'] = 0  # If 'Time' does not exist, set default value

# Merchant-related feature if 'Merchant_ID' exists in the dataset.
if 'Merchant_ID' in data.columns:
    data['Merchant_Transaction_Count'] = data.groupby('Merchant_ID')['Amount'].transform('count')
else:
    data['Merchant_Transaction_Count'] = 1  # If 'Merchant_ID' does not exist, set default value

# Separate features (X) and target variable (y)
X = data.drop('Class', axis=1)
y = data['Class']

# Split BEFORE resampling so the test set holds only real transactions.
# SMOTE builds synthetic rows by interpolating between neighbouring samples;
# running it on the full dataset first lets information from rows that end up
# in the test set leak into the training set (and puts synthetic rows in the
# test set), which inflates every metric computed afterwards.
# stratify=y keeps the class proportions identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Apply SMOTE to the training portion only, to balance the classes the models learn from
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

# Columns standardised by the preprocessor; all remaining columns are passed
# through unchanged.
scaled_columns = [
    'Amount',
    'Transaction_Frequency',
    'Mean_Transaction_Amount',
    'Hour',
    'Time_Since_Last_Transaction',
    'Merchant_Transaction_Count',
]

# Unfitted template for the preprocessing step
preprocessor = ColumnTransformer(
    transformers=[('scale', StandardScaler(), scaled_columns)],
    remainder='passthrough',  # Keep all other columns as they are
)

# Unfitted template for PCA, retaining 95% of the variance
pca = PCA(n_components=0.95)


def _build_pca_pipeline(classifier):
    """Return a preprocessor -> PCA -> classifier pipeline.

    Each pipeline gets its own clones of the ``preprocessor`` and ``pca``
    templates. Putting the same transformer instances into several pipelines
    makes them share fitted state: fitting one pipeline would silently refit
    the transformers used by the others.

    Args:
        classifier: Unfitted estimator used as the final step.

    Returns:
        An unfitted ``Pipeline`` with steps named 'preprocessor', 'pca' and
        'classifier'.
    """
    return Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('pca', clone(pca)),
        ('classifier', classifier),
    ])


# One pipeline per model family
logistic_pipeline = _build_pca_pipeline(LogisticRegression(max_iter=1000))
random_forest_pipeline = _build_pca_pipeline(RandomForestClassifier(random_state=42))
neural_network_pipeline = _build_pca_pipeline(MLPClassifier(random_state=42))

# Store the pipelines in a dictionary keyed by display name
pipelines = {
    'Logistic Regression (PCA)': logistic_pipeline,
    'Random Forest (PCA)': random_forest_pipeline,
    'Neural Network (PCA)': neural_network_pipeline,
}

# Dictionary to store the results of each model, keyed by display name.
# Besides the metrics, each entry keeps the test-set predictions and scores so
# the plots below reuse them instead of re-running inference on the test set.
results = {}

# Train and evaluate each model
for name, pipeline in pipelines.items():
    # perf_counter() is a monotonic high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    pipeline.fit(X_train, y_train)  # Train the model
    train_time = time.perf_counter() - start_time

    start_time = time.perf_counter()
    predictions = pipeline.predict(X_test)  # Hard class predictions on the test set
    predict_time = time.perf_counter() - start_time

    # Scores from column 1 of predict_proba (the second entry of the
    # classifier's classes_), computed once and reused for AUC and the ROC curve.
    probabilities = pipeline.predict_proba(X_test)[:, 1]

    auc_roc = roc_auc_score(y_test, probabilities)
    report = classification_report(y_test, predictions)

    results[name] = {
        'classification_report': report,
        'AUC-ROC': auc_roc,
        'train_time': train_time,
        'predict_time': predict_time,
        'predictions': predictions,
        'probabilities': probabilities,
    }

# Plot the ROC curve for each model on a shared figure
plt.figure()
for name, result in results.items():
    fpr, tpr, _ = roc_curve(y_test, result['probabilities'])
    plt.plot(fpr, tpr, label=f'{name} (area = {result["AUC-ROC"]:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal reference line (random classifier)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# Plot the confusion matrix for each model, one figure per model
for name, result in results.items():
    cm = confusion_matrix(y_test, result['predictions'])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.title(f'Confusion Matrix - {name}')
    plt.show()

# Print the results for each model
for name, result in results.items():
    print(f"{name}\n"
          f"Classification Report:\n{result['classification_report']}\n"
          f"AUC-ROC: {result['AUC-ROC']}\n"
          f"Training Time: {result['train_time']:.4f} seconds\n"
          f"Prediction Time: {result['predict_time']:.4f} seconds\n")
