In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import gridspec
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix

# Load the dataset
data = pd.read_csv("creditcard.csv")

# Define the target variable and feature set.
# Duplicates are detected on the feature columns only (same rule as
# DataFrame.drop_duplicates on the features), but the resulting mask is applied
# to BOTH features and labels so they stay row-aligned. Dropping rows from the
# features alone would leave the labels longer than the feature matrix.
features = data.drop(columns=["Class"])
is_first_occurrence = ~features.duplicated()
X = features.loc[is_first_occurrence]
Y = data.loc[is_first_occurrence, "Class"]
assert len(X) == len(Y), "features and labels must have the same number of rows"

# Split BEFORE any fitted preprocessing or resampling so that nothing learned
# from the test rows (means, scaling statistics, synthetic neighbours) can leak
# into training. stratify=Y keeps the class ratio the same in both splits.
xTrainRaw, xTestRaw, yTrain, yTest = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

# Impute missing values with column means computed on the training split only,
# then reuse those same means for the test split.
trainMeans = xTrainRaw.mean()
xTrainFilled = xTrainRaw.fillna(trainMeans)
xTestFilled = xTestRaw.fillna(trainMeans)

# Feature scaling: fit on the training split, apply the fitted transform to test.
scaler = StandardScaler()
xTrainScaled = scaler.fit_transform(xTrainFilled)
xTest = scaler.transform(xTestFilled)

# Resample ONLY the training data using SMOTE to balance the class distribution.
# The test set keeps its original class distribution and contains no synthetic
# rows, so the metrics below describe performance on real records.
smote = SMOTE(random_state=42)
xTrain, yTrain = smote.fit_resample(xTrainScaled, yTrain)

# Train a random forest classifier.
# n_jobs=-1 builds the trees in parallel on all available cores.
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)
rfc.fit(xTrain, yTrain)

# Make predictions on the (untouched, un-resampled) test set
yPred = rfc.predict(xTest)

# Evaluate the performance of the model
acc = accuracy_score(yTest, yPred)
prec = precision_score(yTest, yPred)
rec = recall_score(yTest, yPred)
f1 = f1_score(yTest, yPred)
MCC = matthews_corrcoef(yTest, yPred)

print("The model used is Random Forest classifier")
print("The accuracy is {}".format(acc))
print("The precision is {}".format(prec))
print("The recall is {}".format(rec))
print("The F1-Score is {}".format(f1))
print("The Matthews correlation coefficient is {}".format(MCC))

# Print the confusion matrix
conf_matrix = confusion_matrix(yTest, yPred)
print("Confusion Matrix:\n", conf_matrix)

KeyboardInterrupt: 