In [6]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Load the experimental dataset (with "Class" column)
df_exp = pd.read_csv('../data/creditcard.csv')

# Drop duplicates
df_exp = df_exp.drop_duplicates()

# Normalizing the "Amount" column
scaler = StandardScaler()
df_exp["Normalized_Amount"] = scaler.fit_transform(df_exp["Amount"].values.reshape(-1, 1))

# Drop the original "Amount" column (optional)
df_exp = df_exp.drop(["Amount"], axis=1)

# Fetching the target feature ("Class")
Y_exp = df_exp["Class"]

# Fetching the independent features (dropping "Class")
X_exp = df_exp.drop(["Class"], axis=1)

# Handle class imbalance using SMOTE
X_balance, Y_balance = SMOTE().fit_resample(X_exp, Y_exp)

# Split the experimental data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X_balance, Y_balance, test_size=1/3, random_state=42)

# -------- Decision Tree Model --------

# Initialize and train the decision tree classifier
dt_model_exp = DecisionTreeClassifier(random_state=42)
dt_model_exp.fit(X_train, Y_train)
print('Decision Tree Model trained.')

# Load my dummy dataset (without "Class" column)
df = pd.read_csv('../data/credit_card_fraud_data.csv')

# Drop duplicates
df = df.drop_duplicates()

# Normalizing the "Amount" column
df["Normalized_Amount"] = scaler.transform(df["Amount"].values.reshape(-1, 1))

# Drop the "Amount" column
df = df.drop(columns=['Amount'])

# Now predict the "Class" for your dataset using the trained decision tree model
predicted_class = dt_model_exp.predict(df[X_exp.columns])

# Add the predicted "Class" as a new column to your dataset
df['Predicted_Class_DT'] = predicted_class

# Simulate true labels based on some condition (same threshold for both models)
threshold = 5.0  # Example threshold; adjust based on your logic
df['True_Class'] = (df['Normalized_Amount'] > threshold).astype(int)

# Evaluate the Decision Tree model
dt_accuracy = accuracy_score(df['True_Class'], predicted_class)
dt_precision = precision_score(df['True_Class'], predicted_class)
dt_recall = recall_score(df['True_Class'], predicted_class)
dt_f1 = f1_score(df['True_Class'], predicted_class)

# -------- KNN Model --------

# Initialize and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5, algorithm="kd_tree", n_jobs=-1)
knn.fit(X_train, Y_train)
print('KNN Model trained.')

# Predict the "Class" for your dummy dataset using the trained KNN model
knn_predicted_class = knn.predict(df[X_exp.columns])

# Add the predicted "Class" as a new column to your dummy dataset
df['Predicted_Class_KNN'] = knn_predicted_class

# Evaluate the KNN model
knn_accuracy = accuracy_score(df['True_Class'], knn_predicted_class)
knn_precision = precision_score(df['True_Class'], knn_predicted_class)
knn_recall = recall_score(df['True_Class'], knn_predicted_class)
knn_f1 = f1_score(df['True_Class'], knn_predicted_class)

# -------- Comparison --------

# Create a DataFrame for comparison
comparison_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1-score"],
    "Decision Tree": [dt_accuracy, dt_precision, dt_recall, dt_f1],
    "KNN": [knn_accuracy, knn_precision, knn_recall, knn_f1]
})

# Print the comparison table
print("\nComparison of Decision Tree and KNN:")
comparison_df

Decision Tree Model trained.
KNN Model trained.

Comparison of Decision Tree and KNN:


Unnamed: 0,Metric,Decision Tree,KNN
0,Accuracy,0.968,0.967
1,Precision,0.461538,0.25
2,Recall,0.193548,0.032258
3,F1-score,0.272727,0.057143


In [8]:
# Final Comparison Conclusion
print("\nFinal Model Comparison Conclusion:")

# Compare based on F1-score to determine the better model
if knn_f1 > dt_f1:
    best_model = "KNN"
else:
    best_model = "Decision Tree"

print(f"Based on the evaluation metrics, the {best_model} model performs better overall for this credit card fraud detection task.")


Final Model Comparison Conclusion:
Based on the evaluation metrics, the Decision Tree model performs better overall for this credit card fraud detection task.
