# In [None]:
#
# ### 1. Importing Libraries
# These are the libraries we will be using for data manipulation, visualization, and machine learning.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, PrecisionRecallDisplay
from sklearn.metrics import precision_recall_curve, average_precision_score
from imblearn.over_sampling import SMOTE
import joblib

# Apply seaborn's "whitegrid" theme to every plot drawn after this point.
# `set_theme` is the preferred spelling; `sns.set` is only an alias for it.
sns.set_theme(style="whitegrid")


# ### 2. Loading the Data
# Read the transactions CSV into `df`, preview it, and print its shape and
# summary statistics.
# NOTE: `display` is not imported in this file; presumably it is the builtin
# provided by the IPython/Jupyter runtime.
print("Loading the dataset...")
try:
    # Keep only the call that can raise FileNotFoundError inside the try body.
    df = pd.read_csv(r'../data/creditcard.csv')
except FileNotFoundError:
    print("Error: The dataset file was not found. Please check the file path.")
    print("Please ensure you have downloaded the dataset and placed it in the correct directory.")
    # Every later step needs `df`; stop here with the real cause instead of
    # falling through to a confusing NameError on the next statement.
    raise
else:
    display(df.head())
    print("Dataset loaded successfully.")

print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")
display(df.describe())

# Ratio of fraudulent to valid transactions, measured on the data as loaded
# (duplicate rows have not been dropped yet at this point).
# NOTE: this is fraud / valid, not fraud / total, so it is not the share of all
# transactions that are fraudulent.
fraud = df[df['Class'] == 1]
valid = df[df['Class'] == 0]

fractional_value = len(fraud) / len(valid)
print("Fractional value:", fractional_value)

# Reuse the subsets built above rather than filtering the frame again.
print(f"Fraud Cases detected: {len(fraud)}")
print(f"Valid Transactions: {len(valid)}")
print(f"Total Transactions: {len(df)}")


# === Missing Values & Duplicates ===
# Report the NaN count per column and the number of repeated rows, then remove
# the repeated rows from `df`.
missing_per_column = df.isna().sum()
print("\nMissing values per column:\n", missing_per_column)

duplicate_row_count = df.duplicated().sum()
print("Duplicate rows:", duplicate_row_count)

df = df.drop_duplicates()
print("After dropping duplicates:", df.shape)

# Drop incomplete rows only when at least one NaN is still present.
if df.isna().any().any():
    df = df.dropna()
    print("After dropping NA:", df.shape)


# === Dataset Info ===
# Column names, dtypes and non-null counts of the cleaned frame.
print("\nDataset info:")
df.info()


# === Class Balance ===
# Count rows per class label and express label 1 (fraud) as a percentage of
# all rows currently in `df`.
transaction = len(df)
fraud_counts = df['Class'].value_counts()

# `.get(1, 0)` yields 0 when no row carries label 1.
fraud_share = fraud_counts.get(1, 0) / transaction
fraud_percentage = fraud_share * 100
print(f"Percentage of fraudulent transactions: {fraud_percentage:.4f}%")
print("This confirms a severe class imbalance, which must be addressed in our modeling.")

# Bar chart of the row count for each class label.
class_axes = sns.countplot(data=df, x='Class')
class_axes.set_title("Class Distribution (0: No Fraud, 1: Fraud)")
plt.show()

# Same information as relative frequencies.
class_frequencies = df["Class"].value_counts(normalize=True)
print(class_frequencies)


# === Feature Distributions ===
# One 50-bin histogram per column, each in its own figure.
# Each entry is (column name, overlay a KDE curve?, figure title).
histogram_specs = (
    ('Amount', True, "Transaction Amount Distribution"),
    ('Time', False, "Transaction Time Distribution"),
)

for column_name, overlay_kde, figure_title in histogram_specs:
    plt.figure(figsize=(8, 5))
    sns.histplot(df[column_name], bins=50, kde=overlay_kde)
    plt.title(figure_title)
    plt.show()


# === Correlation Heatmap ===
# Pairwise correlation of the columns in `df`, drawn as an unannotated heatmap.
correlation_matrix = df.corr()

plt.figure(figsize=(12, 6))
heatmap_axes = sns.heatmap(correlation_matrix, cmap="coolwarm", annot=False)
heatmap_axes.set_title("Correlation Heatmap of Features")
plt.show()