In [1]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    classification_report
)



Could not save font_manager cache Lock error: Matplotlib failed to acquire the following lock file:
    C:\Users\deepa\.matplotlib\fontlist-v390.json.matplotlib-lock
This maybe due to another process holding this lock file.  If you are sure no
other Matplotlib process is running, remove this file and try again.


In [2]:
# ------------------ 1) Load dataset ------------------
# The CSV path is relative, so it is resolved against the kernel's working directory.
path = "creditcard_2023.csv"
df = pd.read_csv(filepath_or_buffer=path)



In [3]:
# ------------------ 2) Detect target column ------------------
# Candidate label names in priority order: the first one that exists as a
# column becomes the target; if none exists, target_col is left as None.
preferred_names = ['Class', 'class', 'Fraud', 'fraud', 'isFraud', 'IsFraud', 'target', 'Target', 'label', 'Label', 'y']

target_col = next(
    (candidate for candidate in preferred_names if candidate in df.columns),
    None,
)



In [4]:
# If not found by name, try to infer a binary target column
def find_binary_target(dataframe):
    """Guess which column of ``dataframe`` is a binary label.

    A column is a candidate when it has exactly two distinct non-null values
    and a numeric or boolean dtype. Among the candidates, the one whose rarer
    value has the smallest relative frequency is chosen (the most imbalanced
    column); when several candidates tie, the earliest in column order wins.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    The label of the selected column, or ``None`` when no column qualifies.
    """

    def _is_binary_numeric(label):
        series = dataframe[label]
        # Cardinality is checked first, on non-null values only.
        if len(series.dropna().unique()) != 2:
            return False
        return (
            pd.api.types.is_numeric_dtype(series)
            or pd.api.types.is_bool_dtype(series)
        )

    def _minority_share(label):
        # Proportion of the less frequent of the two values (nulls excluded).
        shares = dataframe[label].value_counts(normalize=True, dropna=True)
        return shares.min()

    candidates = [label for label in dataframe.columns if _is_binary_numeric(label)]
    if not candidates:
        return None
    # min() returns the first minimal element, so ties go to the earliest column.
    return min(candidates, key=_minority_share)

# Name-based detection found nothing: fall back to inference, and stop with an
# explanatory error if that yields no column either.
if target_col is None:
    target_col = find_binary_target(df)
    if target_col is None:
        raise ValueError(
            "Could not automatically detect a binary target column. "
            f"Available columns: {list(df.columns)}. "
            "Please rename your label to one of "
            "['Class','Fraud','isFraud','target','label','y'] or provide a binary label column."
        )



In [5]:
# ------------------ Basic info ------------------
# Running list of summary strings; the first entry records the chosen label column.
report_lines = [f"Detected target column: {target_col}"]



In [6]:
# ------------------ Handle categorical/non-numeric features (one-hot encode) ------------------
# Pull the label out first so it never takes part in the one-hot encoding.
y = df[target_col]
X_df = df.drop(target_col, axis="columns")



In [7]:
# Impute missing feature values before encoding:
#   numeric columns     -> column median
#   non-numeric columns -> most frequent value ("missing" when no mode exists)
for col in X_df.columns:
    column = X_df[col]
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.median()
    else:
        modes = column.mode()
        fill_value = "missing" if modes.empty else modes.iloc[0]
    X_df[col] = column.fillna(fill_value)



In [8]:
# One-hot encode non-numeric columns (numeric columns pass through unchanged).
# drop_first=True drops one level per encoded column.
# dtype=float is requested explicitly: on pandas >= 2.0 the indicator columns
# default to bool, and a frame mixing bool and float columns turns into an
# object-dtype array when `.values` is taken for the train/test split.
X_df = pd.get_dummies(X_df, drop_first=True, dtype=float)



In [10]:
# Ensure y is numeric binary {0,1}

# Missing labels cannot be stratified on or trained with, and the 0/1 check at
# the end of this cell ignores NaN, so reject them here with a clear message.
n_missing_labels = int(y.isna().sum())
if n_missing_labels:
    raise ValueError(
        f"Target column '{target_col}' contains {n_missing_labels} missing value(s); "
        "drop or impute them before modelling."
    )

if pd.api.types.is_bool_dtype(y):
    y = y.astype(int)

# If labels are strings like 'fraud'/'legit', map to 1/0 by minority class heuristic
if not pd.api.types.is_numeric_dtype(y):
    vc = y.value_counts()
    if len(vc) != 2:
        raise ValueError(f"Target column '{target_col}' is not binary. Unique values: {y.unique()}")
    # value_counts() orders by descending frequency, so index 0 is the more
    # frequent label and index 1 the less frequent one.
    labels_sorted = vc.index.tolist()
    if vc.iloc[0] > vc.iloc[1]:
        # Clear majority: the minority label becomes the positive class (1).
        mapping = {labels_sorted[0]: 0, labels_sorted[1]: 1}
    else:
        # Exact tie: there is no minority, so the assignment is arbitrary;
        # the first label reported by value_counts() is encoded as 1.
        mapping = {labels_sorted[0]: 1, labels_sorted[1]: 0}
    y = y.map(mapping)

# Verify binary 0/1
unique_y = sorted(pd.Series(y.unique()).dropna().tolist())
if not (len(unique_y) == 2 and set(unique_y) == {0, 1}):
    raise ValueError(f"Target column '{target_col}' is not binary 0/1 after processing. Found: {unique_y}")



In [13]:
# Class balance summary: record how many rows carry each label value.
class_counts = pd.Series(y).value_counts()
report_lines.append("Class distribution:\n" + str(class_counts))

# ------------------ 3) Train/Test split ------------------
# 80/20 split with a fixed seed; stratifying on the labels keeps the class
# proportions the same in both partitions.
feature_matrix = X_df.values
label_vector = y.values
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    label_vector,
    test_size=0.2,
    random_state=42,
    stratify=label_vector,
)

# ------------------ 4) Create and evaluate model using cross-validation ------------------
# 10-fold cross-validation of an LDA classifier, run on the training split only
# so the held-out test split stays untouched.
classifier = LinearDiscriminantAnalysis()
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# Report the mean and spread of the per-fold scores as percentages.
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Cross-Validation Accuracy: {:.2f} %".format(mean_pct))
print("Cross-Validation Standard Deviation: {:.2f} %".format(std_pct))


Cross-Validation Accuracy: 98.89 %
Cross-Validation Standard Deviation: 0.03 %
