In [None]:
# ========================
# Credit Card Fraud Detection
# Tasks C-1 to C-6
# ========================

# --- 1. Import Libraries ---
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Sthepa
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
import joblib

sns.set_style("whitegrid")

# --- 2. Load Data ---
# The CSV path is relative to the working directory the notebook runs in.
print("Loading dataset...")
df = pd.read_csv("../data/creditcard.csv")
print("Dataset shape:", df.shape)
display(df.head())

# --- 3. Exploratory Data Analysis (C-1) ---
print("\nDataset info:")
df.info()

# Print the per-column missing counts that the label promises (a Series, one
# entry per column), and report the grand total on its own line.
missing_per_column = df.isna().sum()
print("\nMissing values per column:\n", missing_per_column)
print("Total missing values:", int(missing_per_column.sum()))
print("Duplicates:", df.duplicated().sum())

# Split rows by the target label: Class == 1 is fraud, Class == 0 is valid.
fraud = df[df["Class"] == 1]
valid = df[df["Class"] == 0]
print("Fraud cases:", len(fraud))
print("Valid cases:", len(valid))
print("Fraud proportion: {:.4f}%".format(len(fraud) / len(df) * 100))

# Bar chart of the class counts to visualise the imbalance.
sns.countplot(x="Class", data=df)
plt.title("Fraud (1) vs Valid (0)")
plt.show()

# --- 4. Preprocessing ---
# Standardise Amount and (when present) Time with ONE scaler fitted on all
# scaled columns at once, so the `scaler` object keeps the mean/variance of
# every column it is responsible for. Refitting a single StandardScaler
# column by column would leave it holding only the statistics of the last
# column fitted. StandardScaler standardises each column independently, so
# the resulting values are the same as with separate per-column fits.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split below, so test-set statistics leak into preprocessing;
# consider fitting on the training rows only.
scale_cols = ["Amount"] + (["Time"] if "Time" in df.columns else [])
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[scale_cols])
for col_idx, col_name in enumerate(scale_cols):
    df[col_name + "_scaled"] = scaled_values[:, col_idx]

# Features and target: every column except the label and the raw (unscaled)
# Amount/Time columns.
features = [c for c in df.columns if c not in ["Class", "Amount", "Time"]]
X = df[features]
y = df["Class"]

# Train-test split, stratified on the label so both parts keep the same
# fraud ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
print("Train size:", X_train.shape, "Test size:", X_test.shape)

# --- 5. Baseline Model: Logistic Regression (C-2) ---
# Reference point: logistic regression trained on the training split as-is
# (no resampling). `fit` returns the estimator, so it can be chained.
baseline_lr = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Score the held-out split: fraud-class probabilities (second column of
# predict_proba) and hard class predictions.
y_prob = baseline_lr.predict_proba(X_test)[:, 1]
y_pred = baseline_lr.predict(X_test)

print("\n=== Baseline Logistic Regression ===")
print(classification_report(y_test, y_pred, digits=4))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

# Confusion matrix drawn as an annotated heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Valid", "Fraud"],
    yticklabels=["Valid", "Fraud"],
)
plt.title("Confusion Matrix - Baseline Logistic Regression")
plt.show()

# --- 6. Handle Imbalance with SMOTE (C-3) ---
# Oversample the training split only; the test split is left untouched so
# evaluation uses the original class distribution.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Class counts (index 0 = valid, index 1 = fraud) before and after resampling.
print("\nBefore SMOTE:", np.bincount(y_train))
print("After SMOTE :", np.bincount(y_train_sm))

# Same logistic-regression settings as the baseline, trained on resampled data.
lr_sm = LogisticRegression(max_iter=2000).fit(X_train_sm, y_train_sm)

y_prob_sm = lr_sm.predict_proba(X_test)[:, 1]
y_pred_sm = lr_sm.predict(X_test)

print("\n=== Logistic Regression + SMOTE ===")
print(classification_report(y_test, y_pred_sm, digits=4))
print("ROC AUC:", roc_auc_score(y_test, y_prob_sm))

# --- 7. Advanced Model: Random Forest (C-4) ---
# 300-tree forest trained on the SMOTE-resampled training data, using all
# available cores (n_jobs=-1) and a fixed seed.
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
).fit(X_train_sm, y_train_sm)

# Evaluate on the original (non-resampled) test split.
y_prob_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

print("\n=== Random Forest + SMOTE ===")
print(classification_report(y_test, y_pred_rf, digits=4))
print("ROC AUC:", roc_auc_score(y_test, y_prob_rf))

# --- 8. Comparison of Models (C-5) ---
def summarize(name, y_true, y_pred, y_prob):
    """Bundle one model's evaluation results into a dict.

    Args:
        name: Label stored under the "model" key.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, passed to ``classification_report``.
        y_prob: Scores passed to ``roc_auc_score``.

    Returns:
        A dict with keys "model", "roc_auc" and "report"; "report" is the
        ``classification_report`` output in dict form.
    """
    auc = roc_auc_score(y_true, y_prob)
    per_class = classification_report(y_true, y_pred, output_dict=True)
    return {"model": name, "roc_auc": auc, "report": per_class}

# Collect the evaluation summary of each model on the same test split.
results = [
    summarize("Baseline LR", y_test, y_pred, y_prob),
    summarize("LR + SMOTE", y_test, y_pred_sm, y_prob_sm),
    summarize("RF + SMOTE", y_test, y_pred_rf, y_prob_rf),
]

# One row per model with the fraud-class ("1") metrics from each report.
comparison = pd.DataFrame([{
    "Model": r["model"],
    "ROC AUC": r["roc_auc"],
    "Recall (Fraud)": r["report"]["1"]["recall"],
    "Precision (Fraud)": r["report"]["1"]["precision"],
    "F1-score (Fraud)": r["report"]["1"]["f1-score"]
} for r in results])

# Display explicitly: a bare expression is only rendered by the notebook when
# it is the last statement of a cell, so the table would otherwise be dropped.
display(comparison)

# --- 9. Save Final Model (C-6) ---
# Create the output directory first; joblib.dump does not create missing
# parent directories and would otherwise fail after all the training above.
os.makedirs("../models", exist_ok=True)

# Persist the random forest together with the fitted scaler object.
joblib.dump(rf, "../models/final_rf.joblib")
joblib.dump(scaler, "../models/amount_time_scaler.joblib")
print("Final Random Forest model saved to ../models/final_rf.joblib")

ModuleNotFoundError: No module named 'matplotlib'