In [None]:
#
# ### 1. Importing Libraries
# These are the libraries we will be using for data manipulation, visualization, and machine learning.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, PrecisionRecallDisplay
from sklearn.metrics import precision_recall_curve, average_precision_score
from imblearn.over_sampling import SMOTE
import joblib

# Select seaborn's "whitegrid" style for the plots drawn in the cells below.
sns.set(style="whitegrid")


# ### 2. Loading the Data
# Read the credit-card transactions CSV into `df`, preview it, and print its
# size and summary statistics.
print("Loading the dataset...")
try:
    df = pd.read_csv(r'../data/creditcard.csv')
except FileNotFoundError:
    print("Error: The dataset file was not found. Please check the file path.")
    print("Please ensure you have downloaded the dataset and placed it in the correct directory.")
    # Re-raise so execution stops here with the real cause. Swallowing the
    # error would only defer the failure to a confusing `NameError: df` below.
    raise
else:
    display(df.head())
    print("Dataset loaded successfully.")

print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")
display(df.describe())

# Split the rows by label and report the fraud-to-valid ratio
# (count of Class == 1 divided by count of Class == 0), followed by raw counts.
fraud = df.loc[df['Class'].eq(1)]
valid = df.loc[df['Class'].eq(0)]

fractional_value = len(fraud) / len(valid)
print("Fractional value:", fractional_value)

# The two subsets above already hold exactly the rows being counted here.
print(f"Fraud Cases detected: {len(fraud)}")
print(f"Valid Transactions: {len(valid)}")
print(f"Total Transactions: {len(df)}")


# === Missing Values & Duplicates ===
# Report per-column missing counts and the number of duplicated rows, then
# drop the duplicates and (only if any are present) rows with missing values.
print("\nMissing values per column:\n", df.isnull().sum())
print("Duplicate rows:", df.duplicated().sum())
df = df.drop_duplicates(keep="first")
print("After dropping duplicates:", df.shape)
if df.isna().any().any():
    df = df.dropna(how="any")
    print("After dropping NA:", df.shape)


# === Dataset Info ===
# Column dtypes and non-null counts of the cleaned frame.
print("\nDataset info:")
df.info()


# === Class Balance ===
# Quantify how rare the positive (fraud) class is, then visualise the counts.
transaction = df.shape[0]
fraud_counts = df['Class'].value_counts()

# `.get(1, 0)` yields 0 when label 1 is absent from the counts.
fraud_percentage = fraud_counts.get(1, 0) / transaction * 100
print(f"Percentage of fraudulent transactions: {fraud_percentage:.4f}%")
print("This confirms a severe class imbalance, which must be addressed in our modeling.")

# Bar chart of rows per class, titled directly on the returned Axes.
sns.countplot(data=df, x='Class').set_title("Class Distribution (0: No Fraud, 1: Fraud)")
plt.show()
print(df["Class"].value_counts(normalize=True))


# === Feature Distributions ===
# One 50-bin histogram per raw column; only `Amount` gets a KDE overlay.
for column, use_kde, heading in (
    ('Amount', True, "Transaction Amount Distribution"),
    ('Time', False, "Transaction Time Distribution"),
):
    plt.figure(figsize=(8, 5))
    sns.histplot(df[column], bins=50, kde=use_kde)
    plt.title(heading)
    plt.show()


# === Correlation Heatmap ===
# Pairwise correlations of all columns, drawn without per-cell annotations.
plt.figure(figsize=(12, 6))
sns.heatmap(
    df.corr(),
    cmap="coolwarm",
    annot=False,
).set_title("Correlation Heatmap of Features")
plt.show()

In [None]:
# --- C-2: Baseline Model — Logistic Regression (train & evaluate on original imbalanced data) ---

# Build the feature matrix / target and split them unconditionally.
# The split is seeded (random_state=42), so re-running this cell always yields
# the same partition for the same `df`. Guarding on `globals()` instead would
# silently reuse a stale split left over from an earlier kernel state.
# `Class` is the target; `Amount` and `Time` are left out of the feature set.
features = [c for c in df.columns if c not in ['Class', 'Amount', 'Time']]
X = df[features]
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
print("Created train/test split:", X_train.shape, X_test.shape)

# Train baseline logistic regression (no resampling here)
baseline_lr = LogisticRegression(max_iter=1000,solver="saga", random_state=42)
baseline_lr.fit(X_train, y_train)

# Hard predictions and the predicted probability of the positive class (column 1)
y_pred = baseline_lr.predict(X_test)
y_prob = baseline_lr.predict_proba(X_test)[:, 1]

# Metrics
print("\n=== Baseline Logistic Regression Evaluation ===")
print(classification_report(y_test, y_pred, digits=4))

roc_auc = roc_auc_score(y_test, y_prob)
print(f"ROC AUC: {roc_auc:.4f}")

# Confusion matrix plot (rows: true label, columns: predicted label)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["Valid (0)","Fraud (1)"], yticklabels=["Valid (0)","Fraud (1)"])
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.title("Confusion Matrix — Baseline LR")
plt.show()

# ROC Curve, with the diagonal drawn as the chance-level reference
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, label=f'ROC AUC = {roc_auc:.4f}')
plt.plot([0,1],[0,1], 'k--', alpha=0.6)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve — Baseline LR")
plt.legend()
plt.tight_layout()
plt.show()

# Precision-Recall curve and average precision
precision, recall, _ = precision_recall_curve(y_test, y_prob)
avg_prec = average_precision_score(y_test, y_prob)
plt.figure(figsize=(6,4))
plt.plot(recall, precision, label=f'AP = {avg_prec:.4f}')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve — Baseline LR")
plt.legend()
plt.tight_layout()
plt.show()

print("\nNotes:")
print("- This baseline is trained on the original imbalanced data (no SMOTE/weighting).")
print("- Use these results as a reference when you apply SMOTE or class-weighting in later tasks (C-3, C-4).")

print("          ---Task C-2 complete---")

In [None]:
from imblearn.over_sampling import SMOTE

print("\n=== C-3: Logistic Regression with SMOTE ===")

# Oversample the training split only; the test split is evaluated as-is.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Fit logistic regression on the resampled training data.
lr_smote = LogisticRegression(max_iter=2000, random_state=42)
lr_smote.fit(X_train_sm, y_train_sm)

# Score the original test split: positive-class probability and hard labels.
y_prob_sm = lr_smote.predict_proba(X_test)[:, 1]
y_pred_sm = lr_smote.predict(X_test)

# Metrics
print(classification_report(y_test, y_pred_sm))
print("ROC AUC:", roc_auc_score(y_test, y_prob_sm))
print("Average Precision:", average_precision_score(y_test, y_prob_sm))

# Precision-recall curve for the SMOTE-trained model
precision, recall, _ = precision_recall_curve(y_test, y_prob_sm)
plt.plot(recall, precision)
plt.gca().set(
    xlabel="Recall",
    ylabel="Precision",
    title="Precision-Recall Curve (SMOTE)",
)
plt.show()

print("          ---Task C-3 complete---")

In [None]:
print("\n=== C-4: Random Forest with SMOTE ===")

# Resample the training split with the same seeded SMOTE configuration.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Fit the forest on the resampled training data.
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train_sm, y_train_sm)

# Positive-class probability and hard labels on the untouched test split.
y_prob = rf.predict_proba(X_test)[:, 1]
y_pred = rf.predict(X_test)

# Compute each summary score once and reuse it in the prints and plot legends.
rf_auc = roc_auc_score(y_test, y_prob)
rf_ap = average_precision_score(y_test, y_prob)
print(classification_report(y_test, y_pred, digits=4))
print("ROC AUC:", rf_auc)
print("Average Precision:", rf_ap)

# ROC curve with the chance-level diagonal
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.plot(fpr, tpr, label=f'ROC AUC={rf_auc:.4f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.legend()
plt.show()

# Precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, y_prob)
plt.plot(recall, precision, label=f'AP={rf_ap:.4f}')
plt.legend()
plt.show()

print("          ---Task C-4 complete---")

In [None]:
# === C-5: Final Performance Evaluation ===
# Compare fraud-class recall of the baseline logistic regression (C-2) against
# the Random Forest (C-4) on the same held-out test split.
print("\n=== C-5: Final Model Evaluation (Random Forest vs Baseline) ===")

# Recompute predictions from each fitted model explicitly. The generic
# `y_pred` / `y_prob` names are reused by several cells, so reading them here
# would compare one model against itself and always report 0 improvement.
baseline_pred = baseline_lr.predict(X_test)
rf_pred = rf.predict(X_test)
rf_prob = rf.predict_proba(X_test)[:, 1]

# Baseline metrics (model from C-2)
baseline_report = classification_report(y_test, baseline_pred, digits=4, output_dict=True)

# Random Forest metrics (model from C-4)
rf_report = classification_report(y_test, rf_pred, digits=4, output_dict=True)

# Compare recall for fraud class (label = 1; report keys are the labels as strings)
print(f"Baseline Recall (Fraud=1): {baseline_report['1']['recall']:.4f}")
print(f"Random Forest Recall (Fraud=1): {rf_report['1']['recall']:.4f}")
print(f"Improvement: {rf_report['1']['recall'] - baseline_report['1']['recall']:.4f}")

print("\nFull Report — Random Forest:")
print(classification_report(y_test, rf_pred, digits=4))
print("ROC AUC:", roc_auc_score(y_test, rf_prob))
print("Average Precision:", average_precision_score(y_test, rf_prob))


# === C-6: Save Final Model ===
# Serialise the fitted Random Forest to a file in the current working directory.
model_filename = "final_rf_model.pkl"
print("\n=== C-6: Saving Final Trained Model ===")
joblib.dump(rf, model_filename)
print(f"Model saved as '{model_filename}'")