In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib
import shap
import os


In [None]:
# Set up directory paths: trained models are loaded from MODEL_DIR, and the
# SHAP artifacts (summary plots, SHAP-value CSVs, text report) are written to
# SHAP_OUTPUT_DIR.
MODEL_DIR = "/outputs/models"
SHAP_OUTPUT_DIR = "/outputs/shap_outputs"
# Create the output directory up front; exist_ok=True keeps re-running this cell safe.
os.makedirs(SHAP_OUTPUT_DIR, exist_ok=True)

In [None]:
# Fix non-numeric columns and class values in fraud_data
def fix_fraud_data(df):
    """Return a cleaned copy of the fraud dataset; the input frame is left untouched.

    Processing steps, in order:

    1. Parse ``signup_time`` / ``purchase_time`` (whichever are present) as
       datetimes. When *both* are present, derive ``time_since_signup``
       (hours), ``purchase_hour`` and ``purchase_day`` (``dt.dayofweek``) and
       drop the two raw timestamp columns. A lone timestamp column is parsed
       but kept as-is.
    2. Drop the identifier columns ``user_id``, ``device_id`` and
       ``ip_address`` if they exist.
    3. One-hot encode every remaining text column (``drop_first=True``),
       excluding ``class`` and the derived time features.
    4. Fill missing values with the mean of each numeric column.
    5. Collapse ``class`` to ``{0, 1}``: any value different from 0 becomes 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Fraud data, raw or already processed.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.
    """
    # Work on a copy: the timestamp conversion below assigns columns, which
    # would otherwise modify the caller's DataFrame as a side effect.
    df = df.copy()

    # Convert timestamp columns to datetimes if present
    timestamp_cols = [col for col in ('signup_time', 'purchase_time') if col in df.columns]
    for col in timestamp_cols:
        df[col] = pd.to_datetime(df[col])
    if len(timestamp_cols) == 2:
        elapsed = df['purchase_time'] - df['signup_time']
        df['time_since_signup'] = elapsed.dt.total_seconds() / 3600.0  # Hours
        df['purchase_hour'] = df['purchase_time'].dt.hour
        df['purchase_day'] = df['purchase_time'].dt.dayofweek
        df = df.drop(columns=timestamp_cols)

    # Drop non-numeric identifiers (missing ones are silently skipped)
    df = df.drop(columns=['user_id', 'device_id', 'ip_address'], errors='ignore')

    # Encode categorical variables. Besides plain object columns, also catch
    # columns stored with pandas' dedicated string dtype, which `== 'object'`
    # would miss.
    reserved = {'class', 'time_since_signup', 'purchase_hour', 'purchase_day'}
    categorical_cols = [
        col for col in df.columns
        if col not in reserved
        and (pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]))
    ]
    if categorical_cols:
        df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    # Fill gaps in numeric columns with the column mean
    df = df.fillna(df.mean(numeric_only=True))

    # Ensure class is binary {0, 1}; vectorised form of "1 if x != 0 else 0"
    if 'class' in df.columns:
        df['class'] = (df['class'] != 0).astype(int)

    return df



In [None]:
# Fix creditcard_data
def fix_creditcard_data(df):
    """Normalise the credit-card dataset and return the result as a new frame.

    The label column ``Class`` is renamed to ``class`` (when present), its
    values are collapsed to ``{0, 1}`` (anything other than 0 becomes 1), and
    missing values are then replaced by the mean of each numeric column.
    """
    def _to_flag(value):
        # Every non-zero label counts as the positive class.
        return 1 if value != 0 else 0

    renamed = df.rename(columns={'Class': 'class'})

    if 'class' in renamed.columns:
        flags = renamed['class'].apply(_to_flag)
        renamed['class'] = flags.astype(int)

    column_means = renamed.mean(numeric_only=True)
    return renamed.fillna(column_means)



In [None]:
def generate_shap_explanations(model, X, dataset_name):
    """Compute SHAP values for a tree model, save the artifacts, and rank features.

    Side effects (both written under ``SHAP_OUTPUT_DIR``):

    * ``{dataset_name}_shap_summary.png`` - SHAP summary plot.
    * ``{dataset_name}_shap_values.csv`` - per-row SHAP values for the
      positive class, one column per feature.

    Parameters
    ----------
    model : fitted tree-based model accepted by ``shap.TreeExplainer``.
    X : pandas.DataFrame
        Feature matrix to explain; its columns label the outputs.
    dataset_name : str
        Prefix used for the output file names and the plot title.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance`` (mean absolute SHAP value),
        sorted by descending importance.
    """
    # Initialize SHAP explainer for Random Forest
    explainer = shap.TreeExplainer(model)

    # Compute SHAP values
    raw_values = explainer.shap_values(X)

    # Pick the SHAP values for class 1 (fraud). Depending on the SHAP release,
    # a classifier yields either a list with one (n_samples, n_features) array
    # per class, or a single (n_samples, n_features, n_classes) array. Plain
    # `raw_values[1]` is only correct for the list form; on the 3-D array it
    # would select the second *sample* instead of the second class.
    if isinstance(raw_values, list):
        class_index = 1 if len(raw_values) > 1 else 0
        shap_values_class1 = np.asarray(raw_values[class_index])
    else:
        raw_values = np.asarray(raw_values)
        if raw_values.ndim == 3:
            class_index = 1 if raw_values.shape[2] > 1 else 0
            shap_values_class1 = raw_values[:, :, class_index]
        else:
            # Single-output model: the array is already (n_samples, n_features).
            shap_values_class1 = raw_values

    # SHAP summary plot; always close the figure, even if plotting or saving
    # fails, so figures do not pile up in the kernel.
    plt.figure()
    try:
        shap.summary_plot(shap_values_class1, X, show=False)
        plt.title(f"SHAP Summary Plot - {dataset_name} (Random Forest)")
        plt.tight_layout()
        plt.savefig(os.path.join(SHAP_OUTPUT_DIR, f"{dataset_name}_shap_summary.png"))
    finally:
        plt.close()

    # Save SHAP values to CSV
    shap_df = pd.DataFrame(shap_values_class1, columns=X.columns)
    shap_df.to_csv(os.path.join(SHAP_OUTPUT_DIR, f"{dataset_name}_shap_values.csv"), index=False)

    # Generate feature importance (mean absolute SHAP values)
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': np.abs(shap_values_class1).mean(axis=0)
    }).sort_values(by='importance', ascending=False)

    return feature_importance



In [None]:
def write_shap_report(fraud_importance, creditcard_importance=None):
    """Render the SHAP text report, save it as ``shap_explanation.txt``, and print it.

    ``fraud_importance`` and ``creditcard_importance`` are feature-importance
    tables; only their first five rows are included. When
    ``creditcard_importance`` is None the credit-card section states that the
    analysis was skipped. The file is written under ``SHAP_OUTPUT_DIR``.
    """
    fraud_text = fraud_importance.head(5).to_string(index=False)

    if creditcard_importance is None:
        creditcard_text = "Skipped due to empty dataset or processing error."
    else:
        top_creditcard = creditcard_importance.head(5).to_string(index=False)
        creditcard_text = (
            "- Top features contributing to fraud prediction:\n"
            f"{top_creditcard}\n"
            "- Key observations: PCA components (e.g., V1, V2) are critical, indicating complex patterns in fraud detection."
        )

    # The two "{}" slots receive the fraud table and the credit-card section.
    template = """
    SHAP Explanation Report
    ======================
    This report summarizes the feature importance for fraud detection models using SHAP values.

    Fraud_Data Insights:
    - Top features contributing to fraud prediction (based on mean absolute SHAP values):
    {}
    - Key observations: Features like 'time_since_signup' and 'purchase_hour' likely influence fraud due to temporal patterns in fraudulent behavior.

    creditcard Insights:
    {}
    """
    report = template.format(fraud_text, creditcard_text)

    report_path = os.path.join(SHAP_OUTPUT_DIR, "shap_explanation.txt")
    with open(report_path, "w") as handle:
        handle.write(report)
    print(report)



In [None]:
# Load preprocessed data
# The fraud dataset is mandatory: if it is missing, show what the expected
# directory contains and re-raise so the notebook stops here.
FRAUD_DATA_PATH = '/outputs/processed_fraud_data.csv'
try:
    fraud_data = pd.read_csv(FRAUD_DATA_PATH)
except FileNotFoundError:
    print("processed_fraud_data.csv not found. Please check the path:")
    # List the directory the CSV was expected in, using plain Python so the
    # cell does not depend on IPython shell escapes.
    expected_dir = os.path.dirname(FRAUD_DATA_PATH)
    if os.path.isdir(expected_dir):
        print(sorted(os.listdir(expected_dir)))
    else:
        print(f"{expected_dir} does not exist.")
    raise

# The credit-card dataset is optional. Try the processed CSV first; if it is
# missing or holds no rows, fall back to the raw CSV. creditcard_data_processed
# records whether any credit-card data was loaded.
creditcard_data_processed = False
try:
    creditcard_data = pd.read_csv('/outputs/processed_creditcard_data.csv')
except FileNotFoundError:
    print("processed_creditcard_data.csv not found. Attempting to load raw creditcard.csv.")
except pd.errors.EmptyDataError:
    # A zero-byte file makes read_csv raise instead of returning an empty frame.
    print("Warning: processed_creditcard_data.csv is empty. Attempting to load raw creditcard.csv.")
else:
    if creditcard_data.empty:
        print("Warning: processed_creditcard_data.csv is empty. Attempting to load raw creditcard.csv.")
    else:
        creditcard_data_processed = True

# Single fallback path shared by the "missing" and "empty" cases above.
if not creditcard_data_processed:
    try:
        creditcard_data = pd.read_csv('/data/creditcard.csv')
        print("Loaded raw creditcard.csv as fallback.")
        creditcard_data_processed = True
    except FileNotFoundError:
        print("Raw creditcard.csv not found. Skipping creditcard_data processing.")
        creditcard_data = pd.DataFrame()



In [None]:
# Fix data
fraud_data = fix_fraud_data(fraud_data)
if creditcard_data_processed:
    creditcard_data = fix_creditcard_data(creditcard_data)

# Verify data types and class values
print("Fraud_Data dtypes:\n", fraud_data.dtypes)
print("\nFraud_Data class values:\n", fraud_data['class'].value_counts())
if creditcard_data_processed:
    print("\ncreditcard dtypes:\n", creditcard_data.dtypes)
    print("\ncreditcard class values:\n", creditcard_data['class'].value_counts())

# Load Random Forest models
# A missing model file is reported and the corresponding variable stays None.
fraud_model = None
creditcard_model = None
try:
    fraud_model = joblib.load(os.path.join(MODEL_DIR, "fraud_data_random_forest_model.pkl"))
except FileNotFoundError:
    print("fraud_data_random_forest_model.pkl not found. Check Task 2 outputs.")
# The credit-card model is only needed when credit-card data was loaded.
if creditcard_data_processed:
    try:
        creditcard_model = joblib.load(os.path.join(MODEL_DIR, "creditcard_random_forest_model.pkl"))
    except FileNotFoundError:
        print("creditcard_random_forest_model.pkl not found. Skipping creditcard_data SHAP analysis.")


In [None]:
# The credit-card dataset is optional. Try the processed CSV first; if it is
# missing or holds no rows, fall back to the raw CSV. creditcard_data_processed
# records whether any credit-card data was loaded.
creditcard_data_processed = False
try:
    creditcard_data = pd.read_csv('/outputs/processed_creditcard_data.csv')
except FileNotFoundError:
    print("processed_creditcard_data.csv not found. Attempting to load raw creditcard.csv.")
except pd.errors.EmptyDataError:
    # A zero-byte file makes read_csv raise instead of returning an empty frame.
    print("Warning: processed_creditcard_data.csv is empty. Attempting to load raw creditcard.csv.")
else:
    if creditcard_data.empty:
        print("Warning: processed_creditcard_data.csv is empty. Attempting to load raw creditcard.csv.")
    else:
        creditcard_data_processed = True

# Single fallback path shared by the "missing" and "empty" cases above.
if not creditcard_data_processed:
    try:
        creditcard_data = pd.read_csv('/data/creditcard.csv')
        print("Loaded raw creditcard.csv as fallback.")
        creditcard_data_processed = True
    except FileNotFoundError:
        print("Raw creditcard.csv not found. Skipping creditcard_data processing.")
        creditcard_data = pd.DataFrame()



In [None]:
# Clean the datasets (credit-card data only when it was loaded)
fraud_data = fix_fraud_data(fraud_data)
if creditcard_data_processed:
    creditcard_data = fix_creditcard_data(creditcard_data)

# Sanity check: column dtypes and label distribution of each dataset
print("Fraud_Data dtypes:\n", fraud_data.dtypes)
print("\nFraud_Data class values:\n", fraud_data['class'].value_counts())
if creditcard_data_processed:
    print("\ncreditcard dtypes:\n", creditcard_data.dtypes)
    print("\ncreditcard class values:\n", creditcard_data['class'].value_counts())

# Load the Random Forest models; a model stays None when its file is missing
fraud_model = creditcard_model = None

try:
    fraud_model = joblib.load(
        os.path.join(MODEL_DIR, "fraud_data_random_forest_model.pkl")
    )
except FileNotFoundError:
    print("fraud_data_random_forest_model.pkl not found. Check Task 2 outputs.")

if creditcard_data_processed:
    try:
        creditcard_model = joblib.load(
            os.path.join(MODEL_DIR, "creditcard_random_forest_model.pkl")
        )
    except FileNotFoundError:
        print("creditcard_random_forest_model.pkl not found. Skipping creditcard_data SHAP analysis.")


In [None]:
# Generate SHAP explanations
# Each dataset is explained independently; a failure is reported and leaves the
# corresponding importance table as None.
fraud_importance = None
creditcard_importance = None
if fraud_model is not None and not fraud_data.empty:
    try:
        fraud_X = fraud_data.drop(columns=['class'])
        fraud_importance = generate_shap_explanations(fraud_model, fraud_X, "fraud_data")
    except Exception as e:
        print(f"Error generating SHAP for fraud_data: {e}")

if creditcard_model is not None and creditcard_data_processed and not creditcard_data.empty:
    try:
        creditcard_X = creditcard_data.drop(columns=['class'])
        creditcard_importance = generate_shap_explanations(creditcard_model, creditcard_X, "creditcard")
    except Exception as e:
        print(f"Error generating SHAP for creditcard_data: {e}")

# Write SHAP report
# The report requires the fraud results; the credit-card section is optional.
if fraud_importance is not None:
    write_shap_report(fraud_importance, creditcard_importance)
else:
    print("Error: No SHAP explanations generated. Check input data and models.")


In [None]:
# Generate SHAP explanations
fraud_importance = None
creditcard_importance = None
if fraud_model is not None and not fraud_data.empty:
    try:
        fraud_X = fraud_data.drop(columns=['class'])
        fraud_importance = generate_shap_explanations(fraud_model, fraud_X, "fraud_data")
    except Exception as e:
        print(f"Error generating SHAP for fraud_data: {e}")

if creditcard_model is not None and creditcard_data_processed and not creditcard_data.empty:
    try:
        creditcard_X = creditcard_data.drop(columns=['class'])
        creditcard_importance = generate_shap_explanations(creditcard_model, creditcard_X, "creditcard")
    except Exception as e:
        print(f"Error generating SHAP for creditcard_data: {e}")

# Write SHAP report
if fraud_importance is not None:
    write_shap_report(fraud_importance, creditcard_importance)
else:
    print("Error: No SHAP explanations generated. Check input data and models.")
