In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pickle
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# ===== Load Unseen Data =====
fraud_dataset = pd.read_csv('../../dataset/fraud.csv', skiprows=range(1, 220000) , nrows=300000)

# ===== Feature Engineering Functions =====
def add_transaction_speed(df):
    df = df.sort_values(by=["nameOrig", "step"])  
    df["time_diff"] = df.groupby("nameOrig")["step"].diff().fillna(0)
    return df

def add_balance_change_ratio(df):
    df["balance_change_ratio"] = (df["oldbalanceOrg"] - df["newbalanceOrig"]) / (df["oldbalanceOrg"] + 1e-9)
    return df

def add_amount_vs_median(df):
    median_amount = df.groupby("nameOrig")["amount"].median()
    df["median_amount"] = df["nameOrig"].map(median_amount)
    df["amount_vs_median"] = df["amount"] / (df["median_amount"] + 1e-9)
    return df

# ===== Apply Feature Engineering =====
fraud_dataset = add_transaction_speed(fraud_dataset)
fraud_dataset = add_balance_change_ratio(fraud_dataset)
fraud_dataset = add_amount_vs_median(fraud_dataset)

# Drop helper columns
fraud_dataset.drop(columns=["median_amount"], inplace=True)

# ===== One-Hot Encoding =====
fraud_dataset = pd.get_dummies(fraud_dataset, columns=['type'], prefix='type')

# ===== Label Encoding =====
encoder_org = LabelEncoder()
encoder_dest = LabelEncoder()

fraud_dataset['accountOrig'] = encoder_org.fit_transform(fraud_dataset['nameOrig'])
fraud_dataset['accountDest'] = encoder_dest.fit_transform(fraud_dataset['nameDest'])

# Drop original string columns
fraud_dataset.drop(['nameOrig', 'nameDest'], axis=1, inplace=True)

# Drop unnecessary columns
fraud_dataset.drop(columns=['step', 'isFlaggedFraud'], inplace=True)

# ===== Align Features with Training Set =====
with open('random_forest_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# ✅ Align feature columns with training set
expected_features = loaded_model.feature_names_in_

# Add missing columns as 0
missing_cols = set(expected_features) - set(fraud_dataset.columns)
for col in missing_cols:
    fraud_dataset[col] = 0

# Drop extra columns not in model
fraud_dataset = fraud_dataset[expected_features]

# ===== Standardize Data (Use the same scaler) =====
scaler = StandardScaler()
fraud_dataset_scaled = scaler.fit_transform(fraud_dataset)

# ===== Predict Using the Loaded Model =====
predictions = loaded_model.predict(fraud_dataset_scaled)
probabilities = loaded_model.predict_proba(fraud_dataset_scaled)[:, 1]

#Option 1: Lower Threshold (for better sensitivity)
# threshold = 0.01
# final_predictions = (probabilities > threshold).astype(int)

# Option 2: Percentile-Based Threshold (Optional)
threshold = np.percentile(probabilities, 95)  # Top 5% as threshold
final_predictions = (probabilities > threshold).astype(int)

# ===== Show Results =====
results = pd.DataFrame({
    'Predicted': final_predictions,
    'Probability': probabilities
})

# Display Prediction Results
print("\n=== Prediction Results ===")
print(results['Predicted'].value_counts())

# Classification Report (if labels are available)
if 'isFraud' in fraud_dataset.columns:
    y_test = fraud_dataset['isFraud']
    print("\n=== Confusion Matrix ===")
    print(confusion_matrix(y_test, final_predictions))
    print("\n=== Classification Report ===")
    print(classification_report(y_test, final_predictions))

# Plot Confusion Matrix (Optional)
if 'isFraud' in fraud_dataset.columns:
    cm = confusion_matrix(y_test, final_predictions)
    plt.figure(figsize=(7, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Not Fraud', 'Fraud'], yticklabels=['Not Fraud', 'Fraud'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

# Feature Importance (Optional)
importances = loaded_model.feature_importances_
feature_names = expected_features
feature_importance = pd.Series(importances, index=feature_names).sort_values(ascending=False)

print("\n=== Top 10 Important Features ===")
print(feature_importance.head(10))

# Preview Predictions
print("\n=== Sample Predictions ===")
print(results.head(20))





=== Prediction Results ===
Predicted
0    293666
1      6334
Name: count, dtype: int64

=== Top 10 Important Features ===
balance_change_ratio    0.208716
oldbalanceOrg           0.149948
newbalanceOrig          0.131391
accountDest             0.109018
amount                  0.083497
oldbalanceDest          0.080678
newbalanceDest          0.071017
type__PAYMENT           0.060438
type__TRANSFER          0.042184
accountOrig             0.030759
dtype: float64

=== Sample Predictions ===
    Predicted  Probability
0           0         0.06
1           0         0.06
2           0         0.00
3           0         0.04
4           1         0.12
5           0         0.00
6           0         0.06
7           0         0.06
8           0         0.04
9           0         0.04
10          0         0.00
11          0         0.06
12          0         0.04
13          0         0.00
14          0         0.04
15          0         0.00
16          0         0.04
17          0     