In [7]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# User Settings
DATA_PATH = "Fraud.csv"        
TARGET_COLUMN = "isFraud"      
MODEL_OUTPUT = "fraud_model.pkl"
TRAINING_COLUMNS_OUTPUT = "training_columns.pkl"
LABEL_ENCODERS_OUTPUT = "label_encoders.pkl"
SAMPLE_FRACTION = 0.1          

# Load Data
print("Loading dataset...")
df = pd.read_csv(DATA_PATH)

# Optional sampling for faster training
if SAMPLE_FRACTION is not None:
    df = df.sample(frac=SAMPLE_FRACTION, random_state=42)
    print(f"Sampled {len(df)} rows for quick training.")

# Preprocessing
print("Preprocessing...")
# Drop obvious non-feature columns (optional)
df = df.drop(columns=['nameOrig', 'nameDest'], errors='ignore')

# Handle categorical columns with LabelEncoder
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Separate features and target
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# Save training column names for later prediction alignment
joblib.dump(X.columns.tolist(), TRAINING_COLUMNS_OUTPUT)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Model Training
print("Training RandomForestClassifier...")
model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight="balanced",
    n_jobs=-1
)
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC AUC Score: {roc_auc:.4f}")

# Save Model and Encoders
print(f"Saving model to {MODEL_OUTPUT}...")
joblib.dump(model, MODEL_OUTPUT)
joblib.dump(label_encoders, LABEL_ENCODERS_OUTPUT)

print("Model training complete. Model, label encoders, and training columns saved.")

Loading dataset...
Sampled 636262 rows for quick training.
Preprocessing...
Training RandomForestClassifier...

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    127090
           1       0.98      0.74      0.84       163

    accuracy                           1.00    127253
   macro avg       0.99      0.87      0.92    127253
weighted avg       1.00      1.00      1.00    127253

ROC AUC Score: 0.9966
Saving model to fraud_model.pkl...
Model training complete. Model, label encoders, and training columns saved.


In [8]:
import pandas as pd
import joblib

# Load Trained Model
MODEL_PATH = "fraud_model.pkl"
model = joblib.load(MODEL_PATH)

# Load Dataset
DATA_PATH = "Fraud.csv"
df_new = pd.read_csv(DATA_PATH)

# Drop target column
if 'isFraud' in df_new.columns:
    df_new_features = df_new.drop(['isFraud'], axis=1)
else:
    df_new_features = df_new.copy()

# Drop high-cardinality ID columns
drop_cols = ['nameOrig', 'nameDest']  # IDs not useful for prediction
df_new_features = df_new_features.drop(columns=drop_cols, errors='ignore')

# ===== Convert categorical 'type' into dummy variables =====
df_new_features = pd.get_dummies(df_new_features, columns=['type'], drop_first=True)

# ===== Align columns with training features =====
training_columns = joblib.load("training_columns.pkl")
df_new_features = df_new_features.reindex(columns=training_columns, fill_value=0)

# ===== Run Predictions =====
predictions = model.predict(df_new_features)
prediction_probs = model.predict_proba(df_new_features)[:, 1]

# ===== Save Predictions =====
df_results = df_new.copy()
df_results['predicted_isFraud'] = predictions
df_results['fraud_probability'] = prediction_probs

df_results.to_csv("fraud_predictions.csv", index=False)
print("Predictions saved to fraud_predictions.csv")

# Preview
print(df_results[['predicted_isFraud', 'fraud_probability']].head())

Predictions saved to fraud_predictions.csv
   predicted_isFraud  fraud_probability
0                  0              0.000
1                  0              0.000
2                  0              0.035
3                  0              0.035
4                  0              0.000
