In [10]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

try:
    # Get the base dataset path from environment variable
    base_path = os.getenv("DATASET_PATH")
    if base_path is None or base_path.strip() == "":
        raise EnvironmentError("The environment variable 'DATASET_PATH' is not set or is empty.")

    # Load the scaled data
    scaled_path = os.path.join(base_path, "scaled_data.csv")
    df = pd.read_csv(scaled_path)
    print("✅ Scaled data loaded successfully from:", scaled_path)
    print(df.head())

    # Load the original data (with target column)
    original_path = os.path.join(base_path, "transactions.csv")
    raw_df = pd.read_csv(original_path)
    print("\n✅ Original data loaded successfully from:", original_path)
    print(raw_df.head())

except FileNotFoundError as e:
    print(f"❌ File not found: {e.filename}")
except pd.errors.ParserError as e:
    print(f"❌ CSV parsing error: {e}")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

✅ Scaled data loaded successfully from: ../data/scaled_data.csv
     amount  oldBalInitiator  newBalInitiator  oldBalRecipient  \
0 -0.401636        -1.717325        -1.754478        -0.484986   
1 -0.676460        -1.861130        -1.883910        -0.485028   
2 -0.677219        -1.861065        -1.884266        -0.485028   
3 -0.401321        -1.861483        -1.899568        -0.464584   
4 -0.677204        -1.739208        -1.761637        -0.485028   

   newBalRecipient  transactionType_DEPOSIT  transactionType_PAYMENT  \
0        -0.414541                      0.0                      0.0   
1        -0.511633                      0.0                      1.0   
2        -0.508961                      0.0                      1.0   
3        -0.394835                      0.0                      0.0   
4        -0.508956                      0.0                      1.0   

   transactionType_TRANSFER  transactionType_WITHDRAWAL  
0                       1.0                     

In [11]:
try:
    # Ensure both DataFrames are loaded
    if 'df' not in locals() or 'raw_df' not in locals():
        raise ValueError("Both 'df' (scaled) and 'raw_df' (original) must be loaded.")

    # Check if 'isFraud' exists in the original dataset
    if 'isFraud' not in raw_df.columns:
        raise KeyError("'isFraud' column not found in original dataset.")

    # Add the target column to the scaled data
    df['isFraud'] = raw_df['isFraud']

    # Save updated file
    updated_path = os.path.join(base_path, "scaled_data.csv")
    df.to_csv(updated_path, index=False)

    print("✅ 'isFraud' column successfully merged and updated CSV saved.")
    print(df.head())

except Exception as e:
    print(f"❌ Failed to merge 'isFraud' into scaled dataset: {e}")

✅ 'isFraud' column successfully merged and updated CSV saved.
     amount  oldBalInitiator  newBalInitiator  oldBalRecipient  \
0 -0.401636        -1.717325        -1.754478        -0.484986   
1 -0.676460        -1.861130        -1.883910        -0.485028   
2 -0.677219        -1.861065        -1.884266        -0.485028   
3 -0.401321        -1.861483        -1.899568        -0.464584   
4 -0.677204        -1.739208        -1.761637        -0.485028   

   newBalRecipient  transactionType_DEPOSIT  transactionType_PAYMENT  \
0        -0.414541                      0.0                      0.0   
1        -0.511633                      0.0                      1.0   
2        -0.508961                      0.0                      1.0   
3        -0.394835                      0.0                      0.0   
4        -0.508956                      0.0                      1.0   

   transactionType_TRANSFER  transactionType_WITHDRAWAL  isFraud  
0                       1.0              

In [12]:
X = df.drop(columns=['isFraud'])
y = df['isFraud']

In [14]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    class_weight='balanced',  # handles class imbalance
    random_state=42,
    n_jobs=-1
)

In [15]:
# Split features (X) and target (y)
X = df.drop(columns=['isFraud'])
y = df['isFraud']

# Train-test split (stratify ensures class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Initialize the Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    class_weight='balanced',  # important for fraud detection
    random_state=42,
    n_jobs=-1
)

# Fit the model
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf_model.predict(X_test)

print("✅ Random Forest Results")
print(classification_report(y_test, y_pred, digits=4))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

✅ Random Forest Results
              precision    recall  f1-score   support

           0     0.8964    0.9923    0.9419    294828
           1     0.1261    0.0096    0.0179     34152

    accuracy                         0.8903    328980
   macro avg     0.5112    0.5009    0.4799    328980
weighted avg     0.8164    0.8903    0.8460    328980

ROC AUC Score: 0.5009483446598749
Confusion Matrix:
 [[292547   2281]
 [ 33823    329]]
