Importing the libraries for data cleaning

In [76]:
import pandas as pd
import numpy as np

Importing the dataset and printing first 5 rows

In [77]:
df = pd.read_csv('Fraud.csv')
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


Removing duplicate rows

In [78]:
df.drop_duplicates(subset=["nameOrig"], keep="last", inplace=True)
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


Dropping columns to reduce noise when training

In [79]:
df.drop('nameOrig', inplace=True, axis=1)
df.drop('step', inplace=True, axis=1)
df.drop('nameDest', inplace=True, axis=1)
df.drop('isFlaggedFraud', inplace=True, axis=1)

In [14]:
df.head()

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0
1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0
2,TRANSFER,181.0,181.0,0.0,0.0,0.0,1
3,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1
4,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0


Checking data formatting of 9th row

In [80]:
df.loc[8]

type              PAYMENT
amount            4024.36
oldbalanceOrg      2671.0
newbalanceOrig        0.0
oldbalanceDest        0.0
newbalanceDest        0.0
isFraud                 0
Name: 8, dtype: object

Encoding the 'Type' column using label encoding and finding what they stand for

In [84]:
from sklearn.preprocessing import LabelEncoder

# Assuming you have already fitted the LabelEncoder during preprocessing
label_encoder = LabelEncoder()

# Fit the encoder on the 'type' column
types = ['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN']
label_encoder.fit(types)

# Find the mapping of unique string values to encoded labels
class_mapping = list(zip(label_encoder.classes_, range(len(label_encoder.classes_))))

# Print the mapping
print("Label Encoding Mapping:")
for category, encoded_value in class_mapping:
    print(f"{category} -> {encoded_value}")


Label Encoding Mapping:
CASH_IN -> 0
CASH_OUT -> 1
DEBIT -> 2
PAYMENT -> 3
TRANSFER -> 4


Importing xgboost model for training

In [24]:
import xgboost as xgb

Training the model

In [25]:
from sklearn.model_selection import train_test_split

# Defining x and y
X = df.drop('isFraud', axis=1)
y = df['isFraud']

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Finding best hyperparameters

In [47]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
import optuna

def objective(trial):
    # Defining the hyperparameters to tune for binary classification
    params = {
        "objective": "binary:logistic",  # Binary classification objective
        "n_estimators": 1000,            # Number of estimators (trees)
        "verbosity": 0,                  # Suppress output
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),  # Learning rate
        "max_depth": trial.suggest_int("max_depth", 1, 10),  # Maximum depth of trees
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),  # Subsampling ratio
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),  # Column sampling per tree
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),  # Minimum child weight
    }

    # Initialize XGBoost classifier with the suggested hyperparameters
    model = xgb.XGBClassifier(**params)
    
    # Fit the model on training data
    model.fit(X_train, y_train, verbose=False)
    
    # Make predictions on the validation set
    predictions = model.predict(X_test)
    
    # Calculate accuracy (or any other metric suitable for classification)
    accuracy = accuracy_score(y_test, predictions)
    
    return 1 - accuracy  # Optuna minimizes the objective, so we return 1 - accuracy to maximize it


In [48]:
# Creating the Optuna study and running optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=25)
print('Best hyperparameters:', study.best_params)
print('Best RMSE:', study.best_value)

[I 2024-12-04 17:33:02,112] A new study created in memory with name: no-name-ce35d1d1-5345-46b0-b730-7f093aadeff8
[I 2024-12-04 17:35:43,632] Trial 0 finished with value: 0.001266269078637694 and parameters: {'learning_rate': 0.006588922380039456, 'max_depth': 2, 'subsample': 0.1687759160659278, 'colsample_bytree': 0.07919613958669043, 'min_child_weight': 5}. Best is trial 0 with value: 0.001266269078637694.
[I 2024-12-04 17:38:57,713] Trial 1 finished with value: 0.0003871997431260654 and parameters: {'learning_rate': 0.01626795657170186, 'max_depth': 4, 'subsample': 0.6227621474328181, 'colsample_bytree': 0.7006806420912535, 'min_child_weight': 1}. Best is trial 1 with value: 0.0003871997431260654.
[I 2024-12-04 17:42:26,826] Trial 2 finished with value: 0.00037460788156096214 and parameters: {'learning_rate': 0.07070318115378102, 'max_depth': 5, 'subsample': 0.4872637453446773, 'colsample_bytree': 0.405849890917609, 'min_child_weight': 8}. Best is trial 2 with value: 0.0003746078815

Best hyperparameters: {'learning_rate': 0.018328567818997943, 'max_depth': 9, 'subsample': 0.868856423461877, 'colsample_bytree': 0.9129726766209201, 'min_child_weight': 12}
Best RMSE: 0.0003494241584307556


Best hyperparameter was found by running the above commented code. [Trial 22 finished with value: 0.016569848821596934 and parameters: {'learning_rate': 0.05435437137174813, 'max_depth': 8, 'subsample': 0.40344376142128324, 'colsample_bytree': 0.8781257402324484, 'min_child_weight': 16}. Best is trial 22 with value: 0.016569848821596934.]

In [68]:
import optuna
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Calling the best parameters
best_params = study.best_trial.params

# Adjust n_estimators to 50 (for 50 epochs/rounds)
best_params['n_estimators'] = 50

# Create the XGBoost model with the best parameters
model = xgb.XGBClassifier(**best_params)

# Fit the model on the training data
model.fit(X_train, y_train, verbose=False)

# Make predictions on the training and testing data
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Convert regression predictions into binary predictions (e.g., using 0.5 as the threshold)
threshold = 0.5
y_train_pred_binary = (y_train_pred >= threshold).astype(int)
y_test_pred_binary = (y_test_pred >= threshold).astype(int)

# Calculate metrics for the training set
accuracy_train = accuracy_score(y_train, y_train_pred_binary)
precision_train = precision_score(y_train, y_train_pred_binary)
recall_train = recall_score(y_train, y_train_pred_binary)
f1_train = f1_score(y_train, y_train_pred_binary)

# Calculate metrics for the test set
accuracy_test = accuracy_score(y_test, y_test_pred_binary)
precision_test = precision_score(y_test, y_test_pred_binary)
recall_test = recall_score(y_test, y_test_pred_binary)
f1_test = f1_score(y_test, y_test_pred_binary)

# Print out the evaluation metrics for both training and testing
print(f"Training Accuracy: {accuracy_train:.4f}")
print(f"Training Precision: {precision_train:.4f}")
print(f"Training Recall: {recall_train:.4f}")
print(f"Training F1-Score: {f1_train:.4f}")

print(f"Testing Accuracy: {accuracy_test:.4f}")
print(f"Testing Precision: {precision_test:.4f}")
print(f"Testing Recall: {recall_test:.4f}")
print(f"Testing F1-Score: {f1_test:.4f}")

# Optionally, you can also calculate the RMSE for both train and test datasets
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)
print(f"Training RMSE: {rmse_train:.4f}")
print(f"Testing RMSE: {rmse_test:.4f}")


Training Accuracy: 0.9996
Training Precision: 0.9958
Training Recall: 0.6786
Training F1-Score: 0.8072
Testing Accuracy: 0.9996
Testing Precision: 0.9899
Testing Recall: 0.6687
Testing F1-Score: 0.7982
Training RMSE: 0.0205
Testing RMSE: 0.0207


In [56]:
from sklearn.metrics import confusion_matrix

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{cm}")


Confusion Matrix:
[[1269042      11]
 [    533    1076]]
