In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Initialize Spark session
spark = SparkSession.builder \
    .appName("GBT Fraud Detection - Prediction & Evaluation") \
    .getOrCreate()

# Load the dataset (if necessary)
df = spark.read.csv('Synthetic_Financial_datasets_log.csv', header=True, inferSchema=True)

# Index the categorical column 'type'
indexer = StringIndexer(inputCol='type', outputCol='type_index')

# Select features for the model
feature_cols = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'type_index']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Create a pipeline
pipeline = Pipeline(stages=[indexer, assembler])

# Transform the data using the same pipeline as during training
df_transformed = pipeline.fit(df).transform(df)

# Split the data into train and test sets (same split as used for training)
train_data, test_data = df_transformed.randomSplit([0.8, 0.2], seed=42)

# Load the pre-trained GBT model
from pyspark.ml.classification import GBTClassificationModel
gbt_model = GBTClassificationModel.load("/data/gbt_fraud_detection_model")  # Adjust path as needed

In [3]:
def predict_fraud(input_data):
    # Create a DataFrame from the input
    input_df = spark.createDataFrame([input_data], schema=['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest'])

    # Transform the input using the same pipeline
    input_transformed = pipeline.fit(input_df).transform(input_df)

    # Make predictions
    prediction = gbt_model.transform(input_transformed)
    
    # Return prediction result
    return prediction.select('prediction').collect()[0][0]

# Example usage of the predict_fraud function
user_input = [355, 'Debit', 9000, 9000, 0, 0, 9000]  # Example input
result = predict_fraud(user_input)
print("Prediction:", "Fraud" if result == 1 else "Not Fraud")

Prediction: Fraud


In [4]:
predictions = gbt_model.transform(test_data)

# Show a sample of predictions
predictions.select('features', 'isFraud', 'prediction').show(5)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol='isFraud', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy on test set: {accuracy:.4f}")

# Optionally, you can use other evaluation metrics (e.g., precision, recall)
precision_evaluator = MulticlassClassificationEvaluator(labelCol='isFraud', predictionCol='prediction', metricName='weightedPrecision')
recall_evaluator = MulticlassClassificationEvaluator(labelCol='isFraud', predictionCol='prediction', metricName='weightedRecall')

precision = precision_evaluator.evaluate(predictions)
recall = recall_evaluator.evaluate(predictions)

print(f"Precision on test set: {precision:.4f}")
print(f"Recall on test set: {recall:.4f}")


+--------------------+-------+----------+
|            features|isFraud|prediction|
+--------------------+-------+----------+
|[1.0,783.31,81503...|      0|       0.0|
|[1.0,1271.77,6973...|      0|       0.0|
|[1.0,2643.45,6434...|      0|       0.0|
|[1.0,6284.18,7858...|      0|       0.0|
|[1.0,8679.13,7087...|      0|       0.0|
+--------------------+-------+----------+
only showing top 5 rows

Accuracy on test set: 0.9823
Precision on test set: 0.9988
Recall on test set: 0.9823
