In [1]:
import pyspark
import pyspark.ml
import pyspark.sql

In [2]:
sc = pyspark.SparkContext("local", "RF")
ss = pyspark.sql.SparkSession(sc)

In [3]:
df = ss.read.csv("data/PS_20174392719_1491204439457_log.csv", inferSchema=True, header=True).limit(int(1e5))

In [4]:
df.show(5)

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|      

In [5]:
df = df.drop("isFlaggedFraud", "step", "nameDest", "nameOrig")

In [6]:
df_train, df_test = df.randomSplit([0.9, 0.1], seed=16)

In [7]:
cat_cols = [c for c, dtype in df_train.dtypes if dtype == "string"]
num_cols = list(set(df_train.columns) - set(cat_cols) - {"isFraud"})

In [8]:
print(cat_cols)
print(num_cols)

['type']
['oldbalanceDest', 'newbalanceOrig', 'amount', 'oldbalanceOrg', 'newbalanceDest']


In [9]:
pipeline = pyspark.ml.Pipeline().setStages([
    pyspark.ml.feature.VectorAssembler(inputCols=num_cols, outputCol="vec_feats_num"),
    pyspark.ml.feature.StandardScaler(inputCol="vec_feats_num", outputCol="standardized"),
    pyspark.ml.feature.StringIndexer(inputCol="type", outputCol="type_id", handleInvalid="skip"),
    pyspark.ml.feature.OneHotEncoder(inputCol="type_id", outputCol="type_ohe"),
    pyspark.ml.feature.VectorAssembler(inputCols=["type_ohe", "standardized"], outputCol="features"),
])

pipeline = pipeline.fit(df_train)

X_train = pipeline.transform(df_train)
X_test = pipeline.transform(df_test)

In [10]:
model = pyspark.ml.classification.RandomForestClassifier(
    labelCol="isFraud",
    featuresCol="features",
    numTrees=50,
).fit(X_train)

In [11]:
y_preds = model.transform(X_test)

In [12]:
y_preds.select("rawPrediction").show(10)

+--------------------+
|       rawPrediction|
+--------------------+
|[49.9794522652633...|
|[49.9794522652633...|
|[49.9342634155234...|
|[49.9794522652633...|
|[49.9794522652633...|
|[49.9794522652633...|
|[49.9385793550974...|
|[49.9794522652633...|
|[49.9381363649365...|
|[49.9773127251739...|
+--------------------+
only showing top 10 rows



In [13]:
evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator(
    labelCol="isFraud",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC")

roc_auc = evaluator.evaluate(y_preds)
print(f"ROC AUC : {roc_auc:.3f}")


evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator(
    labelCol="isFraud",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderPR")

pr_auc = evaluator.evaluate(y_preds)
print(f"PR AUC  : {pr_auc:.3f}")

ROC AUC : 0.990
PR AUC  : 0.675


In [14]:
evaluator = pyspark.ml.evaluation.MulticlassClassificationEvaluator(
    labelCol="isFraud",
    predictionCol="prediction",
    metricName="f1",
)

f1 = evaluator.evaluate(y_preds)
print(f"PR AUC  : {f1:.3f}")

PR AUC  : 0.998
