## Random Forest and Gradient Boosted Trees used for Fraud detection

Import libraires as follow.

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col, countDistinct
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, array, lit
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier

In [2]:
spark = SparkSession.builder.appName('FraudTreeMethods').getOrCreate()

Inserting the parent directory into current path.

In [3]:
import sys; sys.path.insert(1, '..\datasets')

## Gradient Boosted Trees

Load and parse the data file, converting it to a DataFrame.
You may use the SQL as follow to read data:
```python
data = sqlContext.sql("SELECT * FROM fraud_train_sample_csv")
```

In [5]:
data = spark.read.csv('../datasets/train_sample.csv', inferSchema=True, header=True)

Check out the data.

In [14]:
data.show(5)

+------+---+------+---+-------+-------------------+---------------+-------------+
|    ip|app|device| os|channel|         click_time|attributed_time|is_attributed|
+------+---+------+---+-------+-------------------+---------------+-------------+
| 87540| 12|     1| 13|    497|2017-11-07 09:30:38|           null|            0|
|105560| 25|     1| 17|    259|2017-11-07 13:40:27|           null|            0|
|101424| 12|     1| 19|    212|2017-11-07 18:05:24|           null|            0|
| 94584| 13|     1| 13|    477|2017-11-07 04:58:08|           null|            0|
| 68413| 12|     1|  1|    178|2017-11-09 09:00:09|           null|            0|
+------+---+------+---+-------+-------------------+---------------+-------------+
only showing top 5 rows



Count the number of classes and save it in variables names major_df and minor_df.

In [8]:
major_df = data.filter(col("is_attributed") == 0)
minor_df = data.filter(col("is_attributed") == 1)

In [9]:
ratio = int(major_df.count()/minor_df.count())
print("ratio: {}".format(ratio))
a = range(ratio)

IllegalArgumentException: 'Unsupported class file major version 56'

In [6]:
# duplicate the minority rows
oversampled_df = minor_df.withColumn("dummy", explode(array([lit(x) for x in a]))).drop('dummy')

# combine both oversampled minority rows and previous majority rows combined_df = major_df.unionAll(oversampled_df)
combined_df = major_df.unionAll(oversampled_df)
combined_df.show()
data = combined_df
data = data.drop('click_time','attributed_time')

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])
assembler = VectorAssembler(inputCols=['ip', 'app', 'device', 'os', 'channel'],outputCol="features")
trainingData = assembler.transform(trainingData)
testData = assembler.transform(testData)

IllegalArgumentException: 'Unsupported class file major version 56'

## Train the model

In [13]:
# Train a GBT model.
gbt = GBTClassifier(labelCol="is_attributed", featuresCol="features", maxIter=20, maxDepth=4)

# Train model.  This also runs the indexers.
model = gbt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "is_attributed", "features").show(5)

+----------+-------------+--------------------+
|prediction|is_attributed|            features|
+----------+-------------+--------------------+
|       0.0|            0|[9.0,9.0,1.0,13.0...|
|       0.0|            0|[10.0,11.0,1.0,22...|
|       0.0|            0|[20.0,2.0,1.0,16....|
|       0.0|            0|[20.0,12.0,1.0,13...|
|       0.0|            0|[25.0,3.0,1.0,23....|
+----------+-------------+--------------------+
only showing top 5 rows



In [14]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="is_attributed", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Test accuracy = %g" % (accuracy))

Test Error = 0.0482078
Test accuracy = 0.951792


In [15]:
predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|30647|
|       1.0|29281|
+----------+-----+



# Apply to test, predict

In [16]:
test = spark.read.csv('test.csv', inferSchema=True, header=True)
#test.show(5)

assembler = VectorAssembler(inputCols=['ip', 'app', 'device', 'os', 'channel'],outputCol="features")
test = assembler.transform(test)
#test.show(3)

predictions = model.transform(test)
#predictions.show(2)

data_to_submit = predictions.select(['click_id','prediction'])
data_to_submit.show(3)

data_to_submit = data_to_submit.withColumnRenamed('prediction','is_attributed')
data_to_submit.show(3)

data_to_submit.groupBy('is_attributed').count().show()

+--------+----------+
|click_id|prediction|
+--------+----------+
|       0|       1.0|
|       1|       0.0|
|       2|       0.0|
+--------+----------+
only showing top 3 rows

+--------+-------------+
|click_id|is_attributed|
+--------+-------------+
|       0|          1.0|
|       1|          0.0|
|       2|          0.0|
+--------+-------------+
only showing top 3 rows

+-------------+--------+
|is_attributed|   count|
+-------------+--------+
|          0.0|17946533|
|          1.0|  843936|
+-------------+--------+

