# Random Forest and Gradient Boosted Trees used for Fraud detection

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct
from pyspark.sql.functions import col, explode, array, lit
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

ModuleNotFoundError: No module named 'pyspark'

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('FraudTreeMethods')
    .getOrCreate()
)

## Read data

In [3]:
# Read the training CSV into a Spark DataFrame: the first row supplies the
# column names and the column types are inferred from the values.
# (Disabled alternative source: sqlContext.sql("SELECT * FROM fraud_train_sample"))
data = spark.read.csv('train.csv', header=True, inferSchema=True)
# Preview the first five rows.
data.show(n=5)

+------+---+------+---+-------+-------------------+---------------+-------------+
|    ip|app|device| os|channel|         click_time|attributed_time|is_attributed|
+------+---+------+---+-------+-------------------+---------------+-------------+
| 83230|  3|     1| 13|    379|2017-11-06 14:32:21|           null|            0|
| 17357|  3|     1| 19|    379|2017-11-06 14:33:34|           null|            0|
| 35810|  3|     1| 13|    379|2017-11-06 14:34:12|           null|            0|
| 45745| 14|     1| 13|    478|2017-11-06 14:34:52|           null|            0|
|161007|  3|     1| 13|    379|2017-11-06 14:35:08|           null|            0|
+------+---+------+---+-------+-------------------+---------------+-------------+


## Count distinct values

In [4]:
# One countDistinct aggregate per column, each aliased back to its column name,
# evaluated together in a single aggregation.
distinct_counts = [countDistinct(col(name)).alias(name) for name in data.columns]
data.agg(*distinct_counts).show()

+------+---+------+---+-------+----------+---------------+-------------+
|    ip|app|device| os|channel|click_time|attributed_time|is_attributed|
+------+---+------+---+-------+----------+---------------+-------------+
|277396|706|  3475|800|    202|    259620|         182057|            2|
+------+---+------+---+-------+----------+---------------+-------------+


In [5]:
# Row count per is_attributed value, to see how the two classes are balanced.
data.groupBy('is_attributed').count().show()

+-------------+---------+
|is_attributed|    count|
+-------------+---------+
|            1|   456846|
|            0|184447044|
+-------------+---------+


## Over sampling

### The imbalance ratio between classes 0 and 1 is calculated below; it is used in the oversampling procedure.

In [6]:
# Separate the rows by label value.
major_df = data.filter(col("is_attributed") == 0)
minor_df = data.filter(col("is_attributed") == 1)

# Truncated quotient of the two class sizes: the number of copies of each
# minority row used by the oversampling step.
n_major = major_df.count()
n_minor = minor_df.count()
ratio = int(n_major / n_minor)
print(f"ratio: {ratio}")

ratio: 403


### The ratio is applied for oversampling below.

In [7]:
# Duplicate every minority row `ratio` times: attach an array column holding
# `ratio` literals, explode it into one row per element, then discard the
# helper column again.
copy_ids = array(*[lit(i) for i in range(ratio)])
oversampled_df = minor_df.withColumn("dummy", explode(copy_ids)).drop("dummy")

# Append the replicated minority rows to the untouched majority rows.
combined_df = major_df.union(oversampled_df)
# NOTE(review): this oversampling runs before the train/test split further
# down, so copies of the same minority row can end up in both splits and the
# reported test accuracy may be optimistic.
combined_df.show(3)

+-----+---+------+---+-------+-------------------+---------------+-------------+
|   ip|app|device| os|channel|         click_time|attributed_time|is_attributed|
+-----+---+------+---+-------+-------------------+---------------+-------------+
|83230|  3|     1| 13|    379|2017-11-06 14:32:21|           null|            0|
|17357|  3|     1| 19|    379|2017-11-06 14:33:34|           null|            0|
|35810|  3|     1| 13|    379|2017-11-06 14:34:12|           null|            0|
+-----+---+------+---+-------+-------------------+---------------+-------------+


### The class counts of the combined data are shown below.

In [8]:
# Class counts after oversampling.
combined_df.groupBy('is_attributed').count().show()

+-------------+---------+
|is_attributed|    count|
+-------------+---------+
|            1|184108938|
|            0|184447044|
+-------------+---------+


### Replace the old data with the new data (combined_df).

In [9]:
# From here on `data` refers to the oversampled DataFrame, not the raw CSV read.
data = combined_df

In [10]:
# Class counts of `data` after it was rebound to the oversampled DataFrame.
data.groupBy('is_attributed').count().show()

+-------------+---------+
|is_attributed|    count|
+-------------+---------+
|            1|184108938|
|            0|184447044|
+-------------+---------+


In [11]:
# Distinct-value count of every column, each aliased to its own column name.
data.agg(*(countDistinct(col(c)).alias(c) for c in data.columns)).show()

+------+---+------+---+-------+----------+---------------+-------------+
|    ip|app|device| os|channel|click_time|attributed_time|is_attributed|
+------+---+------+---+-------+----------+---------------+-------------+
|277396|706|  3475|800|    202|    259620|         182057|            2|
+------+---+------+---+-------+----------+---------------+-------------+


### Drop two columns: the click time and the attributed time. This is done for the sake of simplicity in this tutorial.

In [12]:
# Remove the two timestamp columns, then preview the result.
time_columns = ('click_time', 'attributed_time')
data = data.drop(*time_columns)
data.show(n=3)

+-----+---+------+---+-------+-------------+
|   ip|app|device| os|channel|is_attributed|
+-----+---+------+---+-------+-------------+
|83230|  3|     1| 13|    379|            0|
|17357|  3|     1| 19|    379|            0|
|35810|  3|     1| 13|    379|            0|
+-----+---+------+---+-------+-------------+


In [13]:
# First row of the DataFrame, returned as a Row object.
data.head()

Row(ip=83230, app=3, device=1, os=13, channel=379, is_attributed=0)

### Split data.

In [14]:
# Split the data into training and test sets (30% held out for testing).
# The seed is fixed so that re-running the notebook on the same input gives
# the same split, which keeps the metrics reported below comparable between runs.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [15]:
# Column names and types of the training split.
trainingData.printSchema()

root
 |-- ip: integer (nullable = true)
 |-- app: integer (nullable = true)
 |-- device: integer (nullable = true)
 |-- os: integer (nullable = true)
 |-- channel: integer (nullable = true)
 |-- is_attributed: integer (nullable = true)


In [16]:
# Preview the first three rows of the training split.
trainingData.show(3)

+---+---+------+---+-------+-------------+
| ip|app|device| os|channel|is_attributed|
+---+---+------+---+-------+-------------+
|  9|  3|     1| 13|    115|            0|
|  9|  3|     1| 13|    135|            0|
|  9|  3|     1| 13|    280|            0|
+---+---+------+---+-------+-------------+


In [17]:
# Column names, for choosing the feature columns used in the next step.
data.columns

['ip', 'app', 'device', 'os', 'channel', 'is_attributed']

### Vectorize the data before feeding it into the ML algorithm. Specify the input column names as follows.

In [18]:
# Pack the five input columns into a single vector column named "features",
# and add that column to both splits.
feature_columns = ['ip', 'app', 'device', 'os', 'channel']
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
trainingData = assembler.transform(trainingData)
testData = assembler.transform(testData)

In [19]:
# Preview the training split with the new "features" vector column.
trainingData.show(3)

+---+---+------+---+-------+-------------+--------------------+
| ip|app|device| os|channel|is_attributed|            features|
+---+---+------+---+-------+-------------+--------------------+
|  9|  3|     1| 13|    115|            0|[9.0,3.0,1.0,13.0...|
|  9|  3|     1| 13|    135|            0|[9.0,3.0,1.0,13.0...|
|  9|  3|     1| 13|    280|            0|[9.0,3.0,1.0,13.0...|
+---+---+------+---+-------+-------------+--------------------+


### Assign the model, train the model, and predict.

In [22]:
# API reference:
# https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#module-pyspark.ml.classification
# Configure a RandomForest classifier: 30 trees, each limited to depth 3.
# (An earlier configuration used numTrees=20 with the default depth.)
# The seed is fixed so that repeated fits on the same data give the same model.
rf = RandomForestClassifier(
    labelCol="is_attributed",
    featuresCol="features",
    numTrees=30,
    maxDepth=3,
    seed=42,
)

In [23]:
# Train the model on the assembled training split. `rf` is a bare classifier,
# so only the random forest itself is fitted in this step.
model = rf.fit(trainingData)

In [24]:
# Make predictions on the held-out split with the fitted model.
predictions = model.transform(testData)

In [25]:
# Inspect which columns the fitted model's transform produced.
predictions.printSchema()

root
 |-- ip: integer (nullable = true)
 |-- app: integer (nullable = true)
 |-- device: integer (nullable = true)
 |-- os: integer (nullable = true)
 |-- channel: integer (nullable = true)
 |-- is_attributed: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)


In [26]:
# Show five example rows: predicted label, true label, and feature vector.
predictions.select("prediction", "is_attributed", "features").show(5)

+----------+-------------+--------------------+
|prediction|is_attributed|            features|
+----------+-------------+--------------------+
|       0.0|            0|[9.0,2.0,1.0,19.0...|
|       0.0|            0|[9.0,3.0,1.0,13.0...|
|       0.0|            0|[9.0,3.0,1.0,13.0...|
|       0.0|            0|[9.0,3.0,1.0,13.0...|
|       0.0|            0|[9.0,9.0,1.0,13.0...|
+----------+-------------+--------------------+


### Evaluate the model performance.

In [27]:
# Evaluator that compares the predicted label with the true label and
# reports plain accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="is_attributed",
    predictionCol="prediction",
)

In [28]:
# Score the held-out predictions; the error is the complement of the accuracy.
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)
print("Test accuracy = %g" % accuracy)

Test Error = 0.154955
Test accuracy = 0.845045


In [29]:
# Per-feature importance vector of the fitted forest, indexed in the order of
# the assembler's input columns (ip, app, device, os, channel).
# Not a very good example to show this!
model.featureImportances

SparseVector(5, {0: 0.1632, 1: 0.6297, 2: 0.0535, 3: 0.0019, 4: 0.1517})

### Apply the model to test dataset and predict.
#### Read the test data first and prepare the format so it can be fed into the model.

In [30]:
# Read the test CSV: the first row supplies the column names and the column
# types are inferred from the values.
# NOTE: this rebinds `data`, which until now referred to the training DataFrame.
data = spark.read.csv('test.csv', header=True, inferSchema=True)
data.show(n=5)

+--------+------+---+------+---+-------+-------------------+
|click_id|    ip|app|device| os|channel|         click_time|
+--------+------+---+------+---+-------+-------------------+
|       0|  5744|  9|     1|  3|    107|2017-11-10 04:00:00|
|       1|119901|  9|     1|  3|    466|2017-11-10 04:00:00|
|       2| 72287| 21|     1| 19|    128|2017-11-10 04:00:00|
|       3| 78477| 15|     1| 13|    111|2017-11-10 04:00:00|
|       4|123080| 12|     1| 13|    328|2017-11-10 04:00:00|
+--------+------+---+------+---+-------+-------------------+


In [31]:
# `test` is bound to the same DataFrame as `data` with no columns removed (the
# two alternatives below are disabled), so click_id and click_time stay in it.
#data = data.drop('click_time','attributed_time')
test = data
#test = data.select(['ip','app','device','os','channel'])
test.show(3)

+--------+------+---+------+---+-------+-------------------+
|click_id|    ip|app|device| os|channel|         click_time|
+--------+------+---+------+---+-------+-------------------+
|       0|  5744|  9|     1|  3|    107|2017-11-10 04:00:00|
|       1|119901|  9|     1|  3|    466|2017-11-10 04:00:00|
|       2| 72287| 21|     1| 19|    128|2017-11-10 04:00:00|
+--------+------+---+------+---+-------+-------------------+


### Vectorize the test data and transform it as follows.

In [32]:
# Build the "features" vector column for the test data from the same five
# input columns, in the same order, as were used for training.
feature_columns = ['ip', 'app', 'device', 'os', 'channel']
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
test = assembler.transform(test)

In [33]:
# Preview the test data with the new "features" vector column.
test.show(3)

+--------+------+---+------+---+-------+-------------------+--------------------+
|click_id|    ip|app|device| os|channel|         click_time|            features|
+--------+------+---+------+---+-------+-------------------+--------------------+
|       0|  5744|  9|     1|  3|    107|2017-11-10 04:00:00|[5744.0,9.0,1.0,3...|
|       1|119901|  9|     1|  3|    466|2017-11-10 04:00:00|[119901.0,9.0,1.0...|
|       2| 72287| 21|     1| 19|    128|2017-11-10 04:00:00|[72287.0,21.0,1.0...|
+--------+------+---+------+---+-------+-------------------+--------------------+


### Feed the test data set into the model and predict.

In [34]:
# Score the test data with the fitted model. This rebinds `predictions`,
# which previously held the predictions for the held-out split.
predictions = model.transform(test)

In [35]:
# Preview two scored rows, including the columns added by the model.
predictions.show(2)

+--------+------+---+------+---+-------+-------------------+--------------------+--------------------+--------------------+----------+
|click_id|    ip|app|device| os|channel|         click_time|            features|       rawPrediction|         probability|prediction|
+--------+------+---+------+---+-------+-------------------+--------------------+--------------------+--------------------+----------+
|       0|  5744|  9|     1|  3|    107|2017-11-10 04:00:00|[5744.0,9.0,1.0,3...|[20.6742278268919...|[0.68914092756306...|       0.0|
|       1|119901|  9|     1|  3|    466|2017-11-10 04:00:00|[119901.0,9.0,1.0...|[24.8633981858914...|[0.82877993952971...|       0.0|
+--------+------+---+------+---+-------+-------------------+--------------------+--------------------+--------------------+----------+


### Prepare a submission file as follows and set the column names.

In [36]:
# Keep only the click identifier and the predicted class label.
submission_columns = ['click_id', 'prediction']
data_to_submit = predictions.select(*submission_columns)
data_to_submit.show(n=3)

+--------+----------+
|click_id|prediction|
+--------+----------+
|       0|       0.0|
|       1|       0.0|
|       2|       1.0|
+--------+----------+


In [37]:
# Rename the prediction column to is_attributed for the submission file.
data_to_submit = data_to_submit.withColumnRenamed('prediction','is_attributed')
data_to_submit.show(3)

+--------+-------------+
|click_id|is_attributed|
+--------+-------------+
|       0|          0.0|
|       1|          0.0|
|       2|          1.0|
+--------+-------------+


### Before submitting the results, let's look inside them to get a sense of the model's performance.

In [38]:
# Count how many test rows were predicted as each class.
data_to_submit.groupBy('is_attributed').count().show()

+-------------+--------+
|is_attributed|   count|
+-------------+--------+
|          0.0|15921222|
|          1.0| 2869247|
+-------------+--------+


#### The predicted classes are not very balanced. The results should be improved by tuning the model hyperparameters; some feature engineering is also suggested.

#### Save the output file and submit it.

In [None]:
# Disabled. NOTE(review): to_csv is the pandas DataFrame method, while
# data_to_submit is a Spark DataFrame here, so enabling this line would
# presumably need a .toPandas() conversion first — confirm before use.
#data_to_submit.to_csv('RFmodeling.csv', index = False)

In [None]:
# Disabled Spark-native alternative: coalesce to a single partition and write
# comma-separated CSV with a header row to "output/path", overwriting any
# existing output there.
#data_to_submit.coalesce(1).write.option("header","true").option("sep",",").mode("overwrite").csv("output/path")