# App Download Prediction Based on Customer Click Data
Kaggle competition data: https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection

In [2]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col
import os.path
from pyspark.sql.window import Window

from pyspark.ml.feature import SQLTransformer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

The dataset is loaded.

In [4]:
# Read the training clicks from parquet, sort them by IP and then by click time
# (both ascending), and keep the sorted DataFrame persisted for the cells below.
tr_raw = (
    spark.read.parquet("/mnt/dbdata/fraud/train_parquet")
    .orderBy(col('ip').asc(), col('click_time').asc())
    .persist()
)

In [5]:
# Render the loaded click DataFrame as a table for a first look at its columns.
display(tr_raw)

ip,app,device,os,channel,click_time,attributed_time,is_attributed
1,2,1,15,477,2017-11-09T01:25:31.000+0000,,0
1,12,1,15,265,2017-11-09T01:25:35.000+0000,,0
1,6,1,15,459,2017-11-09T01:25:36.000+0000,,0
1,64,1,15,459,2017-11-09T01:25:36.000+0000,,0
1,2,1,2,477,2017-11-09T01:47:39.000+0000,,0
1,13,1,28,469,2017-11-09T03:37:16.000+0000,,0
1,2,1,6,477,2017-11-09T03:59:36.000+0000,,0
1,2,1,49,477,2017-11-09T04:05:42.000+0000,,0
1,2,1,2,477,2017-11-09T04:08:39.000+0000,,0
1,2,1,2,477,2017-11-09T04:10:32.000+0000,,0


The number of rows in the data

In [7]:
# Row count of the click data. count() is an action, so it materializes the persisted tr_raw.
tr_raw.count()

The number of unique IPs

In [9]:
# Number of different IP values: keep one row per IP, then count the rows.
tr_raw.dropDuplicates(['ip']).count()

Click_time statistics

In [11]:
# Earliest click timestamp in the data.
tr_raw.agg(F.min('click_time')).show()

In [12]:
# Latest click timestamp in the data.
tr_raw.agg(F.max('click_time')).show()

During night time, clicks are low. There are periodic patterns in the clicks.

In [14]:
# Expose the DataFrame to the %sql cells under the name "tr_raw".
# createOrReplaceTempView is the replacement for registerTempTable, which has
# been deprecated since Spark 2.0; it can also be re-run safely.
tr_raw.createOrReplaceTempView("tr_raw")

In [15]:
%sql
-- One row per click: the click timestamp formatted to minute resolution.
SELECT
  date_format(click_time, "MMM-dd HH:mm") AS time
FROM tr_raw

time
Nov-09 01:25
Nov-09 01:25
Nov-09 01:25
Nov-09 01:25
Nov-09 01:47
Nov-09 03:37
Nov-09 03:59
Nov-09 04:05
Nov-09 04:08
Nov-09 04:10


In [16]:
%sql
-- One row per click: the click timestamp formatted to minute resolution.
-- The language magic is kept as the first line of the cell; the commented-out
-- Python line that used to precede it was dead code and has been removed.
SELECT date_format(click_time, "MMM-dd HH:mm") as time FROM tr_raw

Conversions follow a similar periodic pattern.

In [18]:
# Make sure the "tr_raw" view exists for the %sql cells that follow.
# createOrReplaceTempView is the replacement for registerTempTable, which has
# been deprecated since Spark 2.0; re-registering an existing view is harmless.
tr_raw.createOrReplaceTempView("tr_raw")

In [19]:
%sql
-- Unix timestamps (seconds) of the clicks that ended in a conversion.
SELECT
  unix_timestamp(attributed_time) AS Converted_Time
FROM
  tr_raw
WHERE
  attributed_time IS NOT NULL

Converted_Time
1510220622
1510220727
1510260389
1510260410
1510261465
1510268177
1510268194
1510269354
1510269491
1510134871


In [20]:
%sql
-- Unix timestamps (seconds) of every click made by one example IP.
SELECT
  unix_timestamp(click_time) AS Click_Time
FROM
  tr_raw
WHERE
  ip = 7340

Click_Time
1510013253
1510013253
1510014120
1510014479
1510016512
1510018373
1510018380
1510020884
1510020885
1510020888


Aim: To Predict Conversion Rate (App Download Rate) per IP.

In [22]:
# Split by IP rather than by row, so every click of a given IP lands on exactly
# one side of the split.
SPLIT_SEED = 42        # fixed seed so the shuffle of IPs is repeatable
TRAIN_FRACTION = 0.7   # share of distinct IPs used for training

distinct_ip = tr_raw.select('ip').distinct()
tot_num_ip = distinct_ip.count()

# ips_training is consumed by two separate joins below. Because DataFrames are
# lazy, each join would otherwise re-run the random ordering on its own; the
# set is persisted (and the ordering seeded) so that both joins work from the
# same sample and the training and validation IPs stay complementary.
# int(...) keeps limit() working on interpreters where round() returns a float.
ips_training = (
    distinct_ip
    .orderBy(F.rand(seed=SPLIT_SEED))
    .limit(int(round(tot_num_ip * TRAIN_FRACTION)))
    .persist()
)

# Clicks from 70% of the distinct IPs are used for training.
# Clicks from the remaining 30% of IPs are held out for validation.
tr_training = tr_raw.join(ips_training, 'ip')
tr_val = tr_raw.join(ips_training, 'ip', 'left_anti')

In [23]:
# Feature engineering: four SQL stages that turn raw click rows into one row
# of aggregated features per IP. Each stage reads the previous stage's output
# through the __THIS__ placeholder.

# Stage 1: add the click timestamp as Unix time (seconds).
unixTimeTrans = SQLTransformer(
    statement="SELECT *, unix_timestamp(click_time) AS click_time_unix \
               FROM __THIS__")

# Stage 2: add the previous click's Unix time for the same IP, using a window
# ordered by click time. The first click of each IP has no predecessor (NULL).
prevTimeTrans = SQLTransformer(
    statement="SELECT *, LAG(click_time_unix) OVER (PARTITION BY ip ORDER BY click_time_unix ASC) \
                                                    AS prev_click_time_unix \
               FROM __THIS__")

# Stage 3: time interval between two consecutive clicks from the same IP.
# The first click of an IP (no previous click) gets an interval of 0.
intervalTimeTrans = SQLTransformer(
    statement="SELECT *, (CASE WHEN prev_click_time_unix IS NULL \
                               THEN 0 \
                               ELSE click_time_unix - prev_click_time_unix END) AS click_interval \
               FROM __THIS__")

# Stage 4: aggregate per IP.
# clicks              = number of total clicks
# click_time_tot_int  = time difference between the first and the last click
# click_time_int_mean = mean of the click intervals
# click_time_int_std  = sample standard deviation of the click intervals
# app_count, device_count, os_count, channel_count = distinct values seen
# conversions         = number of app downloads (conversions)
# conv_rate           = number of app downloads / number of ad clicks
#
# STDDEV is undefined for an IP with a single click: depending on the Spark
# version it comes back as NULL or NaN, and either value then flows into the
# downstream feature vector. It is mapped to 0 here (one observation has no
# spread) so every IP yields a usable feature row.
featureTrans = SQLTransformer(
    statement = "SELECT ip, COUNT(*) clicks, \
                        MAX(click_time_unix) - MIN(click_time_unix) as click_time_tot_int, \
                        MEAN(click_interval) as click_time_int_mean, \
                        COALESCE(NANVL(STDDEV(click_interval), CAST(0 AS DOUBLE)), CAST(0 AS DOUBLE)) as click_time_int_std, \
                        COUNT(DISTINCT app) as app_count, \
                        COUNT(DISTINCT device) as device_count, \
                        COUNT(DISTINCT os) as os_count, \
                        COUNT(DISTINCT channel) as channel_count, \
                        SUM(is_attributed) as conversions, \
                        SUM(is_attributed)/COUNT(*) as conv_rate\
                 FROM __THIS__ \
                 GROUP BY ip")

# Chain the four stages and materialize the per-IP features for the training
# clicks; the result is persisted because several later cells read it.
pipeline_feature_eng = Pipeline(stages=[unixTimeTrans, prevTimeTrans, intervalTimeTrans, featureTrans])
tr_ip = pipeline_feature_eng.fit(tr_training).transform(tr_training).persist()

The content of the dataframe, tr_ip, is shown here.

In [25]:
# Show the per-IP aggregated features produced by the feature-engineering pipeline.
display(tr_ip)

ip,clicks,click_time_tot_int,click_time_int_mean,click_time_int_std,app_count,device_count,os_count,channel_count,conversions,conv_rate
1829,52,32407,623.2115384615385,2362.86064027366,17,2,4,31,1,0.0192307692307692
5518,2292,259004,113.00349040139616,500.0820072303256,42,14,49,102,3,0.0013089005235602
6336,20882,259170,12.411167512690357,34.29575244657039,86,15,79,123,22,0.0010535389330523
6620,4034,259101,64.22930094199306,243.1952309997336,44,9,50,111,5,0.0012394645513138
7240,401,85730,213.79052369077303,684.3758696871004,28,3,18,72,1,0.0024937655860349
7754,5312,259042,48.76543674698795,173.01248031784507,57,11,54,116,8,0.0015060240963855
7880,5512,258712,46.93613933236575,189.5856848595257,55,11,52,119,8,0.0014513788098693
8086,4496,258992,57.604982206405694,291.9706929265484,41,11,49,110,6,0.0013345195729537
9376,4124,258591,62.70392822502425,270.0170265722946,54,8,52,117,8,0.0019398642095053
9427,1513,171407,113.2894910773298,474.9601216920122,38,5,41,95,1,0.0006609385327164573


In [26]:
# Show the per-IP feature table (tr_ip) once more.
display(tr_ip)

ip,clicks,click_time_tot_int,click_time_int_mean,click_time_int_std,app_count,device_count,os_count,channel_count,conversions,conv_rate
1829,52,32407,623.2115384615385,2362.86064027366,17,2,4,31,1,0.0192307692307692
5518,2292,259004,113.00349040139616,500.0820072303256,42,14,49,102,3,0.0013089005235602
6336,20882,259170,12.411167512690357,34.29575244657039,86,15,79,123,22,0.0010535389330523
6620,4034,259101,64.22930094199306,243.1952309997336,44,9,50,111,5,0.0012394645513138
7240,401,85730,213.79052369077303,684.3758696871004,28,3,18,72,1,0.0024937655860349
7754,5312,259042,48.76543674698795,173.01248031784507,57,11,54,116,8,0.0015060240963855
7880,5512,258712,46.93613933236575,189.5856848595257,55,11,52,119,8,0.0014513788098693
8086,4496,258992,57.604982206405694,291.9706929265484,41,11,49,110,6,0.0013345195729537
9376,4124,258591,62.70392822502425,270.0170265722946,54,8,52,117,8,0.0019398642095053
9427,1513,171407,113.2894910773298,474.9601216920122,38,5,41,95,1,0.0006609385327164573


Now, let's move on to the ML training and prediction. First, I define features to use.

In [28]:
# Every aggregated column is a model input except the IP identifier and the two
# outcome columns. Excluding them by name (rather than with the positional
# slice columns[1:-2]) keeps the label and the conversion count out of the
# features even if the column order of the aggregation changes.
NON_FEATURE_COLS = {'ip', 'conversions', 'conv_rate'}
feature_cols = [c for c in tr_ip.columns if c not in NON_FEATURE_COLS]

In [29]:
# Further feature preparation: assemble the per-IP feature columns into a single
# vector, then standardize each feature to mean 0 and standard deviation 1.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)

# Random forest regressor on the standardized features, predicting conv_rate.
# RandomForestRegressor defaults left unchanged:
# maxDepth=5,
# maxBins=32,
# minInstancesPerNode=1,
# minInfoGain=0.0,
# maxMemoryInMB=256,
# cacheNodeIds=False,
# checkpointInterval=10,
# impurity='variance',   (the regressor's impurity; 'gini' applies to classifiers)
# numTrees=20,
# featureSubsetStrategy='auto',
# subsamplingRate=1.0
# The seed is set explicitly (default: None) so the fitted forest, and the
# metrics reported below, are reproducible from run to run.
RF_SEED = 42
rf = RandomForestRegressor(featuresCol="scaledFeatures", labelCol="conv_rate", seed=RF_SEED)
pipeline = Pipeline(stages=[pipeline_feature_eng, assembler, scaler, rf])

# Fit the whole pipeline on the raw training clicks: the nested feature
# engineering stages, the assembler, the scaler and the forest.
model = pipeline.fit(tr_training)

After training, I fed the held-out validation set (`tr_val`) into the model to predict the conversion rate, and compared the predicted values with the actual values. 
As the plot shows, the predicted values are quite similar to the actual values.

In [31]:
# tr_val.write.parquet("/mnt/dbdata/fraud/kk_test_data")

In [32]:
# Score the held-out validation clicks with the fitted pipeline.
# persist() is registered before the first action, so show() and the later
# cells that read `predictions` can reuse cached partitions; previously the
# DataFrame was only marked for caching after show() had already run.
predictions = model.transform(tr_val).persist()
predictions.select("prediction", "conv_rate", "features").show(5)

conv_rate,prediction
0.0021929824561403,0.0034702183973222
0.0018050541516245,0.0034702183973222
0.00042589437819420784,0.0034702183973222
0.0007855459544383347,0.0034702183973222
0.000462962962962963,0.0034702183973222
0.00041736227045075126,0.0057443986618613
0.0044843049327354,0.0116290528816322
0.0009937065253395165,0.0034702183973222
0.0012463647694225,0.0034702183973222
0.001340632331583,0.0034702183973222


In [33]:
# Actual versus predicted conversion rate, side by side.
display(predictions.select(col('conv_rate'), col('prediction')))

conv_rate,prediction
0.0021929824561403,0.0034702183973222
0.0018050541516245,0.0034702183973222
0.00042589437819420784,0.0034702183973222
0.0007855459544383347,0.0034702183973222
0.000462962962962963,0.0034702183973222
0.00041736227045075126,0.0057443986618613
0.0044843049327354,0.0116290528816322
0.0009937065253395165,0.0034702183973222
0.0012463647694225,0.0034702183973222
0.001340632331583,0.0034702183973222


The R-squared value was 0.91.

In [35]:
# Compare predictions with the true conversion rate using two metrics. A single
# evaluator is built once (its default metric is "rmse") and its metric is
# switched between the two evaluations.
evaluator = RegressionEvaluator(labelCol="conv_rate", predictionCol="prediction")

rmse = evaluator.setMetricName("rmse").evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

r2 = evaluator.setMetricName("r2").evaluate(predictions)
print("R Square (R^2) on test data = %g" % r2)

# The fitted random forest is the last stage of the pipeline model.
rfModel = model.stages[-1]
print(rfModel)  # summary only

The entire ML pipeline is 

Pipeline(stages=[unixTimeTrans, prevTimeTrans, intervalTimeTrans, featureTrans, assembler, scaler, rf]) 

with the raw data as an input.

In [37]:
# Save the fitted pipeline model and read it back.
# write().overwrite() lets this cell be re-run: a plain save() raises an error
# once the target path already exists from an earlier run.
from pyspark.ml import PipelineModel

MODEL_PATH = "/mnt/dbdata/fraud/kk_pipeline_model"
model.write().overwrite().save(MODEL_PATH)
loaded_model = PipelineModel.load(MODEL_PATH)

In [38]:
# Show the reloaded pipeline model. This replaces the IPython-only
# `loaded_model?` help lookup, which is interactive debugging residue and is
# not valid Python outside an IPython kernel.
loaded_model

In [39]:
# Run the reloaded pipeline model on the validation clicks.
yhat = loaded_model.transform(tr_val)

In [40]:
# Print the reloaded model's predictions. The cell used to call x.show(), but
# no variable `x` is defined in this notebook, so it failed on a fresh kernel;
# `yhat` is the DataFrame produced by the preceding cell.
yhat.show()

In [41]:
# List the top-level stages of the reloaded pipeline model.
loaded_model.stages

In [42]:
# The first stage is the nested feature-engineering pipeline; list its inner stages.
loaded_model.stages[0].stages

In [43]:
# Apply only the nested feature-engineering stage to the validation clicks and print the per-IP rows it produces.
loaded_model.stages[0].transform(tr_val).show()

In [44]:
# Save the unfitted pipeline definition. write().overwrite() lets this cell be
# re-run; a plain save() raises an error once the path already exists.
pipeline.write().overwrite().save("/mnt/dbdata/fraud/kk_pipeline_unfit")

In [45]:
# Read the unfitted pipeline definition back from disk.
loaded_pipeline = Pipeline.load("/mnt/dbdata/fraud/kk_pipeline_unfit")

In [46]:
# Display the reloaded (unfitted) pipeline object.
loaded_pipeline