In [1]:
import numpy
import pandas as pd

from itertools import chain
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, \
                               RobustScaler, \
                               StringIndexer, \
                               VectorAssembler
from pyspark.ml.tuning import TrainValidationSplit, \
                              ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, \
                                  col, \
                                  count, \
                                  create_map, \
                                  lit, \
                                  when, \
                                  udf  
from pyspark.sql.types import FloatType

# Load Dataset

In [2]:
spark = SparkSession.builder.appName('ads-ml').getOrCreate()

In [3]:
# df = spark.read.options(delimiter=';') \
#                .options(header=True) \
#                .options(inferSchema=True) \
#                .csv('training_data.csv')
# df.printSchema()
# pd.DataFrame(df.take(5), columns=df.columns).transpose()

# Preprocess Dataset

In [4]:
def time_diff_in_minutes(dt_0, dt_1):
    if dt_0 is None:
        return 0.0
    return round((dt_1 - dt_0).total_seconds() / 60.0, 1)    
    
# time_diff_in_min_udf = udf(time_diff_in_minutes, FloatType())
# preprocessed_df = df.withColumn('timeSinceLastStart', \
#                                 time_diff_in_min_udf(df.lastStart, df.timestamp)) \
#                     .drop("id", "timestamp", "lastStart")
# preprocessed_df.write.csv("preprocessed_training_data.csv")


# Count the number of null values per columns

# preprocessed_df.select([count(when(isnan(c), c)).alias(c) for c in preprocessed_df.columns]).show()
# preprocessed_df.write.format('csv') \
#                      .option("header", "true") \
#                      .save("preprocessed_training_data_csv")

In [5]:
df = spark.read.options(delimiter=',') \
               .options(header=True) \
               .options(inferSchema=True) \
               .csv('preprocessed_training_data_csv')
df.printSchema()
df = df.withColumnRenamed('install', 'label')
df = df.sample(fraction=0.2, withReplacement=False)

root
 |-- campaignId: string (nullable = true)
 |-- platform: string (nullable = true)
 |-- softwareVersion: string (nullable = true)
 |-- sourceGameId: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- startCount: integer (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- clickCount: integer (nullable = true)
 |-- installCount: integer (nullable = true)
 |-- startCount1d: integer (nullable = true)
 |-- startCount7d: integer (nullable = true)
 |-- connectionType: string (nullable = true)
 |-- deviceType: string (nullable = true)
 |-- install: integer (nullable = true)
 |-- timeSinceLastStart: double (nullable = true)



In [6]:
class_count_df = df.groupby('label').agg({'label': 'count'})

n_1 = class_count_df.filter(df.label == '1').select("count(label)").collect()[0][0]
n_0 = class_count_df.filter(df.label == '0').select("count(label)").collect()[0][0]

w_1 = (n_0 + n_1) / (2.0 * n_1)
w_0 = (n_0 + n_1) / (2.0 * n_0)

class_weights = {0: w_0, 1: w_1}

mapping_expr = create_map([lit(x) for x in chain(*class_weights.items())])
df = df.withColumn("weights", mapping_expr.getItem(col("label")))

In [7]:
pd.DataFrame(df.take(5), columns=df.columns).head().transpose()

Unnamed: 0,0,1,2,3,4
campaignId,59687f0d896a6b0e5ce6ea15,59687f0d896a6b0e5ce6ea15,59687f0d896a6b0e5ce6ea15,59687f0d896a6b0e5ce6ea15,59687f0d896a6b0e5ce6ea15
platform,ios,ios,ios,ios,ios
softwareVersion,12.0.1,12.1.1,12.1,12.1.2,12.1.2
sourceGameId,1373094,1541862,1373094,1373094,1373094
country,US,US,US,US,US
startCount,20,26,4,32,19
viewCount,18,4,3,31,18
clickCount,0,1,1,3,0
installCount,0,0,0,4,0
startCount1d,13,7,1,11,7


# Vectorize Features

In [8]:
# It is prudent to store the split data in train/test folders for the sake of reproducibility
train_df, test_df = df.randomSplit([0.8, 0.2])
train_df.count(), test_df.count()

(599049, 149620)

In [9]:
categorical_feats = ['campaignId', 'platform', 'softwareVersion', 'sourceGameId', 'country', 
                   'connectionType', 'deviceType']
numerical_feats = ['startCount', 'viewCount', 'clickCount', 'installCount', 'startCount1d', 
                   'startCount7d', 'timeSinceLastStart']

In [10]:
stages = [] 
for column in categorical_feats:
    str_indexer = StringIndexer(inputCol=column, 
                                outputCol=column + "Index",
                                handleInvalid='keep')
    encoder = OneHotEncoder(inputCols=[str_indexer.getOutputCol()], 
                            outputCols=[column + "Vec"],
                            handleInvalid='keep')
    stages += [str_indexer, encoder]

In [11]:
assembler1 = VectorAssembler(inputCols=numerical_feats, 
                             outputCol="num_features")
scaler = RobustScaler(inputCol='num_features',
                      outputCol='scaled')
stages += [assembler1, scaler]

In [12]:
assembler_inputs = [c + "Vec" for c in categorical_feats] + ["scaled"]
assembler2 = VectorAssembler(inputCols=assembler_inputs,
                             outputCol="features")
stages += [assembler2]

In [13]:
lr = LogisticRegression(weightCol='weights')
stages.append(lr)

In [14]:
pipeline = Pipeline(stages=stages)

In [15]:
param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [1.0, 0.1, 0.01, 0.001]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

In [16]:
train_val = TrainValidationSplit(estimator=pipeline,
                                 estimatorParamMaps=param_grid,
                                 evaluator=BinaryClassificationEvaluator(metricName='areaUnderPR'),
                                 trainRatio=0.8)

In [17]:
model = train_val.fit(train_df)

In [18]:
# print(model.bestModel.stages[-1].extractParamMap())
print(model.bestModel.stages[-1].explainParam('regParam'))
print(model.bestModel.stages[-1].explainParam('elasticNetParam'))

regParam: regularization parameter (>= 0). (default: 0.0, current: 0.01)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.5)


In [19]:
predictions = model.transform(test_df)

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Let's use the run-of-the-mill evaluator
evaluator = BinaryClassificationEvaluator()

# We have only two choices: area under ROC and PR curves :-(
auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
auprc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
print("Area under ROC Curve: {:.4f}".format(auroc))
print("Area under PR Curve: {:.4f}".format(auprc))

# pipeline_model = pipeline.fit(train_df)
# predictions = pipeline_model.transform(test_df)
# predictions.select("campaignId", "prediction").show(5)
# predictions.select("campaignId", "prediction").show(195)

Area under ROC Curve: 0.7073
Area under PR Curve: 0.0287


In [21]:
evaluator = MulticlassClassificationEvaluator()
evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel",
                                 evaluator.metricLabel: 1})
evaluator.evaluate(predictions, {evaluator.metricName: "precisionByLabel",
                                 evaluator.metricLabel: 1})

0.02213845623971528

In [22]:
# import matplotlib.pyplot as plt

In [23]:
# class CurveMetrics(BinaryClassificationMetrics):
#     def __init__(self, *args):
#         super(CurveMetrics, self).__init__(*args)

#     def _to_list(self, rdd):
#         points = []
#         # Note this collect could be inefficient for large datasets 
#         # considering there may be one probability per datapoint (at most)
#         # The Scala version takes a numBins parameter, 
#         # but it doesn't seem possible to pass this from Python to Java
#         for row in rdd.collect():
#             # Results are returned as type scala.Tuple2, 
#             # which doesn't appear to have a py4j mapping
#             points += [(float(row._1()), float(row._2()))]
#         return points

#     def get_curve(self, method):
#         rdd = getattr(self._java_model, method)().toJavaRDD()
#         return self._to_list(rdd)

# # Returns as a list (false positive rate, true positive rate)
# preds = predictions.select('label','probability').rdd.map(lambda row: (float(row['probability'][1]), float(row['label'])))
# points = CurveMetrics(preds).get_curve('roc')

# plt.figure()
# x_val = [x[0] for x in points]
# y_val = [x[1] for x in points]
# plt.title("title")
# plt.xlabel("")
# plt.ylabel("ylabel")
# plt.plot(x_val, y_val)

In [24]:
# plt.show()