# Sparkify Full dataset run

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DateType
from pyspark.sql.window import Window

from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.feature import OneHotEncoder, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.pipeline import PipelineModel

from datetime import datetime
import numpy as np

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
9,application_1631347673663_0010,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Create spark session
spark = SparkSession \
    .builder \
    .appName("Sparkify") \
    .getOrCreate()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
# rimestamp coefficient
TS_COEF = 1000*60*60*24

# today date
TODAY = str(datetime.today().date())

# S3 storage path
my_storage = 's3://sparkify-saved-models/'

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Load and Clean Dataset

In [4]:
# Read in full sparkify dataset
event_data = "s3n://udacity-dsnd/sparkify/sparkify_event_data.json"
df = spark.read.json(event_data)
df.head()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(artist='Popol Vuh', auth='Logged In', firstName='Shlok', gender='M', itemInSession=278, lastName='Johnson', length=524.32934, level='paid', location='Dallas-Fort Worth-Arlington, TX', method='PUT', page='NextSong', registration=1533734541000, sessionId=22683, song='Ich mache einen Spiegel - Dream Part 4', status=200, ts=1538352001000, userAgent='"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"', userId='1749042')

# Feature Engineering

### Compile the modelling dataset
1. Exclude records with empty *userId*.
2. Add label: 1 = Churn, 0 = Not churn. Condition: `page='Cancellation Confirmation'`
3. Remove records of `page='Cancellation Confirmation'`.
4. Sort dataframe by `userId` and `ts`
5. Aggregate features at user level:
    * create list of songs
    * create list of artists
    * list of page events (Cancellation Confirmation preliminary filtered out to remove the leak)
    * session frequency
    * average number of songs per session
    * binary feature: Male gender = 1/0
    * binary feature: paid acoount = 1/0
    * lifetime (days): time difference between last activity and registration date

**Step 1**: Aggregate user-level properties

In [5]:
w = Window.partitionBy(df.userId).orderBy(df.ts)
w_uid = Window.partitionBy(df.userId)

preprocessed_df = (df
                   .filter(F.col('userId')!='') #filter out guests
                   .withColumn('cancelled', (F.col('page')=='Cancellation Confirmation').cast(IntegerType())) 
                   .withColumn('churn', F.max('cancelled').over(w_uid)) # define churn label
                   .withColumn('current_level', F.last('level').over(w)) # sort levels of subscription by date
                   .withColumn('last_userAgent', F.last('userAgent').over(w)) # sort agents by date
                   .filter(F.col('page')!='Cancellation Confirmation') #remove page event from dataset
                   .groupby('userId') # aggregate features at user level
                   .agg(F.collect_list('artist').alias('artist_list'), # combine into list all artist
                        F.collect_list('song').alias('song_list'), # combine into list all songs
                        F.collect_list('page').alias('page_list'), # combine into list all page events
                        F.countDistinct('sessionId').alias('session_count'), # calculate total number of sessions
                        F.count('song').alias('song_count'), # calculate total number of songs
                        F.first('gender').alias('gender'), # gender data
                        F.last('current_level').alias('current_level'), # take last level value
                        F.max('churn').alias('churn'), 
                        F.min('ts').alias('min_ts'), # start timestamp 
                        F.max('ts').alias('max_ts'), # end timestamp
                        F.last('last_userAgent').alias('last_userAgent'), # recent agent
                        F.min('registration').alias('registration') # registration date
                       )
                   # frequency of sessions
                   .withColumn('session_freq', F.col('session_count')/((F.col('max_ts')-F.col('min_ts'))/TS_COEF))
                   # avg number of songs per session
                   .withColumn('song_per_session', F.col('song_count')/F.col('session_count'))
                   # binary feature: Male = 1/0
                   .withColumn('gender_Male', (F.col('gender')=='M').cast(IntegerType()))
                   # binary feature: paid = 1/0
                   .withColumn('is_paid', (F.col('current_level')=='paid').cast(IntegerType()))
                   # lifetime
                   .withColumn('lifetime', (F.col('max_ts')-F.col('registration'))/TS_COEF)
                   # extract device/OS pointers from agent
                   .withColumn('agent_Windows', F.col('last_userAgent').contains('Windows').cast(IntegerType()))
                   .withColumn('agent_Mac', F.col('last_userAgent').contains('Mac').cast(IntegerType()))
                   .withColumn('agent_iPhone', F.col('last_userAgent').contains('iPhone').cast(IntegerType()))
                   .withColumn('agent_iPad', F.col('last_userAgent').contains('iPad').cast(IntegerType()))
                   .withColumn('agent_Linux', F.col('last_userAgent').contains('Linix').cast(IntegerType()))
                  ).cache()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Number of unique users in dataset
preprocessed_df.count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

22278

In [7]:
# Check target balance
preprocessed_df.groupby('churn').count().show()

# churn <---> 22.5%

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+-----+
|churn|count|
+-----+-----+
|    1| 5003|
|    0|17275|
+-----+-----+

**Step 2**: Prepare transformers to collect feature vector

Used features:
* Apply TF-IDF to artist list, song list and page list. We limit vocabSize to 100 elements
* Beside TF-IDF generated features keep session frequency, avg number of songs per session, lifetime, gender, paid, agent based features

In [8]:
def tf_idf_transformer(list_name: str,
                       vocabSize: int=100):
    """
    Combines TF and IDF pyspark transformers
    ------------
    
    Args:
        list_name (string) : prefix of the feature with work list in the format
            prefix_list
        vocabSize (int)    : number of top-output words to keep
    
    Returns:
        tf transformer, idf transformer
    """
    tf = CountVectorizer(inputCol=f"{list_name}_list", outputCol=f"TF_{list_name}", vocabSize=vocabSize)
    tf_idf = IDF(inputCol=f"TF_{list_name}", outputCol=f"TFIDF_{list_name}")
    return tf, tf_idf


artist_tf, artist_tf_idf = tf_idf_transformer('artist')
song_tf, song_tf_idf = tf_idf_transformer('song')
page_tf, page_tf_idf = tf_idf_transformer('page')

assembler = VectorAssembler(inputCols=["TFIDF_artist", "TFIDF_song", "TFIDF_page",
                                       "session_freq", "song_per_session", 
                                       "lifetime", "gender_Male", 
                                       "is_paid", "agent_Windows",
                                       "agent_Mac", "agent_iPhone", "agent_iPad", 
                                       "agent_Linux"], 
                            outputCol="features", 
                            handleInvalid="skip")


feature_pipeline = Pipeline(stages=[artist_tf, artist_tf_idf, 
                                   song_tf, song_tf_idf,
                                   page_tf, page_tf_idf,
                                   assembler
                                   ])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
test = feature_pipeline.fit(preprocessed_df)
test.transform(preprocessed_df).count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

22261

# Modeling
We split the full dataset into train (70%) and test (30%). During cross-validation process train data is additionally split into train and validation subsets. Test data is used only to check the model (nexer seen during training).

We try 2 models:
* Random Forest Classifier
* Gradient Boosted Tree Classifier

Note: since we use tree-based models, we don't don't need to scale numerical features.
Our problem is imbalanced: 23% of positive cases (churn) and 67% of negative (stayed). Thus, we use F1-score to tune hyperparameters and check final quality of the model.

In [28]:
(train_data, test_data) = preprocessed_df.randomSplit([0.7, 0.3], seed=10)

# cache dataframes
train_data = train_data.cache()
test_data = test_data.cache()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [29]:
def score_the_model(test_data, model, metric_name='accuracy'):
    """
    Calculate model score by metric given in metric_name
    """
    # Make predictions
    predictions = model.transform(test_data)

    # Set up evaluator and compute score
    evaluator = MulticlassClassificationEvaluator(
        labelCol="churn", 
        predictionCol="prediction", 
        metricName=metric_name)
    score = evaluator.evaluate(predictions)
    print("Score = ", score)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Random Forest Classifier

In [23]:
# Tune model
rf = RandomForestClassifier(labelCol="churn", featuresCol="features", 
                            seed = 10)
rf_pipeline = Pipeline(stages=[feature_pipeline, rf])

# set parameters grid
paramGrid = (ParamGridBuilder()
            .addGrid(rf.maxDepth, [5, 7])
            .addGrid(rf.numTrees, [50, 100])
            .build()
            )

# choose evaluater
evaluator = MulticlassClassificationEvaluator(labelCol="churn", 
                                               predictionCol="prediction", 
                                               metricName="f1")

# define cross-validator
crossval = CrossValidator(estimator=rf_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3,
                          seed=10)

# run cross-validation
cvModel = crossval.fit(train_data)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
# check best combination of parameters
cvModel.getEstimatorParamMaps()[ np.argmax(cvModel.avgMetrics) ]

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{Param(parent='RandomForestClassifier_8d88822f3599', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 7, Param(parent='RandomForestClassifier_8d88822f3599', name='numTrees', doc='Number of trees to train (>= 1).'): 50}

In [24]:
# let's test it
score_the_model(test_data, cvModel, metric_name='f1')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Score =  0.9355532787669427

In [26]:
cvModel.bestModel.write().overwrite().save(my_storage + "/saved_models/rf_model")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Gradient Boosted Tree classifier

In [11]:
gbt = GBTClassifier(labelCol="churn", featuresCol="features")
gbt_pipeline = Pipeline(stages=[feature_pipeline, gbt])

cvModel = gbt_pipeline.fit(train_data)

# Make predictions
predictions = cvModel.transform(test_data)

# Set up evaluator and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="churn", 
    predictionCol="prediction", 
    metricName="f1")
score = evaluator.evaluate(predictions)
print("Score = ", score)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Score =  1.0

In [13]:
# Tune model
gbt = GBTClassifier(labelCol="churn", featuresCol="features", seed = 10)
gbt_pipeline = Pipeline(stages=[feature_pipeline, gbt])

# set parameters grid
paramGrid = (ParamGridBuilder()
            .addGrid(gbt.maxDepth, [3, 5])
            .addGrid(gbt.maxIter, [5, 10])
            .build()
            )

# choose evaluater
evaluator = MulticlassClassificationEvaluator(labelCol="churn", 
                                               predictionCol="prediction", 
                                               metricName="f1")

# define cross-validator
crossval = CrossValidator(estimator=gbt_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3,
                          seed=10)

# run cross-validation
cvModel = crossval.fit(train_data)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Exception in thread cell_monitor-13:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/site-packages/awseditorssparkmonitoringwidget-1.0-py3.7.egg/awseditorssparkmonitoringwidget/cellmonitor.py", line 178, in cell_monitor
    job_binned_stages[job_id][stage_id] = all_stages[stage_id]
KeyError: 1597



In [15]:
# check best combination of parameters
cvModel.getEstimatorParamMaps()[ np.argmax(cvModel.avgMetrics) ]

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{Param(parent='GBTClassifier_a1d75e21a0bc', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 3, Param(parent='GBTClassifier_a1d75e21a0bc', name='maxIter', doc='max number of iterations (>= 0).'): 5}

In [14]:
# let's test it
score_the_model(test_data, cvModel, metric_name='f1')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Score =  1.0

In [22]:
cvModel.bestModel.write().overwrite().save(my_storage + "/saved_models/gbt_model")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Load pretrained models

In [27]:
# load model
path_dict = {'rf': my_storage + "/saved_models/rf_model",
             'gbt': my_storage + "/saved_models/gbt_model"}

pretrainedModel = PipelineModel.load(path_dict['gbt'])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [31]:
# let's test it
score_the_model(test_data, pretrainedModel, metric_name='f1')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Score =  1.0