# Start Session

In [1]:
# import libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql import types as t
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Imputer, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1621607805467_0002,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# create a Spark session
spark = SparkSession \
    .builder \
    .appName("sparkify") \
    .getOrCreate()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Load Data
Here the entire set of 26 million rows is loaded. 

In [3]:
# full dataset
event_data = 's3n://udacity-dsnd/sparkify/sparkify_event_data.json'
# mini dataset
# evnent_data = s3n://udacity-dsnd/sparkify/mini_sparkify_event_data.json

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
df = spark.read.json(event_data)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
df.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)

In [6]:
# the size of the dataset
df = df.filter(df.auth != "Guest")
print((df.count(), len(df.columns)))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(26255005, 18)

### How Many Nulls? 

In [7]:
# show percent nulls
# because 'artist','length','song' have so many missing values I will not use them
df.select(
    [
        ( (F.count(F.when(F.isnull(c), c)) / F.count(c)) * 100 ) \
            .alias(c) for c in df.columns
    ]
).show(vertical=True)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0---------------------------
 artist        | 25.921642652911192 
 auth          | 0.0                
 firstName     | 3.038709267242056  
 gender        | 3.038709267242056  
 itemInSession | 0.0                
 lastName      | 3.038709267242056  
 length        | 25.921642652911192 
 level         | 0.0                
 location      | 3.038709267242056  
 method        | 0.0                
 page          | 0.0                
 registration  | 3.038709267242056  
 sessionId     | 0.0                
 song          | 25.921642652911192 
 status        | 0.0                
 ts            | 0.0                
 userAgent     | 3.038709267242056  
 userId        | 0.0

### Define Churn
`Churn` is defined as when the app registers the `Cancellation Confirmation` page. This occurs for both free and paid users. 

In [8]:
# this is a binary, indicator column to be used to predict churn
df = df.withColumn(
    'churn',
    F.when((F.col('page') =='Cancellation Confirmation'), 1) \
    .otherwise(0)
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Explore Ratios of Churn

In [9]:
# users who have a cancellation value of 1
users_who_quit = df.filter(df.churn==1) \
    .select(df.userId) \
    .dropDuplicates()

# users who have a cancellation value of 0
users_who_stayed = df.filter(df.churn==0) \
    .select(df.userId) \
    .dropDuplicates()

# examine their lengths
users_who_quit.count(), users_who_stayed.count(), ( users_who_quit.count() / (users_who_quit.count() + users_who_stayed.count())) * 100

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(5003, 22278, 18.338770572926215)

In [10]:
# are cancellation rates the same between free and paid users? 
churn_table = df.select("level","churn","userId") \
    .dropDuplicates() \
    .groupBy(F.col('level'),F.col('churn')) \
    .count() \
    .withColumnRenamed('count','user_count')

churn_table.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+-----+----------+
|level|churn|user_count|
+-----+-----+----------+
| paid|    0|     16185|
| free|    0|     18793|
| paid|    1|      3424|
| free|    1|      1579|
+-----+-----+----------+

In [11]:
# 17% of paid and 8% of free users churned
( 3424 / ( 16185 + 3424)) * 100, ( 1579 / ( 18793 + 1579)) * 100

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(17.461369779183027, 7.75083447869625)

In [12]:
# are cancellation rates the same between genders? 
churn_table = df.select("gender","churn","userId") \
    .dropDuplicates() \
    .groupBy(F.col('gender'),F.col('churn')) \
    .count() \
    .withColumnRenamed('count','user_count')

churn_table.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------+-----+----------+
|gender|churn|user_count|
+------+-----+----------+
|     M|    1|      2656|
|     M|    0|     11651|
|     F|    0|     10626|
|  null|    0|         1|
|     F|    1|      2347|
+------+-----+----------+

# Feature Engineering
The goal is to predict behavior at the user level--specifically if the user 'churns' or not. Therefore, my feature engineering focuses on finding characteristics at the `user` level. For each feature, I create a dataframe with the `userId` as the key. The dataframes are joined together, ensuring that one row eqauls one unique `userId`. Finally, the `userId` column is dropped. 

Ideas for features are: 
1. Cumulative Totals Per User: 
    - Add Friends
    - Add to Playlist
    - Roll Advertisement
    - Error
    - Login / Logout
    - Thumbs Up / Thumbs Down
    - Submit Upgrade / Submit Downgrade

2. Location: 
    - City
    - State
    
3. userAgent: 
    - browser
    - os 
    - brand
    
4. Time: 
    - Avg. Length of Session
    - Time Spent as Free / Paid Level
    - Time User spent on Sparkify
    
5. Gender of User

### 1. Cumulative Totals DataFrame

In [13]:
# Cumulative Totals of Page Per User
page_list = ["Add Friend", "Add to Playlist", "Logout", "Thumbs Up", 
             "Thumbs Down", "Error", "Roll Advert", "Help"]

user_page = df.select("userId","page") \
    .filter(F.col("page").isin(page_list)) \
    .groupBy("userId") \
    .pivot("page") \
    .agg(F.count("page").alias("page_count"))

user_page.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+----------+---------------+-----+----+------+-----------+-----------+---------+
| userId|Add Friend|Add to Playlist|Error|Help|Logout|Roll Advert|Thumbs Down|Thumbs Up|
+-------+----------+---------------+-----+----+------+-----------+-----------+---------+
|1114507|        19|             13| null|   6|    17|         13|         11|       47|
|1390009|         3|              5|    1|null|  null|       null|          2|       15|
|1440693|         4|              9| null|   1|     3|         25|          2|       19|
|1507765|         4|              8| null|   3|     5|       null|          2|       27|
|1624220|        10|             14|    1|   6|    10|         16|          4|       27|
|1829495|         7|             12| null|   1|     2|          6|          2|       16|
|1380035|        35|             55| null|  14|    25|          9|         27|      181|
|1337238|         4|              5| null|   2|     3|         24|          5|       17|
|1816626|         2| 

### 2. Location DataFrame

In [14]:
# split the 'location' field into 'city' and 'state fields'
user_location = df.withColumn('city', F.split(df['location'], ',').getItem(0)) \
    .withColumn('state', F.split(df['location'], ',').getItem(1)) \
    .select("userId","city","state") \
    .dropDuplicates()

user_location.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------------------+---------+
| userId|                city|    state|
+-------+--------------------+---------+
|1717762|North Port-Saraso...|       FL|
|1585384|Seattle-Tacoma-Be...|       WA|
|1376912|Los Angeles-Long ...|       CA|
|1268300|San Antonio-New B...|       TX|
|1466490|Seattle-Tacoma-Be...|       WA|
|1855442|New York-Newark-J...| NY-NJ-PA|
|1267740|         Tallahassee|       FL|
|1379500|  Ogdensburg-Massena|       NY|
|1608368|Seattle-Tacoma-Be...|       WA|
|1303002|        Williamsport|       PA|
|1005928|         Baton Rouge|       LA|
|1343204|       Oklahoma City|       OK|
|1626875|  Vineland-Bridgeton|       NJ|
|1804024|Los Angeles-Long ...|       CA|
|1571170|Boston-Cambridge-...|    MA-NH|
|1651815|              Albany|       OR|
|1672579|San Francisco-Oak...|       CA|
|1612851|             El Paso|       TX|
|1015368|Dallas-Fort Worth...|       TX|
|1230631|       Coeur d'Alene|       ID|
+-------+--------------------+---------+
only showing top

### 3. Browser/Device DataFrame

In [15]:
# create browser, os, brand from userAgent
df = df.withColumn('browser', F.split(df['userAgent'], '\(').getItem(0)) \
    .withColumn('temp', F.split(df['userAgent'], '\(').getItem(1)) 

df = df.withColumn('os', F.split(df['temp'], ';').getItem(0)) \
    .withColumn('brand', F.split(df['temp'], ';').getItem(1))

user_browser = df.select("userId","browser","os","brand").dropDuplicates()

user_browser.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-------------+--------------------+--------------------+
| userId|      browser|                  os|               brand|
+-------+-------------+--------------------+--------------------+
|1697168| Mozilla/5.0 |           Macintosh| Intel Mac OS X 10.9|
|1519390|"Mozilla/5.0 |           Macintosh| Intel Mac OS X 1...|
|1086394|"Mozilla/5.0 |           Macintosh| Intel Mac OS X 1...|
|1767318|"Mozilla/5.0 |           Macintosh| Intel Mac OS X 1...|
|1950440|"Mozilla/5.0 |           Macintosh| Intel Mac OS X 1...|
|1794598|"Mozilla/5.0 |           Macintosh| Intel Mac OS X 1...|
|1361533|"Mozilla/5.0 |           Macintosh| Intel Mac OS X 1...|
|1930998| Mozilla/5.0 |          compatible|           MSIE 10.0|
|1171472|"Mozilla/5.0 |           Macintosh| Intel Mac OS X 1...|
|1869054|"Mozilla/5.0 |              iPhone| CPU iPhone OS 7_...|
|1617611| Mozilla/5.0 |          compatible|            MSIE 9.0|
|1391043|"Mozilla/5.0 |           Macintosh| Intel Mac OS X 1...|
|1556745|"

### 4. Time DataFrames

In [16]:
# create window partition
w = Window.partitionBy("userId","sessionId").orderBy("sessionId")
# calculate session duration ( per user ) in HOURS
user_sess_dur = df.select("userId","sessionId","ts") \
    .withColumn("max_ts", F.max('ts').over(w)) \
    .withColumn("min_ts", F.min('ts').over(w)) \
    .withColumn("tsDiff", ( F.col('max_ts') - F.col('min_ts')) / ( 1000 * 3600 )) \
    .select("userId","sessionId","tsDiff") \
    .dropDuplicates() \
    .groupBy("userId") \
    .agg(F.round(F.avg("tsDiff"),1).alias("avgSessionHours"))

user_sess_dur.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+---------------+
| userId|avgSessionHours|
+-------+---------------+
|1076191|            1.4|
|1142513|            3.3|
|1271218|            6.9|
|1380035|            5.5|
|1396135|            7.6|
|1472901|            2.0|
|1492713|            1.8|
|1567623|            5.0|
|1624220|            1.8|
|1097545|            2.4|
|1216358|            2.1|
|1242455|            6.9|
|1367666|            8.8|
|1766909|            2.1|
|1816626|            4.9|
|1883991|            7.6|
|1927014|            5.7|
|1004316|            4.6|
|1008404|            6.5|
|1133319|            4.4|
+-------+---------------+
only showing top 20 rows

In [17]:
# create window partition
w = Window.partitionBy("userId").orderBy("userId")
# calculate total time used
user_days = df.select("userId","ts") \
    .withColumn("max_ts", F.max('ts').over(w)) \
    .withColumn("min_ts", F.min('ts').over(w)) \
    .withColumn("daysUsed", F.round(( F.col('max_ts') - F.col('min_ts') ) / ( 1000 * 3600 * 24), 1)) \
    .select("userId", "daysUsed") \
    .dropDuplicates()

user_days.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------+
| userId|daysUsed|
+-------+--------+
|1001393|    58.6|
|1002143|    57.2|
|1002493|     8.4|
|1002749|    55.9|
|1004060|    48.0|
|1004316|    53.2|
|1006033|    45.2|
|1006411|    59.5|
|1006697|    50.9|
|1008110|    38.1|
|1008244|     3.3|
|1008404|    39.0|
|1010669|    60.0|
|1010907|    49.2|
|1011093|    57.7|
|1011149|    50.2|
|1011630|    31.3|
|1012906|    10.0|
|1013788|    37.6|
|1016331|    23.5|
+-------+--------+
only showing top 20 rows

In [18]:
# create window partition
w = Window.partitionBy("userId", "level").orderBy("userId")

user_level = df.select("userId", "level", "ts") \
    .withColumn("max_ts", F.max("ts").over(w)) \
    .withColumn("min_ts", F.min("ts").over(w)) \
    .withColumn("daysAs", (F.col("max_ts") - F.col("min_ts")) / (1000 * 3600 * 24)) \
    .select("userId","level","daysAs") \
    .dropDuplicates() \
    .groupBy("userId") \
    .pivot("level") \
    .agg(F.round(F.avg("daysAs"),1))

user_level.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+----+----+
| userId|free|paid|
+-------+----+----+
|1076191| 7.9|null|
|1114507|15.8|43.9|
|1271218|44.2|53.4|
|1695825| 0.0|null|
|1216358|32.9|10.3|
|1337238|56.1|null|
|1823263|33.8|null|
|1553256| 5.5|31.1|
|1754369|36.5| 9.0|
|1844759| 0.0|56.6|
|1857514|46.0|11.5|
|1416126|24.9|33.1|
|1877212| 0.2|60.0|
|1303829|39.1|null|
|1460896|56.3|32.9|
|1558241|52.6| 5.7|
|1616882|12.5| 4.6|
|1269506| 0.0| 0.3|
|1479463| 5.5|41.8|
|1011093|57.7|29.7|
+-------+----+----+
only showing top 20 rows

### 5. Gender and Churn DataFrame

In [19]:
# create churn table
user_churn = df.select("userId", "gender","churn") \
    .withColumnRenamed("churn","label") \
    .dropDuplicates()

user_churn.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+------+-----+
| userId|gender|label|
+-------+------+-----+
|1304259|     F|    0|
|1053226|     M|    0|
|1390064|     M|    0|
|1590560|     M|    0|
|1068767|     M|    0|
|1629144|     F|    0|
|1564892|     F|    0|
|1383619|     F|    0|
|1531101|     M|    0|
|1068404|     M|    0|
|1560520|     M|    0|
|1031726|     F|    0|
|1504984|     M|    0|
|1186591|     F|    0|
|1698484|     M|    0|
|1646934|     M|    0|
|1186016|     F|    0|
|1749437|     F|    0|
|1168321|     F|    0|
|1767969|     F|    0|
+-------+------+-----+
only showing top 20 rows

### Join DataFrames

In [40]:
# 'user_location' and 'user_days' are not used. 
# 'user_location' does not describe user behavior
# 'user_days' is the sum of 'free' and 'paid' in 'user_level' so could be a confounder
# join all the tables together
user_data = user_churn.join(user_page, ['userId'], "left") \
    .join(user_browser,['userId'], "left") \
    .join(user_sess_dur, ['userId'], "left") \
    .join(user_level, ['userId'], "left") \
    .drop("userId")

user_data.show(1, vertical=True)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0-------------------------------
 gender          | M                    
 label           | 0                    
 Add Friend      | 11                   
 Add to Playlist | 18                   
 Error           | 1                    
 Help            | 6                    
 Logout          | 11                   
 Roll Advert     | null                 
 Thumbs Down     | 6                    
 Thumbs Up       | 43                   
 browser         | "Mozilla/5.0         
 os              | Windows NT 5.1) A... 
 brand           | null                 
 avgSessionHours | 5.2                  
 free            | 0.1                  
 paid            | 58.5                 
only showing top 1 row

In [41]:
# final size of dataset
print((user_data.count(), len(user_data.columns)))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(27281, 16)

### Replace Nulls

In [43]:
# define list of numeric columns
numeric_cols = user_page.columns[1:] + user_sess_dur.columns[1:] + user_level.columns[1:]
# create dictionary specifying that 0 will be used to fill missing rows
impute_numeric = { i : 0 for i in numeric_cols }
# fill missing rows in numeric columns
user_data = user_data.fillna(impute_numeric)
# define list of category columns (strings)
categ_cols = user_browser.columns[1:] + ['gender']
# create dictionary specifying that 'missing' will be used to fill missing rows
impute_categ = { i : 'missing' for i in categ_cols }
# fill missing rows in category columns
user_data = user_data.fillna(impute_categ)
# show % nulls - should be 0 for every column
user_data.select(
    [
        ( (F.count(F.when(F.isnull(c), c)) / F.count(c)) * 100 ) \
            .alias(c) for c in user_data.columns
    ]
).show(vertical=True)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0--------------
 gender          | 0.0 
 label           | 0.0 
 Add Friend      | 0.0 
 Add to Playlist | 0.0 
 Error           | 0.0 
 Help            | 0.0 
 Logout          | 0.0 
 Roll Advert     | 0.0 
 Thumbs Down     | 0.0 
 Thumbs Up       | 0.0 
 browser         | 0.0 
 os              | 0.0 
 brand           | 0.0 
 avgSessionHours | 0.0 
 free            | 0.0 
 paid            | 0.0

# Modeling

### Split into Training / Testing Sets

In [23]:
training, testing = user_data.randomSplit([0.6, 0.4], seed=100)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Create Pipeline

In [44]:
# create list of category output columns from previously defined categ_columns
o_categ_cols = [ n + 'Index' for n in categ_cols ] 
# encode categories in pipeline
categ_pipeline = Pipeline(stages=[
    StringIndexer(inputCols=categ_cols, outputCols=o_categ_cols, handleInvalid='skip'),
    VectorAssembler(
        inputCols=o_categ_cols, 
        outputCol="categ_features"
    )
])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [45]:
# numeric pipeline - use numeric columns defined earlier
numer_pipeline = Pipeline(stages=[
    VectorAssembler(inputCols=numeric_cols, outputCol="numer_features")
])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [30]:
# unite pipelines ( final features have to be named 'features')
# unite pipelines
final_pipeline = Pipeline(stages=[
    categ_pipeline,
    numer_pipeline,
    VectorAssembler(
        inputCols=["categ_features","numer_features"],
        outputCol="features"
    ),
    LogisticRegression(maxIter=10)
])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Cross-Validate on Training Set

In [31]:
# define parameters ( just one )
paramGrid = ParamGridBuilder() \
    .addGrid(LogisticRegression.regParam, [0.1, 0.05, 0.01]) \
    .build()

# optimize 4 fold cross-validation using the area under ROC curve
crossval = CrossValidator(estimator=final_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC'),
                          numFolds=3) 

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [32]:
cvModel = crossval.fit(training)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [33]:
# certainly looks like the regParam is fine at 0.1
cvModel.avgMetrics

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[0.7769540579729164, 0.7769540579729164, 0.7769540579729164]

### Fit Tuned Pipeline on Entire Training Set

In [46]:
# unite pipelines ( final features have to be named 'features')
# unite pipelines
final_pipeline2 = Pipeline(stages=[
    categ_pipeline,
    numer_pipeline,
    VectorAssembler(
        inputCols=["categ_features","numer_features"],
        outputCol="features"
    ),
    LogisticRegression(maxIter=10, regParam=0.1)
])

pipelineModel = final_pipeline2.fit(training)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [47]:
# show accuracy and AUC
summaryPipeline = pipelineModel.stages[-1].summary
summaryPipeline.accuracy, summaryPipeline.areaUnderROC

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(0.8162691131498471, 0.7678928649578044)

### Calculate Odds-Ratios

In [48]:
import numpy as np

coeffs = pipelineModel.stages[-1].coefficientMatrix
odds_ratios = np.around(np.exp(coeffs.toArray()),2).tolist()
feature_list = o_categ_cols + numeric_cols
odds_ratios_df = spark.createDataFrame(
    [ t for t in zip(odds_ratios[0], feature_list)],
    ["or", "feature"]
)

odds_ratios_df.show(30)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+---------------+
|  or|        feature|
+----+---------------+
|0.99|   browserIndex|
|0.99|        osIndex|
| 1.0|     brandIndex|
|0.99|    genderIndex|
| 1.0|     Add Friend|
| 1.0|Add to Playlist|
| 1.0|          Error|
| 1.0|           Help|
| 1.0|         Logout|
|1.01|    Roll Advert|
|1.01|    Thumbs Down|
| 1.0|      Thumbs Up|
|1.01|avgSessionHours|
|0.98|           free|
|0.99|           paid|
+----+---------------+

### Score on Testing Set

In [51]:
# generate predictions
predictions = pipelineModel.transform(testing)
# intialize the evaluator
bce = BinaryClassificationEvaluator(labelCol='label', metricName='areaUnderROC').evaluate(predictions)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [57]:
# area-under-receiver-operating-characteristic-curve of testing set
bce

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0.754797280208676

# Results and Conclusion

Splitting the user data into training and testing sets (60% and 40%, respectively) was sufficient. The final AUC was: 
* 0.76 for the training set 
* 0.75 for the testing set

which is close enough together to suggest that under-fitting, and not over-fitting, was a possible issue. 

The odds-ratios showed that some features ( regarding user behavior ) were associated with an increased probability of 'churn' while others were associated with a _decreased_ probability of churn. Specifically: 

A _one_ unit increase of: 
* the number of advertisements
* the number of 'thumbs down' 
* the average number of hours a session lasted

_increased_ the probability of 'churning' by 1%

whereas a _one_ unit increase of: 
* browser
* os
* gender
* days as a 'paid' user

_decreased_ the probability of 'churning' by 1%. In the case of the 'category' column type, the 'one unit increase' is really the act of moving from the base class to the next class. For example, moving from _Female_ (the base class) to _Male_ decreased the probability of 'churning.' This seems to indicate that _Females_ were more likely to 'churn.' 

Finally, a _one_ unit increase of: 
* days spent at the 'free' level

_decreased_ the probability of 'churning' by _2%_

The recommendation based on these findings is that Sparkify should investigate ways to: 
* keep the female audience engaged
* encourage users to login everyday
* limit the number of advertisements a user has to watch
* decrease the number of 'thumbs down' that a user feels compelled to give

Sparkify should also investigate if there are issues with the service via a particular Internet Browser or OS. 

Realistically, these recommendations have no weight until the accuracy / AUC can be increased. Thank you for reading!