In [46]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [47]:
spark = (
  SparkSession.builder
    .appName("Codeway")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

Read the dataframe

In [48]:
df_features = spark.read.json("/Users/macbookpro/PyCharmMiscProject/df_features_json")

                                                                                

In [52]:
# Order df_features chronologically by first_event_date
df_ordered = df_features.orderBy("first_event_date")

In [51]:
from pyspark.sql.window import Window

In [53]:
w = Window.orderBy("first_event_date")
df_ordered = df_ordered.withColumn("user_index", F.row_number().over(w))

# Pre-processing (very short) for User-level Analysis

In [54]:
df_ordered = df_ordered.drop("first_event_date")

In [8]:
df_ordered.printSchema()

root
 |-- auto_renew_off: long (nullable = true)
 |-- avg_event_hour: double (nullable = true)
 |-- country: string (nullable = true)
 |-- first_year_revenue: double (nullable = true)
 |-- free_trial: long (nullable = true)
 |-- operating_system: string (nullable = true)
 |-- paywall: long (nullable = true)
 |-- refund: long (nullable = true)
 |-- renewal: long (nullable = true)
 |-- season: string (nullable = true)
 |-- stickiness_ratio: double (nullable = true)
 |-- subscribe: long (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_index: integer (nullable = false)



In [55]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

0. ios = iOS

In [56]:
df_ordered = df_ordered.withColumn(
    "operating_system",
    F.when(F.lower(F.col("operating_system")) == "ios", F.lit("iOS"))
    .otherwise(F.col("operating_system"))
)

1) Time‐aware train/test split (80/20 by cohort_index)

In [57]:
# Find cutoff for 80/20 split
total_count = df_ordered.count()
split_row = int(total_count * 0.8)

                                                                                

In [58]:
train      = df_ordered.filter(F.col("user_index") <= split_row).withColumn("rand", F.rand(seed=12345)).orderBy("rand")
test       = df_ordered.filter(F.col("user_index")  > split_row)

In [91]:
test.write \
    .mode("overwrite") \
    .option("compression", "gzip") \
    .json("/Users/macbookpro/PyCharmMiscProject/test_set/user_test_set")

25/07/27 18:06:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 18:06:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 18:06:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 18:06:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 18:06:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 18:06:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 1

 2) Preprocessing pipeline (fit once)

In [59]:
from pyspark import StorageLevel

In [60]:
df_ordered.printSchema()

root
 |-- auto_renew_off: long (nullable = true)
 |-- avg_event_hour: double (nullable = true)
 |-- country: string (nullable = true)
 |-- first_year_revenue: double (nullable = true)
 |-- free_trial: long (nullable = true)
 |-- operating_system: string (nullable = true)
 |-- paywall: long (nullable = true)
 |-- refund: long (nullable = true)
 |-- renewal: long (nullable = true)
 |-- season: string (nullable = true)
 |-- stickiness_ratio: double (nullable = true)
 |-- subscribe: long (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_index: integer (nullable = false)



# User-level Prediction

In [61]:
# For "operating_system"
si_os = StringIndexer(inputCol="operating_system", outputCol="operating_system_idx", handleInvalid="keep")
oh_os = OneHotEncoder(inputCols=["operating_system_idx"], outputCols=["operating_system_vec"])

# For "country"
si_country = StringIndexer(inputCol="country", outputCol="country_idx", handleInvalid="keep")
oh_country = OneHotEncoder(inputCols=["country_idx"], outputCols=["country_vec"])

# For "season"
si_season = StringIndexer(inputCol="season", outputCol="season_idx", handleInvalid="keep")
oh_season = OneHotEncoder(inputCols=["season_idx"], outputCols=["season_vec"])

feature_cols = [
    "auto_renew_off",
    "free_trial",
    "paywall",
    "refund",
    "renewal",
    "subscribe",
    "avg_event_hour",
    "total_revenue",
    "first_year_revenue",
    "stickiness_ratio",
    "season_vec",
    "country_vec",
    "operating_system_vec"
]
assembler  = VectorAssembler(inputCols=feature_cols, outputCol="features")

preproc    = Pipeline(stages=[
    si_os, oh_os,
    si_country, oh_country,
    si_season, oh_season
    , assembler])
train = train.drop("rand")
pp_model   = preproc.fit(train)

train_pp   = pp_model.transform(train).persist(StorageLevel.MEMORY_AND_DISK)
test_pp    = pp_model.transform(test).persist(StorageLevel.MEMORY_AND_DISK)

25/07/27 17:42:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:42:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:42:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:42:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:42:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:42:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 1

1) Define GBT regressor with explicit prediction columns

In [62]:
gbt = GBTRegressor(
    featuresCol="features",
    labelCol="total_revenue",
    predictionCol="pred15",
    seed=12345
)

2) Evaluators

In [63]:
evaluator = RegressionEvaluator(
    labelCol="total_revenue",
    predictionCol="pred15",
    metricName="mae"
)

3) Hyperparameter grids

In [64]:
paramGrid = (ParamGridBuilder()
    .addGrid(gbt.maxDepth, [3])
    .addGrid(gbt.maxIter,  [10])
    .addGrid(gbt.stepSize, [0.1])
    .build()
)

Define Pipeline for importing the model to other notebooks

In [65]:
# build a full pipeline that applies pp_model then the GBT
pipeline = Pipeline(stages=[pp_model, gbt])

4) Cross-validators

In [66]:
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3
)

In [67]:
cv.setParallelism(4)

CrossValidator_3b7a1d7ae690

5) Train the model on preprocessed training data & Save the Model to Disk

In [68]:
cv_model = cv.fit(train)
bestModel = cv_model.bestModel    # the PipelineModel tuned for the label
bestModel.write().overwrite().save("/Users/macbookpro/PyCharmMiscProject/models/user_ltv_model")

25/07/27 17:42:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:42:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:42:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:42:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:42:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:42:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 1

6) Generate predictions on preprocessed test data

In [77]:
preds = cv_model.transform(test)

7) Annualize the 15-day forecasts if desired

In [80]:
factor     = 365.0 / 15.0

preds = preds.withColumn("forecast_1y", F.col("pred15") * factor)

In [81]:
# write "preds" out once—and forget Spark ML for downstream analysis
preds.write.mode("overwrite").parquet("/Users/macbookpro/PyCharmMiscProject/Output_Dataframes/user_preds.parquet")

25/07/27 17:55:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:55:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:55:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:55:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:55:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:55:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 1

 preds now contains:
   - pred15
   - forecast_1y
   - plus all original user features for analysis

# Evaluate

In [72]:
mae = evaluator.evaluate(preds)

25/07/27 17:50:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:50:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:50:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:50:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:50:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:50:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 1

In [73]:
print("15-day MAE:", mae)

15-day MAE: 0.016813497452963934


1) Compute both averages together

In [74]:
mean_vals = test_pp.agg(
    F.avg("total_revenue" ).alias("true_mean"),
).first()

true_mean = mean_vals["true_mean"]

25/07/27 17:50:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:50:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:50:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:50:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [75]:
relative_mae_mean = mae / true_mean

2) Print as percentages

In [76]:
print(f"Relative MAE (mean): {relative_mae_mean:.2%}")

Relative MAE (mean): 3.36%


# Write the Output

preds is the DataFrame obtained after scoring and annualizing:
it contains all user features plus pred15 and forecast_1y.

Select exactly the columns wanted in the output

In [82]:
preds.printSchema()

root
 |-- auto_renew_off: long (nullable = true)
 |-- avg_event_hour: double (nullable = true)
 |-- country: string (nullable = true)
 |-- first_year_revenue: double (nullable = true)
 |-- free_trial: long (nullable = true)
 |-- operating_system: string (nullable = true)
 |-- paywall: long (nullable = true)
 |-- refund: long (nullable = true)
 |-- renewal: long (nullable = true)
 |-- season: string (nullable = true)
 |-- stickiness_ratio: double (nullable = true)
 |-- subscribe: long (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_index: integer (nullable = false)
 |-- operating_system_idx: double (nullable = false)
 |-- operating_system_vec: vector (nullable = true)
 |-- country_idx: double (nullable = false)
 |-- country_vec: vector (nullable = true)
 |-- season_idx: double (nullable = false)
 |-- season_vec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- pred15: double (nullable = false)
 |-- fore

In [83]:
output_df = preds.select(
    "user_id",
    "operating_system_idx",
    "country_idx",
    "season_idx",
    "stickiness_ratio",
    "auto_renew_off",
    "free_trial",
    "paywall",
    "refund",
    "renewal",
    "subscribe",
    "avg_event_hour",
    "total_revenue",
    "first_year_revenue",

    # 1-year forecasts
    "forecast_1y"
)

In [84]:
output_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- operating_system_idx: double (nullable = false)
 |-- country_idx: double (nullable = false)
 |-- season_idx: double (nullable = false)
 |-- stickiness_ratio: double (nullable = true)
 |-- auto_renew_off: long (nullable = true)
 |-- free_trial: long (nullable = true)
 |-- paywall: long (nullable = true)
 |-- refund: long (nullable = true)
 |-- renewal: long (nullable = true)
 |-- subscribe: long (nullable = true)
 |-- avg_event_hour: double (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- first_year_revenue: double (nullable = true)
 |-- forecast_1y: double (nullable = false)



In [85]:
# list all columns except the three we skip
skip = {"user_id"}
cols = [c for c in output_df.columns if c not in skip]

# build one aggregation per column: count where value < 0
agg_exprs = [
    F.sum(F.when(F.col(c) < 0, 1).otherwise(0)).alias(c)
    for c in cols
]

# run and display
neg_counts_df = output_df.agg(*agg_exprs)
neg_counts_df.show(truncate=False)

25/07/27 17:56:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:56:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:56:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:56:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:56:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:56:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
          

+--------------------+-----------+----------+----------------+--------------+----------+-------+------+-------+---------+--------------+-------------+------------------+-----------+
|operating_system_idx|country_idx|season_idx|stickiness_ratio|auto_renew_off|free_trial|paywall|refund|renewal|subscribe|avg_event_hour|total_revenue|first_year_revenue|forecast_1y|
+--------------------+-----------+----------+----------------+--------------+----------+-------+------+-------+---------+--------------+-------------+------------------+-----------+
|0                   |0          |0         |0               |0             |0         |0      |0     |0      |0        |0             |0            |8                 |417542     |
+--------------------+-----------+----------+----------------+--------------+----------+-------+------+-------+---------+--------------+-------------+------------------+-----------+



In [86]:
output_df.select("forecast_1y").filter(F.col("forecast_1y") < 0).distinct().show()

25/07/27 17:57:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:57:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:57:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:57:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:57:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:57:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 247

+--------------------+
|         forecast_1y|
+--------------------+
|-0.07115095152852956|
+--------------------+



                                                                                

The above negative forecast can be basically (and plausibly) considered 0.

In [87]:
null_counts = output_df.select([
    F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in output_df.columns
])

In [88]:
null_counts.show()

25/07/27 17:58:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:58:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:58:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:58:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:58:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:58:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 248

+-------+--------------------+-----------+----------+----------------+--------------+----------+-------+------+-------+---------+--------------+-------------+------------------+-----------+
|user_id|operating_system_idx|country_idx|season_idx|stickiness_ratio|auto_renew_off|free_trial|paywall|refund|renewal|subscribe|avg_event_hour|total_revenue|first_year_revenue|forecast_1y|
+-------+--------------------+-----------+----------+----------------+--------------+----------+-------+------+-------+---------+--------------+-------------+------------------+-----------+
|      0|                   0|          0|         0|               0|             0|         0|      0|     0|      0|        0|             0|            0|                 0|          0|
+-------+--------------------+-----------+----------+----------------+--------------+----------+-------+------+-------+---------+--------------+-------------+------------------+-----------+



                                                                                

In [89]:
# 1) repartition to use all cores
num_parts = spark.sparkContext.defaultParallelism
df_out = output_df.repartition(num_parts)
print("After:", df_out.rdd.getNumPartitions())

25/07/27 17:59:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:59:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:59:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:59:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:59:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:59:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 249

After: 8


In [90]:
# 2) write as compressed JSON
df_out.write \
    .mode("overwrite") \
    .option("compression", "gzip") \
    .json("/Users/macbookpro/PyCharmMiscProject/output_user_json")

25/07/27 17:59:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:59:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:59:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:59:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:59:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/27 17:59:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
          