In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [3]:
# Create (or reuse) a local Spark session that runs on all available cores
# with the driver memory option set to 4g.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Codeway")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/27 00:06:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/27 00:06:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/07/27 00:06:15 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
# Load the cohort dataset from a JSON directory.
# NOTE(review): the path is absolute and machine-specific; consider a
# configurable data directory so the notebook runs on other machines.
df_with_cohort = spark.read.json("/Users/macbookpro/PyCharmMiscProject/df_with_cohort_json")

                                                                                

In [5]:
# Inspect the inferred column names and types of the loaded data.
df_with_cohort.printSchema()

root
 |-- avg_auto_renew_off: double (nullable = true)
 |-- avg_free_trial: double (nullable = true)
 |-- avg_paywall: double (nullable = true)
 |-- avg_refund: double (nullable = true)
 |-- avg_renewal: double (nullable = true)
 |-- avg_stickiness_ratio: double (nullable = true)
 |-- avg_subscribe: double (nullable = true)
 |-- cohort_index: long (nullable = true)
 |-- cohort_season: string (nullable = true)
 |-- cohort_size: long (nullable = true)
 |-- first_event_date: string (nullable = true)
 |-- iOS: double (nullable = true)
 |-- iPadOS: double (nullable = true)
 |-- mean_event_hour: double (nullable = true)
 |-- mean_revenue_15d: double (nullable = true)
 |-- mean_revenue_1y: double (nullable = true)
 |-- std_auto_renew_off: double (nullable = true)
 |-- std_event_hour: double (nullable = true)
 |-- std_free_trial: double (nullable = true)
 |-- std_paywall: double (nullable = true)
 |-- std_refund: double (nullable = true)
 |-- std_renewal: double (nullable = true)
 |-- std_reve

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

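1) Time-aware train/test split (80/20 by cohort_index): cohorts whose cohort_index is at most 80% of the maximum index go to train, the remaining later cohorts go to test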

In [7]:
# Largest cohort_index in the data; the cut-off is 80% of it (truncated to int).
max_idx = df_with_cohort.select(F.max("cohort_index")).first()[0]
split_idx = int(max_idx * 0.8)

# Training rows: cohorts up to the cut-off, shuffled via a seeded random column.
train = (
    df_with_cohort
    .where(F.col("cohort_index") <= split_idx)
    .withColumn("rand", F.rand(seed=12345))
    .orderBy("rand")
)

# Held-out rows: every cohort after the cut-off.
test = df_with_cohort.where(F.col("cohort_index") > split_idx)

                                                                                

 2) Preprocessing pipeline (fit once)

In [8]:
from pyspark import StorageLevel

In [9]:
# Encode the season label: index it first, then one-hot encode the index.
# handleInvalid="keep" keeps labels that were not seen during fitting
# instead of raising at transform time.
si_season = StringIndexer(
    inputCol="cohort_season",
    outputCol="season_idx",
    handleInvalid="keep",
)
oh_season = OneHotEncoder(
    inputCols=["season_idx"],
    outputCols=["season_vec"],
)

# Columns assembled into the model's feature vector, in this exact order.
# NOTE(review): mean_revenue_1y / std_revenue_1y are used as inputs while the
# labels are the 15-day revenue statistics; confirm this is not target leakage.
feature_cols = [
    "cohort_size",
    "iOS",
    "iPadOS",
    "avg_auto_renew_off",
    "std_auto_renew_off",
    "avg_free_trial",
    "std_free_trial",
    "avg_paywall",
    "std_paywall",
    "avg_refund",
    "std_refund",
    "avg_renewal",
    "std_renewal",
    "avg_subscribe",
    "std_subscribe",
    "mean_event_hour",
    "std_event_hour",
    "mean_revenue_1y",
    "std_revenue_1y",
    "season_vec",
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
preproc = Pipeline(stages=[si_season, oh_season, assembler])

# The helper shuffle column must not reach the model; drop it before fitting.
train = train.drop("rand")

# Fit the preprocessing stages on the training rows only, then apply the
# fitted model to both splits and keep the results cached (spilling to disk).
pp_model = preproc.fit(train)
train_pp = pp_model.transform(train).persist(StorageLevel.MEMORY_AND_DISK)
test_pp = pp_model.transform(test).persist(StorageLevel.MEMORY_AND_DISK)

25/07/27 00:06:38 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


3) Define two GBT regressors with explicit prediction columns

In [10]:
# Regressor for the 15-day mean revenue; writes its output to "pred15_mean".
gbt_mean = (
    GBTRegressor(seed=12345)
    .setFeaturesCol("features")
    .setLabelCol("mean_revenue_15d")
    .setPredictionCol("pred15_mean")
)

# Regressor for the 15-day revenue std; writes its output to "pred15_std".
# Distinct prediction columns let both models score the same DataFrame.
gbt_std = (
    GBTRegressor(seed=12345)
    .setFeaturesCol("features")
    .setLabelCol("std_revenue_15d")
    .setPredictionCol("pred15_std")
)

 4) Evaluators

In [11]:
# Mean-absolute-error evaluators, one per target/prediction column pair.
evaluator_mean = RegressionEvaluator(
    metricName="mae", labelCol="mean_revenue_15d", predictionCol="pred15_mean"
)
evaluator_std = RegressionEvaluator(
    metricName="mae", labelCol="std_revenue_15d", predictionCol="pred15_std"
)

5) Hyperparameter grids

In [12]:
# Search only over tree depth (shallow vs. a bit deeper); the iteration count
# and step size are pinned to a single low-cost value each.
paramGrid_mean = (
    ParamGridBuilder()
    .addGrid(gbt_mean.maxDepth, [3, 5])
    .baseOn({gbt_mean.maxIter: 20, gbt_mean.stepSize: 0.1})
    .build()
)

# Same grid for the std model, bound to that estimator's own params.
paramGrid_std = (
    ParamGridBuilder()
    .addGrid(gbt_std.maxDepth, [3, 5])
    .baseOn({gbt_std.maxIter: 20, gbt_std.stepSize: 0.1})
    .build()
)

6) Cross-validators

In [13]:
# 3-fold cross-validators, one per target. Each selects the best param map
# from its grid using the matching MAE evaluator.
#
# A fixed seed is passed so the random fold assignment, and therefore the
# selected model and the reported metrics, is reproducible across kernel
# restarts. Without it the default seed is not guaranteed to be stable
# between Python processes. The value matches the seed used for the
# regressors.
cv_mean = CrossValidator(
    estimator=gbt_mean,
    estimatorParamMaps=paramGrid_mean,
    evaluator=evaluator_mean,
    numFolds=3,
    seed=12345
)

cv_std = CrossValidator(
    estimator=gbt_std,
    estimatorParamMaps=paramGrid_std,
    evaluator=evaluator_std,
    numFolds=3,
    seed=12345
)

In [14]:
# Allow up to 4 param-map fits to be evaluated concurrently per validator.
# The last expression returns the validator itself, which the cell echoes.
cv_mean.setParallelism(4)
cv_std.setParallelism(4)

CrossValidator_a8ae3d77c3f2

7) Train both models on preprocessed training data

In [15]:
# Run cross-validation for the mean-revenue model on the preprocessed training data.
cv_model_mean = cv_mean.fit(train_pp)

25/07/27 00:08:20 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/07/27 00:09:01 WARN BlockManager: Asked to remove block broadcast_579_piece0, which does not exist
                                                                                

In [16]:
# Run cross-validation for the revenue-std model on the same preprocessed training data.
cv_model_std  = cv_std.fit(train_pp)

8) Generate predictions on preprocessed test data

In [17]:
# Score the held-out cohorts with both models: the mean model adds
# "pred15_mean", then the std model adds "pred15_std" to the same frame.
preds = cv_model_std.transform(cv_model_mean.transform(test_pp))

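9) Annualize the 15-day forecasts if desired (the mean is scaled linearly by 365/15; the std is scaled by sqrt(365/15), which assumes independent 15-day periods)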

In [18]:
# Number of 15-day periods in a year, and its square root for scaling the std.
factor = 365.0 / 15.0
std_factor = factor ** 0.5

# Add the annualized columns next to the 15-day predictions.
preds = (
    preds
    .withColumn("forecast_mean_1y", F.col("pred15_mean") * factor)
    .withColumn("forecast_std_1y", F.col("pred15_std") * std_factor)
)

 preds now contains:
   - pred15_mean, pred15_std
   - forecast_mean_1y, forecast_std_1y
   - plus all original cohort features for analysis

10) Evaluate

In [19]:
# MAE of each 15-day prediction column against its label on the scored test rows.
mae_mean = evaluator_mean.evaluate(preds)
mae_std = evaluator_std.evaluate(preds)

                                                                                

In [20]:
# Report the absolute errors (same units as the 15-day labels).
print("15-day MAE (mean):", mae_mean)
print("15-day MAE (std) :", mae_std)

15-day MAE (mean): 0.020836992115998255
15-day MAE (std) : 0.09661682129958578


1) Compute both averages together

In [21]:
# Average of each true label over the test rows, computed in a single pass.
mean_vals = test_pp.agg(
    F.mean("mean_revenue_15d").alias("mean_true_mean"),
    F.mean("std_revenue_15d").alias("mean_true_std"),
).first()

# The Row holds the two aggregates in the order they were requested above.
mean_true_mean, mean_true_std = mean_vals

In [22]:
# 2) Express each MAE as a fraction of the average true label value.
relative_mae_mean = mae_mean / mean_true_mean
relative_mae_std  = mae_std  / mean_true_std

3) Print as percentages

In [23]:
# The ".2%" format multiplies the fraction by 100 and shows two decimals.
print(f"Relative MAE (mean): {relative_mae_mean:.2%}")
print(f"Relative MAE (std) : {relative_mae_std:.2%}")

Relative MAE (mean): 4.12%
Relative MAE (std) : 3.41%


# Write the output

preds is the DataFrame obtained after scoring and annualizing:
it contains all cohort features plus forecast_mean_1y & forecast_std_1y.

Select exactly the columns wanted in the output

In [24]:
# List every column available on the scored frame before choosing the output columns.
preds.printSchema()

root
 |-- avg_auto_renew_off: double (nullable = true)
 |-- avg_free_trial: double (nullable = true)
 |-- avg_paywall: double (nullable = true)
 |-- avg_refund: double (nullable = true)
 |-- avg_renewal: double (nullable = true)
 |-- avg_stickiness_ratio: double (nullable = true)
 |-- avg_subscribe: double (nullable = true)
 |-- cohort_index: long (nullable = true)
 |-- cohort_season: string (nullable = true)
 |-- cohort_size: long (nullable = true)
 |-- first_event_date: string (nullable = true)
 |-- iOS: double (nullable = true)
 |-- iPadOS: double (nullable = true)
 |-- mean_event_hour: double (nullable = true)
 |-- mean_revenue_15d: double (nullable = true)
 |-- mean_revenue_1y: double (nullable = true)
 |-- std_auto_renew_off: double (nullable = true)
 |-- std_event_hour: double (nullable = true)
 |-- std_free_trial: double (nullable = true)
 |-- std_paywall: double (nullable = true)
 |-- std_refund: double (nullable = true)
 |-- std_renewal: double (nullable = true)
 |-- std_reve

In [25]:
# Keep only the columns wanted in the exported file, in this order.
output_df = preds.select([
    # cohort identifiers
    "cohort_index",
    "first_event_date",
    "cohort_season",
    "cohort_size",
    # original cohort features
    "iOS",
    "iPadOS",
    "avg_stickiness_ratio",
    "avg_auto_renew_off",
    "std_auto_renew_off",
    "avg_free_trial",
    "std_free_trial",
    "avg_paywall",
    "std_paywall",
    "avg_refund",
    "std_refund",
    "avg_renewal",
    "std_renewal",
    "avg_subscribe",
    "std_subscribe",
    "mean_event_hour",
    "std_event_hour",
    "mean_revenue_1y",
    "std_revenue_1y",
    # 1-year forecasts
    "forecast_mean_1y",
    "forecast_std_1y",
])

In [26]:
# Confirm the selected output columns and their types.
output_df.printSchema()

root
 |-- cohort_index: long (nullable = true)
 |-- first_event_date: string (nullable = true)
 |-- cohort_season: string (nullable = true)
 |-- cohort_size: long (nullable = true)
 |-- iOS: double (nullable = true)
 |-- iPadOS: double (nullable = true)
 |-- avg_stickiness_ratio: double (nullable = true)
 |-- avg_auto_renew_off: double (nullable = true)
 |-- std_auto_renew_off: double (nullable = true)
 |-- avg_free_trial: double (nullable = true)
 |-- std_free_trial: double (nullable = true)
 |-- avg_paywall: double (nullable = true)
 |-- std_paywall: double (nullable = true)
 |-- avg_refund: double (nullable = true)
 |-- std_refund: double (nullable = true)
 |-- avg_renewal: double (nullable = true)
 |-- std_renewal: double (nullable = true)
 |-- avg_subscribe: double (nullable = true)
 |-- std_subscribe: double (nullable = true)
 |-- mean_event_hour: double (nullable = true)
 |-- std_event_hour: double (nullable = true)
 |-- mean_revenue_1y: double (nullable = true)
 |-- std_revenue

In [27]:
# One-row frame with the number of null values in each output column:
# isNull() gives a boolean per row, which is cast to 0/1 and summed.
null_counts = output_df.select(
    [F.sum(F.col(name).isNull().cast("int")).alias(name) for name in output_df.columns]
)

In [28]:
# Display the per-column null counts.
null_counts.show()

+------------+----------------+-------------+-----------+---+------+--------------------+------------------+------------------+--------------+--------------+-----------+-----------+----------+----------+-----------+-----------+-------------+-------------+---------------+--------------+---------------+--------------+----------------+---------------+
|cohort_index|first_event_date|cohort_season|cohort_size|iOS|iPadOS|avg_stickiness_ratio|avg_auto_renew_off|std_auto_renew_off|avg_free_trial|std_free_trial|avg_paywall|std_paywall|avg_refund|std_refund|avg_renewal|std_renewal|avg_subscribe|std_subscribe|mean_event_hour|std_event_hour|mean_revenue_1y|std_revenue_1y|forecast_mean_1y|forecast_std_1y|
+------------+----------------+-------------+-----------+---+------+--------------------+------------------+------------------+--------------+--------------+-----------+-----------+----------+----------+-----------+-----------+-------------+-------------+---------------+--------------+------------

In [29]:
# Preview the first 10 output rows without truncating cell values.
output_df.show(10, truncate=False)

+------------+----------------+-------------+-----------+------------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----------+----------+--------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|cohort_index|first_event_date|cohort_season|cohort_size|iOS               |iPadOS              |avg_stickiness_ratio|avg_auto_renew_off |std_auto_renew_off |avg_free_trial     |std_free_trial     |avg_paywall       |std_paywall       |avg_refund|std_refund|avg_renewal         |std_renewal        |avg_subscribe       |std_subscribe      |mean_event_hour   |std_event_hour    |mean_revenue_1y   |std_revenue_1y    |forecast_mean_1y  |forecast_std_1y   |
+------------+----------------+-------------+-----------+------------------+--------------

In [30]:
# 1) Repartition the output to the context's default parallelism so the
#    write below is spread over that many partitions.
num_parts = spark.sparkContext.defaultParallelism
df_out = output_df.repartition(num_parts)
# Print the resulting partition count as a sanity check.
print("After:", df_out.rdd.getNumPartitions())

After: 8


In [31]:
# 2) Write the result as gzip-compressed JSON, replacing any previous output
#    at the target location.
df_out.write.json(
    "/Users/macbookpro/PyCharmMiscProject/output_cohort_json",
    mode="overwrite",
    compression="gzip",
)

                                                                                