In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
spark = (
  SparkSession.builder
    .appName("Codeway")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/26 23:42:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/26 23:42:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Read the dataframe

In [3]:
df_features = spark.read.json("/Users/macbookpro/PyCharmMiscProject/df_features_json")

                                                                                

In [4]:
# Order df_features chronologically by first_event_date
df_ordered = df_features.orderBy("first_event_date")

In [5]:
from pyspark.sql.window import Window

In [6]:
w = Window.orderBy("first_event_date")
df_ordered = df_ordered.withColumn("user_index", F.row_number().over(w))

# Pre-processing (very short) for User-level Analysis

In [7]:
df_ordered = df_ordered.drop("first_event_date")

In [8]:
df_ordered.printSchema()

root
 |-- auto_renew_off: long (nullable = true)
 |-- avg_event_hour: double (nullable = true)
 |-- country: string (nullable = true)
 |-- first_year_revenue: double (nullable = true)
 |-- free_trial: long (nullable = true)
 |-- operating_system: string (nullable = true)
 |-- paywall: long (nullable = true)
 |-- refund: long (nullable = true)
 |-- renewal: long (nullable = true)
 |-- season: string (nullable = true)
 |-- stickiness_ratio: double (nullable = true)
 |-- subscribe: long (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_index: integer (nullable = false)



In [17]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

0. ios = iOS

In [9]:
df_ordered = df_ordered.withColumn(
    "operating_system",
    F.when(F.lower(F.col("operating_system")) == "ios", F.lit("iOS"))
    .otherwise(F.col("operating_system"))
)

1) Time‐aware train/test split (80/20 by cohort_index)

In [10]:
# Find cutoff for 80/20 split
total_count = df_ordered.count()
split_row = int(total_count * 0.8)

                                                                                

In [11]:
train      = df_ordered.filter(F.col("user_index") <= split_row).withColumn("rand", F.rand(seed=12345)).orderBy("rand")
test       = df_ordered.filter(F.col("user_index")  > split_row)

 2) Preprocessing pipeline (fit once)

In [12]:
from pyspark import StorageLevel

In [13]:
df_ordered.printSchema()

root
 |-- auto_renew_off: long (nullable = true)
 |-- avg_event_hour: double (nullable = true)
 |-- country: string (nullable = true)
 |-- first_year_revenue: double (nullable = true)
 |-- free_trial: long (nullable = true)
 |-- operating_system: string (nullable = true)
 |-- paywall: long (nullable = true)
 |-- refund: long (nullable = true)
 |-- renewal: long (nullable = true)
 |-- season: string (nullable = true)
 |-- stickiness_ratio: double (nullable = true)
 |-- subscribe: long (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_index: integer (nullable = false)



In [18]:
# For "operating_system"
si_os = StringIndexer(inputCol="operating_system", outputCol="operating_system_idx", handleInvalid="keep")
oh_os = OneHotEncoder(inputCols=["operating_system_idx"], outputCols=["operating_system_vec"])

# For "country"
si_country = StringIndexer(inputCol="country", outputCol="country_idx", handleInvalid="keep")
oh_country = OneHotEncoder(inputCols=["country_idx"], outputCols=["country_vec"])

# For "season" (from your sample, adjusted for naming consistency)
si_season = StringIndexer(inputCol="season", outputCol="season_idx", handleInvalid="keep")
oh_season = OneHotEncoder(inputCols=["season_idx"], outputCols=["season_vec"])

feature_cols = [
    "auto_renew_off",
    "free_trial",
    "paywall",
    "refund",
    "renewal",
    "subscribe",
    "avg_event_hour",
    "total_revenue",
    "first_year_revenue",
    "stickiness_ratio",
    "season_vec",
    "country_vec",
    "operating_system_vec"
]
assembler  = VectorAssembler(inputCols=feature_cols, outputCol="features")

preproc    = Pipeline(stages=[
    si_os, oh_os,
    si_country, oh_country,
    si_season, oh_season
    , assembler])
train = train.drop("rand")
pp_model   = preproc.fit(train)

train_pp   = pp_model.transform(train).persist(StorageLevel.MEMORY_AND_DISK)
test_pp    = pp_model.transform(test).persist(StorageLevel.MEMORY_AND_DISK)

25/07/26 23:43:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/26 23:43:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/26 23:44:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/26 23:44:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/26 23:44:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/26 23:44:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/26 2

3) Define GBT regressor with explicit prediction columns

In [19]:
gbt = GBTRegressor(
    featuresCol="features",
    labelCol="total_revenue",
    predictionCol="pred15",
    seed=12345
)

 4) Evaluators

In [20]:
evaluator = RegressionEvaluator(
    labelCol="total_revenue",
    predictionCol="pred15",
    metricName="mae"
)

5) Hyperparameter grids

In [21]:
paramGrid = (ParamGridBuilder()
    .addGrid(gbt.maxDepth, [3])      # only one depth
    .addGrid(gbt.maxIter,  [10])     # lower number of boosting iterations
    .addGrid(gbt.stepSize, [0.1])    # keep existing, or try a bigger value for even faster training
    .build()
)

6) Cross-validators

In [22]:
cv = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3
)

In [23]:
cv.setParallelism(4)

CrossValidator_24f4f6eea8b6

7) Train the model on preprocessed training data

In [24]:
cv_model = cv.fit(train_pp)

25/07/26 23:45:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/26 23:45:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/26 23:45:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/26 23:45:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/26 23:48:08 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

8) Generate predictions on preprocessed test data

In [26]:
preds = cv_model.transform(test_pp)

9) Annualize the 15-day forecasts if desired

In [27]:
factor     = 365.0 / 15.0

preds = preds.withColumn("forecast_1y", F.col("pred15") * factor)

 preds now contains:
   - pred15
   - forecast_1y
   - plus all original user features for analysis

10) Evaluate

In [28]:
mae = evaluator.evaluate(preds)

25/07/26 23:55:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/26 23:55:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/26 23:55:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/26 23:55:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [29]:
print("15-day MAE:", mae)

15-day MAE: 0.016813497452963934


1) Compute both averages together

In [30]:
mean_vals = test_pp.agg(
    F.avg("total_revenue" ).alias("true_mean"),
).first()

true_mean = mean_vals["true_mean"]

In [31]:
relative_mae_mean = mae / true_mean

3) Print as percentages

In [32]:
print(f"Relative MAE (mean): {relative_mae_mean:.2%}")

Relative MAE (mean): 3.36%


preds is the DataFrame obtained after scoring and annualizing:
it contains all user features plus pred15 and forecast_1y.

Select exactly the columns wanted in the output

In [33]:
preds.printSchema()

root
 |-- auto_renew_off: long (nullable = true)
 |-- avg_event_hour: double (nullable = true)
 |-- country: string (nullable = true)
 |-- first_year_revenue: double (nullable = true)
 |-- free_trial: long (nullable = true)
 |-- operating_system: string (nullable = true)
 |-- paywall: long (nullable = true)
 |-- refund: long (nullable = true)
 |-- renewal: long (nullable = true)
 |-- season: string (nullable = true)
 |-- stickiness_ratio: double (nullable = true)
 |-- subscribe: long (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_index: integer (nullable = false)
 |-- operating_system_idx: double (nullable = false)
 |-- operating_system_vec: vector (nullable = true)
 |-- country_idx: double (nullable = false)
 |-- country_vec: vector (nullable = true)
 |-- season_idx: double (nullable = false)
 |-- season_vec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- pred15: double (nullable = false)
 |-- fore

In [35]:
output_df = preds.select(
    "user_id",
    "operating_system_idx",
    "country_idx",
    "season_idx",
    "stickiness_ratio",
    "auto_renew_off",
    "free_trial",
    "paywall",
    "refund",
    "renewal",
    "subscribe",
    "avg_event_hour",
    "total_revenue",
    "first_year_revenue",

    # your 1-year forecasts
    "forecast_1y"
)

In [36]:
output_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- operating_system_idx: double (nullable = false)
 |-- country_idx: double (nullable = false)
 |-- season_idx: double (nullable = false)
 |-- stickiness_ratio: double (nullable = true)
 |-- auto_renew_off: long (nullable = true)
 |-- free_trial: long (nullable = true)
 |-- paywall: long (nullable = true)
 |-- refund: long (nullable = true)
 |-- renewal: long (nullable = true)
 |-- subscribe: long (nullable = true)
 |-- avg_event_hour: double (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- first_year_revenue: double (nullable = true)
 |-- forecast_1y: double (nullable = false)



In [37]:
null_counts = output_df.select([
    F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in output_df.columns
])

In [38]:
null_counts.show()

+-------+--------------------+-----------+----------+----------------+--------------+----------+-------+------+-------+---------+--------------+-------------+------------------+-----------+
|user_id|operating_system_idx|country_idx|season_idx|stickiness_ratio|auto_renew_off|free_trial|paywall|refund|renewal|subscribe|avg_event_hour|total_revenue|first_year_revenue|forecast_1y|
+-------+--------------------+-----------+----------+----------------+--------------+----------+-------+------+-------+---------+--------------+-------------+------------------+-----------+
|      0|                   0|          0|         0|               0|             0|         0|      0|     0|      0|        0|             0|            0|                 0|          0|
+-------+--------------------+-----------+----------+----------------+--------------+----------+-------+------+-------+---------+--------------+-------------+------------------+-----------+



                                                                                

In [42]:
# 1) repartition to use all cores
num_parts = spark.sparkContext.defaultParallelism
df_out = output_df.repartition(num_parts)
print("After:", df_out.rdd.getNumPartitions())

[Stage 607:>                                                        (0 + 1) / 1]

After: 8


In [44]:
# 2) write as compressed JSON
df_out.write \
    .mode("overwrite") \
    .option("compression", "gzip") \
    .json("/Users/macbookpro/PyCharmMiscProject/output_user_json")

                                                                                