# Training and evaluation of the model

BRAUX Owen and CAMBIER Elliot

    This notebook prepares the final dataset for machine learning, defines a prediction target, trains a model, and evaluates it.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lead, when, window, sum, count, avg, from_unixtime, max, stddev
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import GBTClassifier 

In [2]:
spark = SparkSession.builder \
    .appName("BDA - Modeling and Evaluation") \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/15 11:00:45 WARN Utils: Your hostname, OBPC, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/11/15 11:00:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/15 11:00:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load

In [3]:
features_path = "../data/processed/features.parquet"
# Load the data to not recalculate the join
df_ml = spark.read.parquet(features_path)

print("Successfully loaded feature data.")
df_ml.printSchema()
df_ml.show(5)

                                                                                

Successfully loaded feature data.
root
 |-- timestamp_utc: timestamp (nullable = true)
 |-- unix_timestamp: double (nullable = true)
 |-- price_open: double (nullable = true)
 |-- price_high: double (nullable = true)
 |-- price_low: double (nullable = true)
 |-- price_close: double (nullable = true)
 |-- volume_btc: double (nullable = true)
 |-- tx_count: long (nullable = true)
 |-- tx_volume_btc: double (nullable = true)
 |-- avg_inputs: double (nullable = true)
 |-- avg_outputs: double (nullable = true)



[Stage 1:>                                                          (0 + 1) / 1]

+-------------------+--------------+----------+----------+---------+-----------+----------+--------+-------------+----------+-----------+
|      timestamp_utc|unix_timestamp|price_open|price_high|price_low|price_close|volume_btc|tx_count|tx_volume_btc|avg_inputs|avg_outputs|
+-------------------+--------------+----------+----------+---------+-----------+----------+--------+-------------+----------+-----------+
|2017-11-11 17:11:00|  1.51041666E9|   6611.91|   6611.92|   6589.2|     6589.2|2.49028453|       0|          0.0|       0.0|        0.0|
|2017-11-11 17:31:00|  1.51041786E9|   6568.93|   6582.41|  6568.93|    6582.41|0.10011463|       0|          0.0|       0.0|        0.0|
|2017-11-11 20:06:00|  1.51042716E9|    6563.0|   6564.03|   6540.0|     6540.0|5.27926227|       0|          0.0|       0.0|        0.0|
|2017-11-11 20:52:00|  1.51042992E9|    6454.0|   6484.83|   6454.0|    6467.43|8.90806335|       0|          0.0|       0.0|        0.0|
|2017-11-12 08:27:00|  1.51047162E

                                                                                

## Prediction Target

In [4]:
# Define the prediction horizon and threshold
prediction_horizon = 10  # in minutes
price_increase_threshold = 0.001  # 0.1%

# Use a Window function to get the future price
window_spec = Window.orderBy("timestamp_utc")
df_with_future_price = df_ml.withColumn( # <-- Using df_ml now
    "future_price",
    lead(col("price_close"), prediction_horizon).over(window_spec)
)

# label
df_labeled = df_with_future_price.withColumn(
    "label",
    when(col("future_price") > col("price_close") * (1 + price_increase_threshold), 1)
    .otherwise(0)
)

# Removing rows where we couldn't calculate the label
df_final_ml = df_labeled.na.drop(subset=["future_price"])

print(f" Final ML-ready data with label :")
df_final_ml.select("timestamp_utc", "price_close", "future_price", "label", "tx_count").sort("timestamp_utc", ascending=False).show(15)
print("\n Label Distribution :")
df_final_ml.groupBy("label").count().show()

 Final ML-ready data with label :


25/11/15 11:01:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

+-------------------+-----------+------------+-----+--------+
|      timestamp_utc|price_close|future_price|label|tx_count|
+-------------------+-----------+------------+-----+--------+
|2025-10-14 01:46:00|   115109.0|    115242.0|    1|       0|
|2025-10-14 01:45:00|   115134.0|    115217.0|    0|       0|
|2025-10-14 01:44:00|   115282.0|    115221.0|    0|       0|
|2025-10-14 01:43:00|   115293.0|    115218.0|    0|       0|
|2025-10-14 01:42:00|   115265.0|    115232.0|    0|       0|
|2025-10-14 01:41:00|   115288.0|    115213.0|    0|       0|
|2025-10-14 01:40:00|   115437.0|    115211.0|    0|       0|
|2025-10-14 01:39:00|   115463.0|    115174.0|    0|       0|
|2025-10-14 01:38:00|   115458.0|    115159.0|    0|       0|
|2025-10-14 01:37:00|   115427.0|    115107.0|    0|       0|
|2025-10-14 01:36:00|   115437.0|    115109.0|    0|       0|
|2025-10-14 01:35:00|   115434.0|    115134.0|    0|       0|
|2025-10-14 01:34:00|   115484.0|    115282.0|    0|       0|
|2025-10

25/11/15 11:01:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 7:>                                                          (0 + 1) / 1]

+-----+-------+
|label|  count|
+-----+-------+
|    0|5334622|
|    1|1914004|
+-----+-------+



                                                                                

## Feature engineering

In [5]:
rolling_window_1h = Window.orderBy("timestamp_utc").rowsBetween(-60, 0)

df_with_features = df_final_ml.withColumn(
    # Price momentum 
    "price_1h_avg", avg("price_close").over(rolling_window_1h)
).withColumn(
    "price_momentum", col("price_close") / col("price_1h_avg") 
).withColumn(
    # On-chain activity momentum
    "tx_volume_btc_1h_avg", avg("tx_volume_btc").over(rolling_window_1h)
).withColumn(
    "tx_count_1h_max", max("tx_count").over(rolling_window_1h) # Spike detection
)

df_with_features = df_with_features.na.drop(subset=[
    "price_momentum",
    "tx_volume_btc_1h_avg",
    "tx_count_1h_max"
])
df_with_features.select(
    "timestamp_utc",
    "price_close",
    "price_momentum",
    "tx_volume_btc",
    "tx_volume_btc_1h_avg",
    "tx_count_1h_max"
).sort("timestamp_utc", ascending=False).show()

25/11/15 11:01:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 1

+-------------------+-----------+------------------+-------------+--------------------+---------------+
|      timestamp_utc|price_close|    price_momentum|tx_volume_btc|tx_volume_btc_1h_avg|tx_count_1h_max|
+-------------------+-----------+------------------+-------------+--------------------+---------------+
|2025-10-14 01:46:00|   115109.0|0.9966745572830502|          0.0|                 0.0|              0|
|2025-10-14 01:45:00|   115134.0|0.9968156057459893|          0.0|                 0.0|              0|
|2025-10-14 01:44:00|   115282.0|0.9980214707535741|          0.0|                 0.0|              0|
|2025-10-14 01:43:00|   115293.0|0.9980652822831627|          0.0|                 0.0|              0|
|2025-10-14 01:42:00|   115265.0|0.9977778641839217|          0.0|                 0.0|              0|
|2025-10-14 01:41:00|   115288.0|0.9979225815386746|          0.0|                 0.0|              0|
|2025-10-14 01:40:00|   115437.0|0.9991573012982016|          0.

                                                                                

## Build and Evaluate the Machine Learning Pipeline

In [6]:
# Select the features to be used by the model.
feature_cols = [
    "price_close",
    "tx_count",
    "tx_volume_btc",
    "price_momentum",
    "tx_volume_btc_1h_avg",
    "tx_count_1h_max"          
]

# 2. Split the data for training and test
(training_data, test_data) = df_with_features.randomSplit([0.8, 0.2], seed=42)
print("Data split into training and testing sets :")
print(f" - Training set count: {training_data.count():,}")
print(f" - Test set count: {test_data.count():,}")

# we use VectorAssembler to combines our feature columns into a single vector column
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)

# Models (need to change bcs rn it's kinda bad).
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="label"
)

pipeline = Pipeline(stages=[assembler, gbt])

# training
print("\nTraining the Logistic Regression model...")
model = pipeline.fit(training_data)
print("Training complete.")

# prediction
print("\nMaking predictions on the test set...")
predictions = model.transform(test_data)

# Sample of prediction
predictions.select("timestamp_utc", "price_close", "label", "prediction", "probability").show()

# Evaluate the model suing ROC
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC"
)

auc = evaluator.evaluate(predictions)
print(f"\nModel Performance on Test Set :")
print(f"Area Under ROC Curve (AUC) = {auc:.4f}")

Data split into training and testing sets :


25/11/15 11:01:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:01:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 1

 - Training set count: 5,798,902


25/11/15 11:03:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:03:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:03:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:03:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

 - Test set count: 1,449,724

Training the Logistic Regression model...


25/11/15 11:04:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:04:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:04:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:04:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:04:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:04:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 1

Training complete.

Making predictions on the test set...


25/11/15 11:10:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:10:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:10:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:10:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:10:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:10:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 1

+-------------------+-----------+-----+----------+--------------------+
|      timestamp_utc|price_close|label|prediction|         probability|
+-------------------+-----------+-----+----------+--------------------+
|2012-01-01 11:03:00|       4.58|    0|       0.0|[0.94031990819615...|
|2012-01-01 11:07:00|       4.58|    0|       0.0|[0.94031990819615...|
|2012-01-01 11:09:00|       4.58|    0|       0.0|[0.94031990819615...|
|2012-01-01 11:14:00|       4.58|    0|       0.0|[0.94031990819615...|
|2012-01-01 11:20:00|       4.58|    0|       0.0|[0.94031990819615...|
|2012-01-01 11:24:00|       4.58|    0|       0.0|[0.94031990819615...|
|2012-01-01 11:30:00|       4.58|    0|       0.0|[0.94031990819615...|
|2012-01-01 11:36:00|       4.58|    0|       0.0|[0.94031990819615...|
|2012-01-01 11:46:00|       4.58|    0|       0.0|[0.94031990819615...|
|2012-01-01 11:47:00|       4.58|    0|       0.0|[0.94031990819615...|
|2012-01-01 11:48:00|       4.58|    0|       0.0|[0.94031990819

25/11/15 11:11:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:11:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:11:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:11:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 331:>                                                        (0 + 1) / 1]


Model Performance on Test Set :
Area Under ROC Curve (AUC) = 0.6529


                                                                                

## Log and finals metrics : 

In [7]:
import csv
from datetime import datetime
import os

metrics_file = "../project_metrics_log.csv"
run_id = "baseline_logistic_regression_run_01"
timestamp = datetime.now().isoformat()

metrics_to_log = [
    {"run_id": run_id, "stage": "evaluation", "metric": "auc", "value": auc, "timestamp": timestamp},
    {"run_id": run_id, "stage": "data_split", "metric": "training_set_size", "value": training_data.count(), "timestamp": timestamp},
    {"run_id": run_id, "stage": "data_split", "metric": "test_set_size", "value": test_data.count(), "timestamp": timestamp}
]

file_exists = os.path.isfile(metrics_file)
with open(metrics_file, 'a', newline='') as csvfile:
    fieldnames = ["run_id", "stage", "metric", "value", "timestamp"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    if not file_exists:
        writer.writeheader()  
    for row in metrics_to_log:
        writer.writerow(row)

print(f"\n Metrics successfully logged to {metrics_file}")

25/11/15 11:12:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:12:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:12:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:12:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:12:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 11:12:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/15 1


 Metrics successfully logged to ../project_metrics_log.csv


                                                                                