In [None]:
# Disabled bootstrap: when enabled, findspark is initialised with the Spark
# installation path '/opt/spark'.
# NOTE(review): presumably only required when pyspark is not already importable
# from the kernel's environment — confirm, then either re-enable or drop this cell.
# import findspark
# findspark.init('/opt/spark')

In [None]:
from config import GOLD_FEATURES_PATH, MODEL_OUTPUT_PATH

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lead
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Build (or reuse) a Spark session running on the "local" master, with the
# Delta Lake package, SQL extension and catalog registered on the builder.
delta_settings = {
    "spark.jars.packages": "io.delta:delta-spark_2.12:3.0.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

session_builder = (
    SparkSession.builder
    .appName("mini_projet_model_training")
    .master("local")
)
# Apply the Delta settings in the order they are declared above.
for setting_key, setting_value in delta_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

spark = session_builder.getOrCreate()

In [None]:
# Read the feature table stored in Delta format at the configured path.
gold_features_path = GOLD_FEATURES_PATH
delta_reader = spark.read.format("delta")
df = delta_reader.load(gold_features_path)

# One window per symbol, rows ordered by the `end` field of the `window` column.
window_spec = Window.partitionBy("symbol").orderBy("window.end")

# next_price = avg_price of the following row within the same symbol
# (null on the last row of each symbol, where there is no following row).
next_price_col = lead("avg_price", 1).over(window_spec)
df_labeled = df.withColumn("next_price", next_price_col)
df_labeled.show()

In [None]:
# Binary target: 1 when next_price is strictly greater than avg_price, else 0.
# The comparison is null wherever next_price is null, so the label is null there too.
price_increases = col("next_price") > col("avg_price")
df_labeled = df_labeled.withColumn("label", price_increases.cast("integer"))
df_labeled.show()

In [None]:
# Replacement used for missing volatility values.
# NOTE(review): the value 4 is not justified anywhere visible in this notebook —
# confirm it is intentional.
VOLATILITY_FILL_VALUE = 4

volatility_missing = col("volatility").isNull()
next_price_missing = col("next_price").isNull()

# Show rows with a null volatility, impute them, then show again to verify
# that none are left.
df_labeled.where(volatility_missing).show()
df_labeled = df_labeled.fillna({"volatility": VOLATILITY_FILL_VALUE})
df_labeled.where(volatility_missing).show()

# Rows with a null next_price are displayed, dropped, and the check is repeated.
df_labeled.where(next_price_missing).show()
df_labeled = df_labeled.where(col("next_price").isNotNull())
df_labeled.where(next_price_missing).show()

In [None]:
# Columns that are packed into the single "features" vector column.
features_cols = ["avg_price", "avg_volume", "volatility"]

vector_assembler = (
    VectorAssembler()
    .setInputCols(features_cols)
    .setOutputCol("features")
)

# Append the assembled vector column to the labeled frame and preview it.
df_prepared = vector_assembler.transform(df_labeled)
df_prepared.show()

In [None]:
# 80/20 random split; the fixed seed keeps the split repeatable.
# NOTE(review): rows are per-symbol windows ordered in time and the label is
# derived from the following row, so a random split can place later windows in
# train and earlier ones in test — consider a chronological split; confirm intent.
train_data, test_data = df_prepared.randomSplit([0.8, 0.2], seed=42)

# The forest's bootstrap sampling and feature subsetting are random: seed the
# estimator too so that re-running the notebook yields the same model and metrics.
# The estimator gets its own name so `model` only ever refers to the fitted model.
rf_classifier = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=60,
    seed=42,
)
model = rf_classifier.fit(train_data)

In [None]:
# Score the test split with the fitted model and display the columns of
# interest without truncating their content.
predictions = model.transform(test_data)
inspected_columns = ["features", "label", "probability", "prediction"]
predictions.select(*inspected_columns).show(truncate=False)

In [None]:
# Evaluator computing the area under the ROC curve from the rawPrediction column.
model_evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol='label',
    metricName="areaUnderROC"
)

# Count the scored rows once, up front. An empty test split would otherwise
# surface later as an opaque ZeroDivisionError in the accuracy computation,
# so fail early with an explicit message instead.
total_predictions = predictions.count()
if total_predictions == 0:
    raise ValueError(
        "The test split is empty: ROC AUC and accuracy cannot be computed."
    )

roc_auc = model_evaluator.evaluate(predictions)

# Accuracy = share of rows whose predicted class equals the label.
correct_predictions = predictions.filter(
    predictions["label"] == predictions["prediction"]
).count()
model_accuracy = correct_predictions / total_predictions

print(f"ROC AUC : {roc_auc}")
print(f"Accuracy : {model_accuracy}")

In [None]:
# Persist the fitted model at the configured location, replacing any model
# previously saved at that path.
model_output_path = MODEL_OUTPUT_PATH

model_writer = model.write().overwrite()
model_writer.save(model_output_path)