# Model Training

In [32]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Configure the session step by step: first the application name and the
# cluster master URL, then the UI port and the extra package to fetch.
session_builder = (
    SparkSession.builder
    .appName("SparkSQLStructuredStreaming-Kafka")
    .master("spark://80d04dce9402:7077")
)
session_builder = (
    session_builder
    .config("spark.ui.port", "4040")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.4")
)

# getOrCreate() reuses an already-running session if there is one.
spark = session_builder.getOrCreate()
sc = spark.sparkContext

## Retrieve data from stream

In [33]:
# Directory holding the Parquet files this notebook trains on
parquet_path = "/home/jovyan/notebooks/data/streamdata/"

# Load every Parquet file under that directory into one DataFrame
parquet_df = spark.read.format("parquet").load(parquet_path)

                                                                                

In [34]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag, when

# Per-symbol window ordered by date, so lag() looks exactly one row back
window_spec = Window.partitionBy("Symbol").orderBy("Date")

# Attach the previous row's close price to every row
previous_close = lag("Close").over(window_spec)
parquet_df = parquet_df.withColumn("prev_close", previous_close)

# Binary label: 1 when the close is above the previous close, otherwise 0.
# Rows containing nulls (including the first row of each symbol, which has
# no previous close) are dropped, then only AAPL records are kept.
price_went_up = col("Close") > col("prev_close")
classified_df = (
    parquet_df
    .withColumn("label", when(price_went_up, 1).otherwise(0))
    .dropna()
    .filter(col("Symbol") == "AAPL")
)

## Training for Apple

In [35]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

# One seed shared by the split and the classifier so that a fresh
# "Restart & Run All" reproduces the same model and the same metrics.
SEED = 42

# Pack the numeric input columns into the single vector column that
# Spark ML estimators read, keeping only what training needs.
feature_cols = ["Open", "High", "Low", "Volume"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(classified_df).select("features", "label")

# Train/test split (80% / 20%, seeded)
# NOTE(review): this is a random split over date-ordered rows, so test rows
# can precede training rows in time — confirm that is acceptable here.
train, test = data.randomSplit([0.8, 0.2], seed=SEED)

# Train classifier. Random forests sample rows and features at random, so
# without an explicit seed each run fits a different model.
classifier = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    seed=SEED,
)
model = classifier.fit(train)

# Score the held-out rows and persist them, replacing any earlier output
predictions = model.transform(test)
predictions.write.mode("overwrite").parquet("/home/jovyan/notebooks/data/predictions/")



                                                                                

## Model Evaluation

In [36]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator per metric; each uses the evaluator's default label and
# prediction column names.
evaluator_accuracy, evaluator_f1, evaluator_precision, evaluator_recall = (
    MulticlassClassificationEvaluator(metricName=metric_name)
    for metric_name in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
)

# Score the test-set predictions with each evaluator
accuracy, f1, precision, recall = (
    evaluator.evaluate(predictions)
    for evaluator in (
        evaluator_accuracy,
        evaluator_f1,
        evaluator_precision,
        evaluator_recall,
    )
)

# Report the four scores, one per line
for line in (
    f"Accuracy: {accuracy}",
    f"F1 Score: {f1}",
    f"Precision: {precision}",
    f"Recall: {recall}",
):
    print(line)


[Stage 50:>                                                         (0 + 1) / 1]

Accuracy: 0.5238095238095238
F1 Score: 0.5238095238095238
Precision: 0.5333987141767462
Recall: 0.5238095238095238


                                                                                

In [37]:
import csv
import os

# Destination of the metrics report
metrics_file = "/home/jovyan/notebooks/data/evaluation_metrics.csv"

# Create the parent directory if it is not there yet
os.makedirs(os.path.dirname(metrics_file), exist_ok=True)

# Metric name -> value, in the order the rows are written
metrics = dict(
    [
        ("Accuracy", accuracy),
        ("F1 Score", f1),
        ("Precision", precision),
        ("Recall", recall),
    ]
)

# Mode 'w' truncates the file, so each run replaces the previous report
with open(metrics_file, mode='w', newline='') as file:
    rows = [["Metric", "Value"]]
    rows.extend([name, score] for name, score in metrics.items())
    csv.writer(file).writerows(rows)

print(f"Metrics saved to {metrics_file}")


Metrics saved to /home/jovyan/notebooks/data/evaluation_metrics.csv


In [38]:
# Shut down the SparkContext obtained from the session at the top of the
# notebook; Spark cells will not run after this until a session is rebuilt.
sc.stop()