### Importing Libraries

In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import datetime
from pyspark.sql.functions import col, sum
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, monotonically_increasing_id, lit, date_add, explode
import numpy as np
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
warnings.filterwarnings('ignore')


### Build Spark Session

In [2]:
# Create a SparkSession
spark = SparkSession.builder \
    .appName("Netflix Stock Price Forecasting") \
    .getOrCreate()
data_url = r"C:\Users\ADMIN\Desktop\Stock-Price-Forecasting-System-Using-Apache-Spark\data\processed\data.csv"    
# Load the data
df = spark.read.csv(data_url, header=True, inferSchema=True)
df.show(5)

+----------+----------+----------+----------+----------+--------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------------------+----------+
|      Date|      Open|      High|       Low|     Close|  Volume|       Price_Range|       Daily_Change|             MA_10|             MA_50|               RSI|          Upper_BB|          Lower_BB|          Stoch_Osc|    Target|
+----------+----------+----------+----------+----------+--------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------------------+----------+
|2018-02-07|266.579987|272.450012|264.329987|264.559998| 8981500| 8.120024999999998|-2.0199890000000096| 261.5133313333333| 261.5133313333333|  90.8082236708388| 274.1299184650964|248.89674420157024|  71.96474097564031|250.100006|
|2018-02-08|267.079987|267.619995|     250.0|250.100006| 9306700|17.61999500

In [3]:
print(f"Rows: {df.count()}, Columns: {len(df.columns)}")

Rows: 1006, Columns: 15


In [4]:
features_col = ["Open", "High", "Low", "Volume", "Price_Range", "Daily_Change", 
                "MA_10", "MA_50", "RSI", "Upper_BB", "Lower_BB", "Stoch_Osc"]

In [5]:
# Split data chronologically (80% train, 20% test)
total_rows = df.count()
train_rows = int(total_rows * 0.8)
train_df = df.orderBy("Date").limit(train_rows)
test_df = df.orderBy("Date").exceptAll(train_df)

In [6]:
df = df.withColumn("Date", col("Date").cast("timestamp"))
df.show(5)

+-------------------+----------+----------+----------+----------+--------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------------------+----------+
|               Date|      Open|      High|       Low|     Close|  Volume|       Price_Range|       Daily_Change|             MA_10|             MA_50|               RSI|          Upper_BB|          Lower_BB|          Stoch_Osc|    Target|
+-------------------+----------+----------+----------+----------+--------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------------------+----------+
|2018-02-07 00:00:00|266.579987|272.450012|264.329987|264.559998| 8981500| 8.120024999999998|-2.0199890000000096| 261.5133313333333| 261.5133313333333|  90.8082236708388| 274.1299184650964|248.89674420157024|  71.96474097564031|250.100006|
|2018-02-08 00:00:00|267.079987|267.6199

In [7]:
print(f"Training set row count: {train_df.count()}")
print(f"Testing set row count: {test_df.count()}")

Training set row count: 804
Testing set row count: 202


In [8]:
# create pipeline to process data
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline
assembler = VectorAssembler(inputCols=features_col, outputCol='features')
scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')
pipeline = Pipeline(stages=[assembler, scaler])

# transform data 
transformer = pipeline.fit(train_df)
train_set = transformer.transform(train_df).select("Date",'scaled_features', 'Close')
test_set = transformer.transform(test_df).select('Date','scaled_features', 'Close')

train_set.show(5)

+----------+--------------------+----------+
|      Date|     scaled_features|     Close|
+----------+--------------------+----------+
|2018-02-07|[0.09370782339566...|264.559998|
|2018-02-08|[0.09514242007289...|250.100006|
|2018-02-09|[0.05718304650801...|249.470001|
|2018-02-12|[0.05227670578752...|257.950012|
|2018-02-13|[0.06705308025494...|258.269989|
+----------+--------------------+----------+
only showing top 5 rows



In [9]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

linear_regressor = LinearRegression(
    featuresCol='scaled_features', 
    labelCol='Close')
evaluator = RegressionEvaluator(
    labelCol='Close', 
    predictionCol='prediction', 
    metricName='rmse')

paramGrid = ParamGridBuilder() \
    .addGrid(linear_regressor.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(linear_regressor.elasticNetParam, [0.0, 0.5, 1.0]) \
    .addGrid(linear_regressor.maxIter, [10, 100, 200]) \
    .build()


In [10]:
crossval = CrossValidator(
    estimator=linear_regressor,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds = 3
)

# Fit the cross-validator to find the best moel
cv_model = crossval.fit(train_set)

# Get the best model from cross validation
best_model = cv_model.bestModel

print("Best Model Params:")
print("  Regularization Param (regParam):", best_model.getRegParam())
print("  ElasticNet Param (elasticNetParam):", best_model.getElasticNetParam())
print("  Maximum Iterations (maxIter):", best_model.getMaxIter())

Best Model Params:
  Regularization Param (regParam): 0.01
  ElasticNet Param (elasticNetParam): 0.0
  Maximum Iterations (maxIter): 10


In [11]:
prediction_test = best_model.transform(test_set)

# Evaluate the model
evaluator = RegressionEvaluator(
    labelCol='Close', 
    predictionCol='prediction', 
    metricName='rmse')

# Calculate metrics
mse = evaluator.evaluate(prediction_test, {evaluator.metricName: "mse"})
rmse = evaluator.evaluate(prediction_test, {evaluator.metricName: "rmse"})
mae = evaluator.evaluate(prediction_test, {evaluator.metricName: "mae"})
r2 = evaluator.evaluate(prediction_test, {evaluator.metricName: "r2"})

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R2 Score:",r2)

MSE: 0.12652211632295657
RMSE: 0.35569947472966074
MAE: 0.26686455528768627
R2 Score: 0.9999738877856017


In [12]:
predictions = test_set.join(prediction_test.select("Date","prediction"), on="Date", how="left")
predictions.show(5)

+----------+--------------------+----------+-----------------+
|      Date|     scaled_features|     Close|       prediction|
+----------+--------------------+----------+-----------------+
|2021-07-21|[0.83840701019224...|513.630005|513.4410903003127|
|2021-08-05|[0.81258427000206...|524.890015|524.7955351594778|
|2021-10-21|[1.13324534798888...|653.159973| 653.110173022169|
|2021-06-23|[0.78776576470111...| 512.73999|513.0245687530501|
|2021-08-18|[0.82081884058341...|521.869995|522.1324075992263|
+----------+--------------------+----------+-----------------+
only showing top 5 rows



In [17]:
from sklearn.linear_model import LinearRegression
import joblib

sklearn_model = LinearRegression()
sklearn_model.coef_ = best_model.coefficients.toArray()
sklearn_model.intercept_ = best_model.intercept

model_path = "../model/linear_regressor_sklearn.pkl"
joblib.dump(sklearn_model, model_path)

print(f"Model saved at {model_path}")

Model saved at ../model/linear_regressor_sklearn.pkl
