# netflix stock price

In [429]:
import kagglehub
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col, regexp_replace
from pyspark.sql.types import DateType, LongType, FloatType

In [430]:
spark = SparkSession.builder.appName("neflix")\
                            .config("spark.driver.memory", '8g')\
                            .config("spark.excutor.memory", '8g')\
                            .getOrCreate()

## Data load

In [431]:
path = kagglehub.dataset_download("elnazalikarami/netflix-stock-price")

print("Path to dataset files:", path)

Path to dataset files: /home/jovyan/.cache/kagglehub/datasets/elnazalikarami/netflix-stock-price/versions/2


In [432]:
df = spark.read.csv(path, inferSchema=True, header=True)
df.toPandas()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,07/07/2025,1295.00,1295.00,1276.02,1289.62,2273737
1,07/03/2025,1292.00,1302.26,1279.76,1297.18,2006224
2,07/02/2025,1296.96,1299.95,1271.59,1284.86,3048801
3,07/01/2025,1338.23,1338.77,1282.22,1293.60,4993567
4,06/30/2025,1331.00,1341.15,1321.21,1339.13,3282908
...,...,...,...,...,...,...
245,07/12/2024,652.75,655.92,643.71,647.60,3355197
246,07/11/2024,672.49,672.49,647.44,652.75,5111928
247,07/10/2024,685.02,687.22,673.16,677.65,2651928
248,07/09/2024,690.00,695.27,684.72,685.74,2599474


In [433]:
df = df.withColumns({
    'Open': regexp_replace("Open", ",", ""),
    'High': regexp_replace("High", ",", ""),
    "Low": regexp_replace("Low", ",", ""),
    "Close": regexp_replace("Close", ",", ""),
    "Volume": regexp_replace("Volume", ",", ""),
})

df = df.withColumns({
    'Open': df.Open.cast(FloatType()), 'High': df.High.cast(FloatType()), "Low": df.Low.cast(FloatType()), "Close": df.Close.cast(FloatType()), "Volume": df.Volume.cast(FloatType())
})
print(df.toPandas)
df.toPandas()

<bound method PandasConversionMixin.toPandas of DataFrame[Date: string, Open: float, High: float, Low: float, Close: float, Volume: float]>


Unnamed: 0,Date,Open,High,Low,Close,Volume
0,07/07/2025,1295.000000,1295.000000,1276.020020,1289.619995,2273737.0
1,07/03/2025,1292.000000,1302.260010,1279.760010,1297.180054,2006224.0
2,07/02/2025,1296.959961,1299.949951,1271.589966,1284.859985,3048801.0
3,07/01/2025,1338.229980,1338.770020,1282.219971,1293.599976,4993567.0
4,06/30/2025,1331.000000,1341.150024,1321.209961,1339.130005,3282908.0
...,...,...,...,...,...,...
245,07/12/2024,652.750000,655.919983,643.710022,647.599976,3355197.0
246,07/11/2024,672.489990,672.489990,647.440002,652.750000,5111928.0
247,07/10/2024,685.020020,687.219971,673.159973,677.650024,2651928.0
248,07/09/2024,690.000000,695.270020,684.719971,685.739990,2599474.0


In [434]:
df = df.orderBy("Date").select("Open", "High", "Low", "Close", "Volume")
df.toPandas()

Unnamed: 0,Open,High,Low,Close,Volume
0,895.500000,898.580017,877.000000,886.729980,2315685.0
1,893.130005,898.830017,879.890015,881.049988,2970019.0
2,888.760010,892.830017,871.690002,881.789978,3457656.0
3,879.380005,888.000000,869.109985,879.190002,2649823.0
4,880.000000,886.219971,873.000000,875.000000,2347949.0
...,...,...,...,...,...
245,915.000000,935.849976,911.700012,932.119995,2320293.0
246,928.400024,930.489990,915.299988,924.140015,2342418.0
247,916.010010,918.130005,894.500000,907.549988,3226158.0
248,894.510010,908.229980,889.710022,900.429993,2202970.0


In [435]:
def learning(machine, data, param, isVassembler=False, Vparam=None):
    train_data, test_data = data.randomSplit([0.8, 0.2], seed=12)

    if isVassembler:
        vassembler = VectorAssembler(**Vparam)
        train_data = vassembler.transform(train_data)
        test_data = vassembler.transform(test_data)
    
    learning = machine(**param)
    model = learning.fit(train_data)
    
    predic = model.transform(test_data)
    
    print(f'rootMeanSquaredError: {model.summary.rootMeanSquaredError}')
    print(f"r2: {model.summary.r2}")
    predic.select("Close", "prediction").show()

In [436]:
Vparam = {"inputCols":["Open", "High", "Low", "Volume"], "outputCol":"features"}
param = {"featuresCol":'features', "labelCol":'Close'}

In [437]:
learning(LinearRegression, df, param, True, Vparam)

rootMeanSquaredError: 6.125910111503436
r2: 0.998982219497021
+------+-----------------+
| Close|       prediction|
+------+-----------------+
|609.57|617.0834259013324|
|624.85|634.0093619892782|
|647.46|641.5836716292373|
|663.22|665.2640902896344|
|652.75|652.7303909731767|
|683.62|683.6433057472439|
|697.06|695.7017292270027|
|695.72| 702.467458443786|
|692.48|694.8773718683144|
|697.12|695.9064805241514|
|705.37|707.3936327424489|
|711.09|713.2588204143095|
|707.35|704.7880848765475|
|706.13| 705.577748837945|
| 713.0|715.9107970699788|
| 756.1|757.2491007851567|
|763.91| 764.719367917434|
|749.12|750.3966430062403|
|749.29|749.3117739285494|
|764.24|765.3871790046844|
+------+-----------------+
only showing top 20 rows



In [441]:
spark.stop()