In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json

# Versions used to build the Maven coordinates below; adjust to the local install.
scala_version = '2.12'  # your scala version
spark_version = '3.5.0'  # your spark version

# Extra JVM packages handed to Spark through "spark.jars.packages".
packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}',
    'org.apache.kafka:kafka-clients:2.8.0',  # your kafka version
]

# Build (or reuse) a local session named "kafka-example" with those packages.
spark = (
    SparkSession.builder
    .master("local")
    .appName("kafka-example")
    .config("spark.jars.packages", ",".join(packages))
    .getOrCreate()
)
spark

In [2]:
# Display the package coordinates that were passed to "spark.jars.packages".
packages

['org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0',
 'org.apache.kafka:kafka-clients:2.8.0']

Training model

In [3]:
# Import libraries
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import count, when, isnull, split, col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
from IPython.display import display, clear_output
from time import sleep
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
# getOrCreate() hands back the already-running session when one exists.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [5]:
# Load the training CSV: first row is the header, column types are inferred.
train = spark.read.csv(
    "./data/NVDA/train.csv",
    header=True,
    inferSchema=True,
)

In [6]:
# Collect the whole training DataFrame to the driver as pandas for display.
train.toPandas()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Close_after_30_days
0,1999-01-22,0.437500,0.488281,0.388021,0.410156,271468800,0.445313
1,1999-01-25,0.442708,0.458333,0.410156,0.453125,51048000,0.459635
2,1999-01-26,0.458333,0.467448,0.411458,0.417969,34320000,0.451823
3,1999-01-27,0.419271,0.429688,0.395833,0.416667,24436800,0.432292
4,1999-01-28,0.416667,0.419271,0.412760,0.415365,22752000,0.424479
...,...,...,...,...,...,...,...
6214,2023-10-03,448.079987,451.299988,432.459991,435.170013,47085000,496.559998
6215,2023-10-04,437.420013,441.429993,432.920013,440.410004,36182100,488.880005
6216,2023-10-05,440.500000,449.000000,438.880005,446.880005,39348300,494.799988
6217,2023-10-06,441.929993,457.890015,440.260010,457.619995,43339700,492.980011


In [7]:
# Keep only the feature source ("Close") and the label ("Close_after_30_days").
df_train = train.select(
    "Close",
    "Close_after_30_days",
)

In [8]:
# Display the two selected columns as a pandas frame.
df_train.toPandas()

Unnamed: 0,Close,Close_after_30_days
0,0.410156,0.445313
1,0.453125,0.459635
2,0.417969,0.451823
3,0.416667,0.432292
4,0.415365,0.424479
...,...,...
6214,435.170013,496.559998
6215,440.410004,488.880005
6216,446.880005,494.799988
6217,457.619995,492.980011


In [9]:
# Convert the "Close" column into a vector column named "Feature", which is the
# column the LinearRegression estimator is configured to read.
feature_col = ["Close"]
assembler = VectorAssembler(inputCols=feature_col, outputCol="Feature")
# Assemble from `train` rather than from `df_train` itself: the select below
# drops "Close", so a self-referential `df_train = f(df_train)` fails when the
# cell is re-run. `train` still carries both needed columns, so the result is
# the same and the cell is idempotent.
df_train = assembler.transform(train).select("Feature", "Close_after_30_days")

In [10]:
# Display the assembled "Feature" vectors next to the label column.
df_train.toPandas()

Unnamed: 0,Feature,Close_after_30_days
0,[0.4101560115814209],0.445313
1,[0.453125],0.459635
2,[0.4179689884185791],0.451823
3,[0.41666701436042786],0.432292
4,[0.4153650104999542],0.424479
...,...,...
6214,[435.1700134277344],496.559998
6215,[440.4100036621094],488.880005
6216,[446.8800048828125],494.799988
6217,[457.6199951171875],492.980011


In [11]:
# Linear regression estimator: "Feature" vector in, "Close_after_30_days" as label.
LR = LinearRegression(
    featuresCol="Feature",
    labelCol="Close_after_30_days",
)

In [12]:
# Fit the regression on the assembled training frame; `model` is reused below
# to score the streaming data.
model = LR.fit(df_train)

Dự đoán

In [13]:
# Kafka connection settings for the input stream.
topic_name = 'NVDAstream'
kafka_server = 'localhost:9092'

# Streaming DataFrame subscribed to the input topic.
kafkaDf = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)
    .option("subscribe", topic_name)
    .load()
)

In [14]:
# Decode each Kafka message value as a string, split it on commas, and pick
# fields 4 and 6 as "Close" and "Close_after_30_days".
# NOTE(review): this presumes the producer sends rows without the leading index
# column seen in train.csv — confirm the field positions against the producer.
df = (
    kafkaDf
    .selectExpr("CAST(value AS STRING)")
    .select(split("value", ",").alias("csv_values"))
    .selectExpr("csv_values[4] as Close", "csv_values[6] as Close_after_30_days")
)

In [15]:
# The split fields are strings; cast both columns to double for the model.
# alias() returns a new DataFrame labelled "copied".
df1 = df.alias("copied")
df1 = (
    df1
    .withColumn("Close", col("Close").cast(DoubleType()))
    .withColumn("Close_after_30_days", col("Close_after_30_days").cast(DoubleType()))
    # A malformed or non-numeric message (e.g. a header line) casts to null, and
    # VectorAssembler with its default handleInvalid="error" raises on null
    # inputs, which would terminate the whole streaming query. Skip such rows.
    .filter(col("Close").isNotNull())
)

In [16]:
# Apply the same assembler used for training to the streaming rows, keeping
# only the "Feature" vector and the label column.
df1 = (
    assembler
    .transform(df1)
    .select("Feature", "Close_after_30_days")
)

In [17]:
# Score the streaming rows with the fitted model; the result carries the
# "prediction" column that is serialised and published further down.
predict = model.transform(df1)

In [18]:
from pyspark.sql.functions import to_json, struct

# The notebook's existing `spark` session is reused as-is. The previous
# `SparkSession.builder.appName("MySparkApplication").getOrCreate()` call here
# was redundant: a session already exists, so getOrCreate() returns it and the
# new application name does not take effect.

# Checkpoint directory for the Kafka sink query.
checkpoint_path = "./checkpoint_NVDA"

# Convert each row into a JSON string stored in the "value" column, which is
# the column the Kafka sink publishes as the message payload.
json_df = predict.select(to_json(struct("prediction")).alias("value"))

# Send the predictions to the "NVDA" topic. The broker address comes from
# `kafka_server`, so source and sink cannot drift apart.
kafka_df = (
    json_df.writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)
    .option("topic", "NVDA")
    .option("checkpointLocation", checkpoint_path)
    .start()
)

In [19]:
# Configure a second, in-memory sink named "streaming_query" in append mode.
query = (
    json_df.writeStream
    .format("memory")
    .outputMode("append")
    .queryName("streaming_query")
)
# start() launches the stream; the returned handle is what the cells below
# query by name and eventually stop.
query2 = query.start()

In [20]:
# Live view: redraw the contents of the in-memory sink table until the user
# interrupts, the refresh budget runs out, or the streaming query terminates.
REFRESH_SECONDS = 5    # pause between redraws; also drives the elapsed counter
MAX_REFRESHES = 2000   # upper bound on the number of redraws

for x in range(MAX_REFRESHES):
    try:
        print(f"Showing live view refreshed every {REFRESH_SECONDS} seconds")
        print(f"Seconds passed: {x * REFRESH_SECONDS}")
        result2 = spark.sql(f"SELECT * from {query2.name}")
        display(result2.toPandas())
        if not query2.isActive:
            # The query has stopped or failed, so the table will never change
            # again; end the loop instead of redrawing stale data.
            print("Streaming query is no longer active; last error:", query2.exception())
            break
        sleep(REFRESH_SECONDS)
        # wait=True defers clearing until the next output arrives (less flicker).
        clear_output(wait=True)
    except KeyboardInterrupt:
        # Interrupting the kernel is the intended way to leave the live view.
        print("break")
        break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 185


Unnamed: 0,value
0,"{""prediction"":478.7638587324418}"
1,"{""prediction"":489.29090290870846}"
2,"{""prediction"":490.7425661133025}"
3,"{""prediction"":475.2443690507849}"
4,"{""prediction"":481.8655825125044}"
5,"{""prediction"":459.3389235376592}"
6,"{""prediction"":441.1463148878144}"
7,"{""prediction"":440.15420054918695}"
8,"{""prediction"":432.69751902640604}"
9,"{""prediction"":449.2818181940907}"


break
Live view ended...


In [21]:
# Stop the in-memory live-view query.
# NOTE(review): the Kafka sink query held in `kafka_df` is started earlier and
# is not stopped here — confirm whether it should be stopped as well.
query2.stop()