In [17]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json

# Versions that make up the Maven coordinate of Spark's Kafka connector.
scala_version = '2.12'   # Scala build the Spark distribution was compiled for
spark_version = '3.5.0'  # Spark release in use

# Extra JVM packages to be handed to Spark through spark.jars.packages.
# NOTE(review): kafka-clients is pinned separately from the connector —
# confirm 2.8.0 is compatible with the connector version above.
packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}',
    'org.apache.kafka:kafka-clients:2.8.0',
]

# Create (or reuse) a local SparkSession configured with those packages.
spark = (
    SparkSession.builder
    .master("local")
    .appName("kafka-example")
    .config("spark.jars.packages", ",".join(packages))
    .getOrCreate()
)

# Last expression of the cell: renders the session summary in the notebook.
spark

In [18]:
# Display the package coordinates that were passed to spark.jars.packages.
packages

['org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0',
 'org.apache.kafka:kafka-clients:2.8.0']

In [19]:
# Kafka connection settings.
topic_name = 'text3'
kafka_server = 'localhost:9092'

# Batch read (spark.read, not readStream) of the topic, beginning at the
# earliest available offset.
kafkaDf = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)
    .option("subscribe", topic_name)
    .option("startingOffsets", "earliest")
    .load()
)

In [20]:
# Preview the Kafka records as read; `value` has not been decoded to text yet.
kafkaDf.show()

+----+--------------------+-----+---------+------+--------------------+-------------+
| key|               value|topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|NULL|[31 39 39 39 2D 3...|text3|        0|     0|2024-01-20 23:11:...|            0|
|NULL|[31 39 39 39 2D 3...|text3|        0|     1|2024-01-20 23:11:...|            0|
|NULL|[31 39 39 39 2D 3...|text3|        0|     2|2024-01-20 23:11:...|            0|
|NULL|[31 39 39 39 2D 3...|text3|        0|     3|2024-01-20 23:11:...|            0|
|NULL|[31 39 39 39 2D 3...|text3|        0|     4|2024-01-20 23:11:...|            0|
|NULL|[31 39 39 39 2D 3...|text3|        0|     5|2024-01-20 23:11:...|            0|
|NULL|[31 39 39 39 2D 3...|text3|        0|     6|2024-01-20 23:11:...|            0|
|NULL|[31 39 39 39 2D 3...|text3|        0|     7|2024-01-20 23:11:...|            0|
|NULL|[31 39 39 39 2D 3...|text3|        0|     8|2024

In [21]:
# Number of records currently readable from the topic.
kafkaDf.count()

1570

In [22]:
from pyspark.ml.regression import LinearRegression
import numpy as np
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from time import sleep
from IPython.display import display, clear_output
from pyspark.sql import functions as F
from pyspark.sql.functions import from_json, split, current_date, year, col, when
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

In [23]:
# Load the training set; the first line of the file is used as column names.
trainDF = spark.read.option("header", True).csv("./data/train.csv")

In [24]:
# Preview the training rows as loaded from the CSV.
trainDF.show()

+----------+------------------+------------------+------------------+------------------+------------------+---------+
|      Date|              Open|              High|               Low|             Close|         Adj Close|   Volume|
+----------+------------------+------------------+------------------+------------------+------------------+---------+
|2000-12-21|1.3098959922790527|1.4895830154418943|1.1458330154418943|1.1614580154418943| 1.065457820892334| 84525600|
|2008-03-25|  5.03000020980835| 5.175000190734863| 4.982500076293945| 5.079999923706055| 4.660114288330078| 73538800|
|2018-03-08| 60.73749923706055| 60.73749923706055| 59.84749984741211| 60.29499816894531|      59.697265625| 41191200|
|2019-02-21| 39.76499938964844| 40.01250076293945| 38.79499816894531|38.942501068115234| 38.64001083374024| 44854800|
|2006-07-07|3.3550000190734863|3.3616669178009038| 3.191667079925537| 3.228332996368408|2.9614956378936768| 54123000|
|2004-04-12| 2.245832920074463|  2.25583291053772| 2.173

In [25]:
# Inspect the column types as loaded from the CSV (before any casting).
trainDF.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: string (nullable = true)



In [26]:
# Give every column its proper type: Date -> date, price columns -> double,
# Volume -> integer.
trainDF = trainDF.withColumn("Date", col("Date").cast("date"))

for price_column in ["Open", "High", "Low", "Close", "Adj Close"]:
    trainDF = trainDF.withColumn(price_column, col(price_column).cast(DoubleType()))

# NOTE(review): IntegerType is 32-bit — confirm no Volume value exceeds
# 2,147,483,647, otherwise a wider type is needed.
trainDF = trainDF.withColumn("Volume", col("Volume").cast(IntegerType()))

In [27]:
# Confirm the column types after casting.
trainDF.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



In [28]:
# Pack the input columns Open, High, Low and Volume into one vector column
# named "Features".
featureAssembler = VectorAssembler(
    inputCols=["Open", "High", "Low", "Volume"],
    outputCol="Features",
)
output = featureAssembler.transform(trainDF)

# Min-max scaler that rescales "Features" into the range [-1, 1] and writes
# the result to "ScaledFeatures".
scaler = MinMaxScaler(
    inputCol="Features",
    outputCol="ScaledFeatures",
    min=-1,
    max=1,
)

# Fit the scaler on the training data, then apply the fitted model to it.
scalerModel = scaler.fit(output)
scaledOutput = scalerModel.transform(output)

# Training frame: the date, the scaled feature vector and the label "Close".
finalData = scaledOutput.select("Date", "ScaledFeatures", "Close")

# Fit a linear regression that predicts "Close" from the scaled features.
LR = LinearRegression(
    featuresCol='ScaledFeatures',
    labelCol='Close',
)
LRModel = LR.fit(finalData)

In [32]:
# Live prediction view over the Kafka topic.
#
# DataFrame transformations are lazy: nothing is read or computed until
# result.show() runs. The chain is therefore loop-invariant and is built once
# here instead of on every refresh; each show() inside the loop still
# re-executes the query on top of kafkaDf, exactly as before.

# Decode the message payload to text and split it into named fields.
# Assumes each message is "Date,Open,High,Low,Adj Close,Volume" — the layout is
# taken from the index mapping below; verify against the producer.
df = (
    kafkaDf
    .selectExpr("CAST(value AS STRING)")
    .select(split("value", ",").alias("csv_values"))
    .selectExpr(
        "csv_values[0] as Date", "csv_values[1] as Open",
        "csv_values[2] as High", "csv_values[3] as Low",
        "csv_values[4] as Adj_close", "csv_values[5] as Volume",
    )
)

# Cast the text fields to the same types used for the training data.
# NOTE(review): a field that cannot be cast presumably becomes null, and
# VectorAssembler rejects nulls by default — confirm the producer never sends
# malformed rows (e.g. a CSV header line), or filter them out here.
df1 = (
    df
    .withColumn("Date", col("Date").cast("date"))
    .withColumn("Open", col("Open").cast(DoubleType()))
    .withColumn("High", col("High").cast(DoubleType()))
    .withColumn("Low", col("Low").cast(DoubleType()))
    .withColumn("Adj_close", col("Adj_close").cast(DoubleType()))
    .withColumn("Volume", col("Volume").cast(IntegerType()))
)

# Build the "Features" vector with the assembler used for training.
output_test = featureAssembler.transform(df1)

# Scale with the scaler fitted on the training data (it is not refitted here).
scaledOutput_test = scalerModel.transform(output_test)

# Prediction input: only the date and the scaled feature vector (no label).
finalData_test = scaledOutput_test.select("Date", "ScaledFeatures")

# Score the rows and expose the model output under the name "Close".
LRPredictions = LRModel.transform(finalData_test)
result = LRPredictions.select("Date", "prediction").withColumnRenamed("prediction", "Close")

# Refresh loop: print a header, show the first predicted rows, wait 5 seconds,
# then clear the cell output. Interrupting the kernel ends the loop cleanly.
for x in range(0, 2000):
    try:
        print("Showing live view refreshed every 5 seconds")
        # Nominal elapsed time: counts only the 5-second sleeps, not query time.
        print(f"Seconds passed: {x*5}")

        result.show()
        sleep(5)
        # wait=True postpones the clear until the next output arrives.
        clear_output(wait=True)

    except KeyboardInterrupt:
        print("break")
        break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 10
+----------+-------------------+
|      Date|              Close|
+----------+-------------------+
|1999-01-22| 0.4379495649727403|
|1999-01-25|0.43024622753577546|
|1999-02-02|0.37691723732868354|
|1999-02-04| 0.4052494454867315|
|1999-02-17|0.41638425459424866|
|1999-02-18|  0.418623667698256|
|1999-02-25| 0.4934644793627285|
|1999-03-02| 0.4545102584355618|
|1999-03-12|  0.422125571173126|
|1999-03-23| 0.3985540484536614|
|1999-04-01| 0.4272994656195124|
|1999-04-05| 0.4244203235649593|
|1999-04-21| 0.3951216926840857|
|1999-04-22| 0.3795331779069784|
|1999-04-27| 0.3730052702600233|
|1999-04-28| 0.3684417164308229|
|1999-05-07| 0.3673155497137941|
|1999-05-10|0.37344423599279253|
|1999-06-02|0.34831890875523186|
|1999-06-09|0.38337079394446505|
+----------+-------------------+
only showing top 20 rows

break
Live view ended...
