In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StringType, IntegerType, TimestampType

# Get the active Spark session, or create one with the app name "my_spark".
spark = (
    SparkSession.builder
    .appName("my_spark")
    .getOrCreate()
)

def console_output(df, freq):
    """Start a streaming query that prints ``df`` to the console.

    Used in this notebook in place of ``show()``: the stream is written to
    the "console" sink with ``truncate=False``.

    Args:
        df: Object exposing ``writeStream`` (a streaming DataFrame here).
        freq: Trigger interval in seconds; passed as the ``processingTime``
            trigger in the form ``"<freq> seconds"``.

    Returns:
        Whatever ``start()`` returns (the query handle); the notebook calls
        ``stop()`` on it to end the stream.
    """
    console_writer = df.writeStream.format("console")
    interval = f'{freq} seconds'
    console_writer = console_writer.trigger(processingTime=interval)
    console_writer = console_writer.options(truncate=False)
    return console_writer.start()

###### RATE SOURCE
# Streaming DataFrame backed by the built-in "rate" source (default options).
raw_rate = (
    spark.readStream
    .format("rate")
    .load()
)


In [2]:
# Print the schema of the rate stream (the output below lists `timestamp` and `value`).
raw_rate.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



In [5]:
# Print the raw rate stream to the console with a 5-second trigger;
# keep the returned handle in `out` so the query can be stopped later.
out = console_output(raw_rate, 5)

22/12/02 13:49:15 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-487e09a9-2146-4b3a-b3d2-2260e48945fb. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/12/02 13:49:15 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+-----+
|timestamp|value|
+---------+-----+
+---------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------------------+-----+
|timestamp              |value|
+-----------------------+-----+
|2022-12-02 13:49:15.762|0    |
|2022-12-02 13:49:17.762|2    |
|2022-12-02 13:49:16.762|1    |
|2022-12-02 13:49:18.762|3    |
+-----------------------+-----+



In [6]:
# Stop the query held in `out` (started above for `raw_rate`).
out.stop()

In [7]:
# Add a custom filter: keep only the rows whose `value` is even.
filtered_rate = raw_rate.where(
    (F.col("value") % F.lit(2)) == 0
)

# Print the filtered stream to the console with a 5-second trigger.
out = console_output(filtered_rate, 5)

22/12/02 13:51:08 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-4382a165-6345-4f38-be38-ede37825e2aa. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/12/02 13:51:08 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
-------------------------------------------
Batch: 0
-------------------------------------------
+---------+-----+
|timestamp|value|
+---------+-----+
+---------+-----+

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------------------+-----+
|timestamp              |value|
+-----------------------+-----+
|2022-12-02 13:51:08.851|0    |
+-----------------------+-----+

-------------------------------------------

In [9]:
# Stop the query held in `out` (started above for `filtered_rate`).
out.stop()

In [10]:
# Add a custom column: "jubilee" when `value` is a multiple of 10, "not yet" otherwise.
extra_rate = filtered_rate.withColumn(
    "my_value",
    F.when((F.col("value") % F.lit(10)) == 0, F.lit("jubilee"))
    .otherwise(F.lit("not yet")),
)

# Print the extended stream to the console with a 5-second trigger.
out = console_output(extra_rate, 5)

22/12/02 13:51:36 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-25ea16ce-c94a-4a1e-8113-caad192b9608. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/12/02 13:51:36 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
-------------------------------------------
Batch: 0
-------------------------------------------
+---------+-----+--------+
|timestamp|value|my_value|
+---------+-----+--------+
+---------+-----+--------+

-------------------------------------------
Batch: 1
-------------------------------------------
+----------------------+-----+--------+
|timestamp             |value|my_value|
+----------------------+-----+--------+
|2022-12-02 13:51:36.47|0    |jubilee |
|2022-12

In [11]:
# Stop the query held in `out` (started above for `extra_rate`).
out.stop()

In [32]:
def killAll():
    """Stop every streaming query listed in ``spark.streams.active``.

    Useful when the variable holding a query handle has been lost: the
    queries are taken from the Spark session itself rather than from
    notebook variables. Prints one line per query before stopping it.
    """
    active_queries = spark.streams.active
    for query in active_queries:
        print(f"Stopping {query} for killAll")
        query.stop()
        
# Run it now to stop any streaming queries still active in this session.
killAll()

In [43]:
###### FILE SOURCE
# Explicit schema for the CSV stream: two string columns.
schema = (
    StructType()
    .add("column_1", StringType())
    .add("column_2", StringType())
)

# NOTE: the `path` option must point to a directory, not to a single file!
raw_files = (
    spark.readStream
    .format("csv")
    .schema(schema)
    .options(path="input_csv_for_stream", header=True)
    .load()
)

In [44]:
# Print the CSV file stream to the console with a 5-second trigger;
# keep the returned handle in `out` so the query can be stopped later.
out = console_output(raw_files, 5)

22/12/02 14:03:12 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-5b14b7e6-00a2-4518-b1e2-0d6877be164a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/12/02 14:03:12 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
-------------------------------------------
Batch: 0
-------------------------------------------
+--------+--------+
|column_1|column_2|
+--------+--------+
|a       |2       |
|b       |4       |
|c       |8       |
+--------+--------+



In [45]:
# Stop the query held in `out` (started above for `raw_files`).
out.stop()

In [46]:
# One at a time: same CSV source, but with maxFilesPerTrigger=1.
raw_files = (
    spark.readStream
    .format("csv")
    .schema(schema)
    .options(path="input_csv_for_stream", header=True, maxFilesPerTrigger=1)
    .load()
)

# Print the stream to the console with a 5-second trigger.
out = console_output(raw_files, 5)

22/12/02 14:03:47 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-bcddbd05-286c-4831-b875-5329cef50acb. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/12/02 14:03:47 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
-------------------------------------------
Batch: 0
-------------------------------------------
+--------+--------+
|column_1|column_2|
+--------+--------+
|a       |2       |
|b       |4       |
|c       |8       |
+--------+--------+



In [47]:
# Stop the query held in `out` (started above for the one-file-per-trigger `raw_files`).
out.stop()

In [55]:
# Add our own columns: the length of `column_2` and exp(`column_2`).
extra_files = (
    raw_files
    .withColumn("column_3", F.length(F.col("column_2")))
    .withColumn("column_4", F.exp("column_2"))
)

# Print the extended file stream to the console with a 5-second trigger.
out = console_output(extra_files, 5)

22/12/02 14:08:37 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-c30fb624-8c19-423f-b188-ff26911ea648. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/12/02 14:08:37 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
-------------------------------------------
Batch: 0
-------------------------------------------
+--------+--------+--------+------------------+
|column_1|column_2|column_3|column_4          |
+--------+--------+--------+------------------+
|a       |2       |1       |7.38905609893065  |
|b       |4       |1       |54.598150033144236|
|c       |8       |1       |2980.9579870417283|
+--------+--------+--------+------------------+



In [56]:
# Stop the query held in `out` (started above for `extra_files`).
out.stop()