In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:

# Get (or reuse) the SparkSession for this notebook, labelled "Spark Streaming".
spark = (
    SparkSession.builder
    .appName("Spark Streaming")
    .getOrCreate()
)

25/07/12 15:27:54 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Shell escape: print the kernel's current working directory.
# NOTE(review): the relative paths used in later cells (./walmart, ./checkpoints,
# ./walmart_write) are presumably resolved against this directory — confirm.
!pwd

/home/diea/Documents/JN_WorkSpace/Spark/Spark/Day6/Lab-6


### Create the schema of the streamed files (check the column names and types from the CSV files)

In [4]:
# Batch-read a single sample file with schema inference so the column names
# and types can be inspected before setting up the streaming reader.
df_sample = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("./walmart/walmart_stock.csv")
)
df_sample.printSchema()


root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [5]:
# Reuse the schema inferred from the sample file; it is passed to the
# streaming reader below, which is given an explicit schema instead of inferring one.
schema = df_sample.schema


### Create the dataframe by reading the stream using format "csv" and the schema you created.

In [6]:
# Streaming CSV source over the ./walmart folder, using the schema captured
# from the sample file and treating the first line of each file as a header.
stream_df = (
    spark.readStream
    .format("csv")
    .schema(schema)
    .option("header", True)
    .load("./walmart")
)


### Make sure the dataframe is streaming the files from the folder

In [7]:
# Sanity check: prints True when the DataFrame is backed by a streaming source.
print(stream_df.isStreaming) 

True


### Create a stream writer into memory and specify the query name "stock"

In [8]:
# Configure (but do not start yet) an in-memory sink exposed as the table "stock".
#
# No explicit checkpointLocation is set on purpose: in append mode the memory
# sink cannot resume from an existing checkpoint, so pinning a fixed directory
# makes a second run of this notebook fail until that directory is deleted.
# Without the option Spark uses a temporary checkpoint directory for this sink.
writeStream_memory = (
    stream_df.writeStream
    .format("memory")
    .queryName("stock")
    .outputMode("append")
)

### Start the write stream and make sure it works (read all columns from the table)

In [9]:
# Keep the StreamingQuery handle so the query can be monitored and stopped
# later instead of being left running anonymously in the session.
stock_query = writeStream_memory.start()
# Block until the files currently in the source folder have been processed,
# so the in-memory table is populated before it is queried.
stock_query.processAllAvailable()
stock_query

25/07/12 15:28:12 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x7aa4ccf5d340>

In [10]:
# Read every column back from the in-memory table to confirm the stream works.
stock_table = spark.sql("SELECT * FROM stock")
stock_table.show()

+----------+------------------+------------------+------------------+------------------+--------+------------------+
|      Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+--------+------------------+
|2012-01-03|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|         59.549999|         58.919998|             59.18| 6679300|51.616215000000004|
|2012-01-10|             59.43|59.709998999999996|             5

### Remove the first row from the data (hint: drop the rows where ALL values are null), then add a new column "diff", which is the difference between the "High" and "Low" columns

In [11]:
# Drop rows in which every column is null, then add "diff" = High - Low.
high_low_spread = col("High") - col("Low")
clean_df = (
    stream_df
    .dropna(how="all")
    .withColumn("diff", high_low_spread)
)

### Create a new write stream using the newly generated dataframe and name the generated table "modified_data"

In [12]:
# Configure (but do not start yet) an in-memory sink for the cleaned stream,
# exposed as the table "modified_data".
#
# No explicit checkpointLocation is set on purpose: in append mode the memory
# sink cannot resume from an existing checkpoint, so pinning a fixed directory
# makes a second run of this notebook fail until that directory is deleted.
# Without the option Spark uses a temporary checkpoint directory for this sink.
writeStream_clean_memory = (
    clean_df.writeStream
    .format("memory")
    .queryName("modified_data")
    .outputMode("append")
)


In [13]:
# Keep the StreamingQuery handle so the query can be monitored and stopped
# later instead of being left running anonymously in the session.
modified_data_query = writeStream_clean_memory.start()
# Block until the files currently in the source folder have been processed,
# so the in-memory table is populated before it is queried.
modified_data_query.processAllAvailable()
modified_data_query

25/07/12 15:28:24 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x7aa496482450>

In [14]:
# Read the cleaned table back, including the derived "diff" column.
modified_table = spark.sql("SELECT * FROM modified_data")
modified_table.show()

+----------+------------------+------------------+------------------+------------------+--------+------------------+-------------------+
|      Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|               diff|
+----------+------------------+------------------+------------------+------------------+--------+------------------+-------------------+
|2012-01-03|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996| 1.1900019999999998|
|2012-01-04|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475| 0.8799969999999959|
|2012-01-05|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|               1.25|
|2012-01-06|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51.45922| 0.5800020000000004|
|2012-01-09|         59.029999|         5

### Write the generated data into files instead of memory.

In [15]:

# Start a query that appends the cleaned stream as CSV files under
# ./walmart_write, with its checkpoint under ./checkpoints/walmart_write/checkpoint.
# Note: despite its name, this variable holds the started query (result of .start()).
writeStream_clean_csv = (
    clean_df.writeStream
    .outputMode("append")
    .format("csv")
    .option("checkpointLocation", "./checkpoints/walmart_write/checkpoint")
    .option("path", "./walmart_write")
    .start()
)


25/07/12 15:28:31 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


### Stop the query. Now, try reading the generated files into a normal dataframe
- Create a schema and use it to read the data.
- Show the output.

In [16]:
# Drain the input that is already available before stopping; stopping right
# after start (e.g. on "Run All") could end the query before any file is written.
writeStream_clean_csv.processAllAvailable()
writeStream_clean_csv.stop()

25/07/12 15:28:36 WARN DAGScheduler: Failed to cancel job group 1008aecf-15b9-48a8-81c7-4ff7b02ae3cb. Cannot find active jobs for it.
25/07/12 15:28:36 WARN DAGScheduler: Failed to cancel job group 1008aecf-15b9-48a8-81c7-4ff7b02ae3cb. Cannot find active jobs for it.


In [17]:
# Build a NEW schema for the written files: the source columns plus "diff".
# StructType.add() appends to the existing object in place, which would also
# alter `schema` (used by the streaming reader) and add a duplicate "diff"
# field every time this cell is re-run.
final_schema = StructType(schema.fields + [StructField("diff", DoubleType(), True)])

# Batch-read the CSV files produced by the streaming query using that schema.
final_df = (
    spark.read.format("csv")
    .schema(final_schema)
    .load("./walmart_write")
)
final_df.show()


+----------+------------------+------------------+------------------+------------------+--------+------------------+-------------------+
|      Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|               diff|
+----------+------------------+------------------+------------------+------------------+--------+------------------+-------------------+
|2012-01-03|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996| 1.1900019999999998|
|2012-01-04|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475| 0.8799969999999959|
|2012-01-05|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|               1.25|
|2012-01-06|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51.45922| 0.5800020000000004|
|2012-01-09|         59.029999|         5

### Sort the dataframe based on the Date

In [18]:
# Display the rows in ascending Date order.
sorted_by_date = final_df.sort("Date")
sorted_by_date.show()

+----------+------------------+------------------+------------------+------------------+--------+------------------+-------------------+
|      Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|               diff|
+----------+------------------+------------------+------------------+------------------+--------+------------------+-------------------+
|2012-01-03|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996| 1.1900019999999998|
|2012-01-04|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475| 0.8799969999999959|
|2012-01-05|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|               1.25|
|2012-01-06|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51.45922| 0.5800020000000004|
|2012-01-09|         59.029999|         5