In [None]:
# NOTE(review): the PyPI package named `spark` (0.2.1 in the captured log) appears to be
# unrelated to Apache Spark; the notebook itself only imports `pyspark` and `findspark`,
# so this install is presumably unnecessary — confirm before re-enabling.
#! pip install spark 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spark
  Downloading spark-0.2.1.tar.gz (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/41.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: spark
  Building wheel for spark (setup.py) ... [?25l[?25hdone
  Created wheel for spark: filename=spark-0.2.1-py3-none-any.whl size=58760 sha256=7cd1fd8f887959b94acce44fbf2824fb439a37e8f8a1900738991ad6b94f158c
  Stored in directory: /root/.cache/pip/wheels/63/88/77/b4131110ea4094540f7b47c6d62a649807d7e94800da5eab0b
Successfully built spark
Installing collected packages: spark
Successfully installed spark-0.2.1


In [None]:
#%rm -rf /content/InputStream

In [None]:
#! pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=d9ceae8b5481535eaaa36c6511a0fe16938801ba348cb7accf1d892aee5207fe
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [None]:
#! pip install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
import findspark 
# NOTE(review): numpy and pandas are not referenced in the cells visible here.
import numpy 
import pandas
# findspark.init() is deliberately called before the pyspark imports below;
# presumably it makes the Spark installation importable — keep this ordering.
findspark.init()
import pyspark
from  pyspark.sql import SparkSession 
import pyspark.sql.functions as F # Spark SQL column/aggregation functions (min, max, ...), accessed as F.<name>

In [None]:
from IPython.display import HTML, display

# Inject a CSS rule so <pre> output (e.g. wide DataFrame tables printed by
# .show()) keeps its whitespace as-is instead of being wrapped by the notebook.
display(
    HTML("<style>pre { white-space: pre !important; }</style>")
)

In [None]:
# Get the active SparkSession (or create one with default settings) and keep a
# handle to its underlying SparkContext.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
sc = spark.sparkContext

### Create the schema of the streamed files (check the column names and types from the CSV files)

In [None]:
from pyspark.sql.types import (
    DateType,
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Schema of the streamed CSV files: one nullable field per column, listed in
# the same order as the columns appear in the files.
my_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('ID', DoubleType()),
        ('Date', DateType()),
        ('Open', DoubleType()),
        ('High', DoubleType()),
        ('Low', DoubleType()),
        ('Close', DoubleType()),
        ('dj Close', DoubleType()),
        ('Volume', IntegerType()),
    )
])

### Create the dataframe by reading the stream using format "csv" and the schema you created.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Extract the sample CSV files from the uploaded archive into an InputStream/
# folder (relative to the working directory). `-u` only extracts files that are
# new or newer than the copies already on disk, so re-running the cell is safe.
! unzip -u "/content/InputStream.zip"

Archive:  /content/InputStream.zip
  inflating: InputStream/KOSPI_STOCK_0.csv  
  inflating: InputStream/KOSPI_STOCK_1.csv  
  inflating: InputStream/KOSPI_STOCK_2.csv  
  inflating: InputStream/KOSPI_STOCK_3.csv  


In [None]:
# Streaming DataFrame over every CSV file in the input folder.
# The explicit schema is passed through .schema(); the previous
# option("inferSchema", "true") was removed because inference is not applied
# when a schema is supplied, so the option was contradictory and misleading.
# header=true tells the reader that each file starts with a header line.
df = (
    spark.readStream
    .format('csv')
    .schema(my_schema)
    .option("header", "true")
    .load('/content/InputStream/*.csv')
)

### Make sure the dataframe is streaming the files from the folder

In [None]:
# Printing the schema alone does not show whether `df` is a streaming
# DataFrame, so check the isStreaming flag explicitly first.
assert df.isStreaming, "df is expected to be a streaming DataFrame (spark.readStream)"
df.printSchema()

root
 |-- ID: double (nullable = true)
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- dj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



### Create a stream writer into memory and specify the query name "stock":

In [None]:
# Stream writer for the memory sink: each micro-batch is appended to an
# in-memory table that can be queried through spark.sql under the query name.
# queryName() is the documented way to name the query; the lower-case
# "queryname" option key only works where option keys are matched
# case-insensitively. The "truncate"/"numRows" options were dropped because
# they configure the console sink and are not read by the memory sink.
writer = (
    df.writeStream
    .outputMode("append")
    .format("memory")
    .queryName("stock")
)


In [None]:
# Start the streaming query that fills the in-memory "stock" table.
query2 = writer.start()
# start() returns immediately while the first micro-batch runs in the
# background. Block until the files currently available have been processed,
# so the following cells (which stop the query and read the table) see the
# data instead of racing the stream on a Restart & Run All.
query2.processAllAvailable()

In [None]:
# Stop the streaming query; the "stock" table is queried by the cells that follow.
query2.stop()

### Start the write stream and make sure it works (read all columns from the table)

In [None]:
# show(0) prints only the column header of the "stock" table (no rows): a quick
# check that the table exists and has the expected columns.
spark.sql('SELECT * FROM stock ').show(0)

+---+----+----+----+---+-----+--------+------+
| ID|Date|Open|High|Low|Close|dj Close|Volume|
+---+----+----+----+---+-----+--------+------+
+---+----+----+----+---+-----+--------+------+
only showing top 0 rows



In [None]:
# Print up to 20 rows of the "stock" table to confirm data has been streamed in.
spark.sql('SELECT * FROM stock ').show(20)

+-----+----------+------------+------------+------------+------------+------------+------+
|   ID|      Date|        Open|        High|         Low|       Close|    dj Close|Volume|
+-----+----------+------------+------------+------------+------------+------------+------+
|120.0|2000-06-20|22817.900391|23102.199219|21680.599609|22320.300781|21092.632813| 34466|
|121.0|2000-06-21|21893.800781|22675.699219|21680.599609|22675.699219|21428.484375| 68651|
|122.0|2000-06-22|23386.599609|23386.599609|     22462.5|23031.099609|21764.335938| 97209|
|123.0|2000-06-23|22107.099609|24097.400391|22107.099609|     22889.0|21630.052734|199483|
|124.0|2000-06-26|23102.199219|     24168.5|22569.099609|24026.300781|22704.796875|121969|
|125.0|2000-06-27|24026.300781|25519.099609|     23742.0|24026.300781|22704.796875|113809|
|126.0|2000-06-28|23884.199219|24666.099609|23884.199219|24666.099609|23309.408203| 86236|
|127.0|2000-06-29|25234.699219|25234.699219|23919.699219|24239.599609|22906.365234| 45299|

### Remove the first row from the data (hint: drop the rows where ALL values are null), then add a new column "diff", which is the difference between high and low columns

In [None]:
#df_spark = spark.sql("select * from stock")

In [None]:
# Discard rows in which every column is null; rows holding at least one value
# are kept.
df_spark = df.na.drop(how="all")

In [None]:
from pyspark.sql.functions import col

# Add "diff" = High - Low. The column is referenced with the exact case used in
# the schema ("Low", not "LOW"), so the expression no longer depends on Spark's
# case-insensitive column resolution (spark.sql.caseSensitive=false).
df_spark = df_spark.withColumn("diff", col("High") - col("Low"))

In [None]:
# Print the schema of df_spark; the derived "diff" column should be listed as
# the last field.
df_spark.printSchema()

root
 |-- ID: double (nullable = true)
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- dj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- diff: double (nullable = true)



In [None]:
#df_spark.select("high","low","diff").show(3)

### Create a new write stream using the newly generated dataframe and name the generated table "modified_data"

In [None]:
# Stream writer for the memory sink holding the transformed data (null rows
# removed, "diff" column added); queryable as the table "modified_data".
# queryName() is the documented way to name the query; the lower-case
# "queryname" option key only works where option keys are matched
# case-insensitively. The "truncate"/"numRows" options were dropped because
# they configure the console sink and are not read by the memory sink.
writer2 = (
    df_spark.writeStream
    .outputMode("append")
    .format("memory")
    .queryName("modified_data")
)


In [None]:
# Start the streaming query that fills the in-memory "modified_data" table.
query_3 = writer2.start()
# start() returns immediately; wait until the files currently available have
# been processed so the table is populated when the next cells query it.
query_3.processAllAvailable()

In [None]:
# show(0) prints only the column header of "modified_data" (no rows): a quick
# check that the table exists and includes the "diff" column.
spark.sql("select * from modified_data").show(0)

+---+----+----+----+---+-----+--------+------+----+
| ID|Date|Open|High|Low|Close|dj Close|Volume|diff|
+---+----+----+----+---+-----+--------+------+----+
+---+----+----+----+---+-----+--------+------+----+
only showing top 0 rows



In [None]:
# Print up to 20 rows of "modified_data" to inspect the computed "diff" values.
spark.sql("select * from modified_data").show(20)

+-----+----------+------------+------------+------------+------------+------------+------+------------------+
|   ID|      Date|        Open|        High|         Low|       Close|    dj Close|Volume|              diff|
+-----+----------+------------+------------+------------+------------+------------+------+------------------+
|120.0|2000-06-20|22817.900391|23102.199219|21680.599609|22320.300781|21092.632813| 34466|1421.5996099999975|
|121.0|2000-06-21|21893.800781|22675.699219|21680.599609|22675.699219|21428.484375| 68651| 995.0996099999975|
|122.0|2000-06-22|23386.599609|23386.599609|     22462.5|23031.099609|21764.335938| 97209| 924.0996090000008|
|123.0|2000-06-23|22107.099609|24097.400391|22107.099609|     22889.0|21630.052734|199483|1990.3007819999984|
|124.0|2000-06-26|23102.199219|     24168.5|22569.099609|24026.300781|22704.796875|121969|1599.4003909999992|
|125.0|2000-06-27|24026.300781|25519.099609|     23742.0|24026.300781|22704.796875|113809|1777.0996090000008|
|126.0|200

### Write the generated data into files instead of the memory. 

In [None]:
# File sink: append each micro-batch of df_spark as CSV files (with a header
# row) under /content/outpath/, recording progress in the "chk1" checkpoint
# location.
writer_folder = (
    df_spark.writeStream
    .outputMode("append")
    .format("csv")
    .option("header", "true")
    .option("checkpointLocation", "chk1")
    .option("path", "/content/outpath/")
)


In [None]:
#%rm -rf /content/outpath

In [None]:
# Start the streaming query that writes the transformed data to CSV files.
quer_folder = writer_folder.start()
# start() returns immediately; wait until the files currently available have
# been processed so the CSV output exists before the query is stopped and the
# folder is read back (otherwise a Restart & Run All can stop it too early).
quer_folder.processAllAvailable()

### Stop the query. Now, try reading the generated files into a normal dataframe
- Create a schema and use it to read the data.
- Show the output.

In [None]:
# Stop the file-writing query before its CSV output is read back as a static
# (non-streaming) DataFrame.
quer_folder.stop()

In [None]:
# The task asks for an explicit schema rather than inference. The written files
# contain the streamed columns followed by the derived "diff" column, so the
# output schema is the input schema plus one nullable double field.
output_schema = StructType(my_schema.fields + [StructField('diff', DoubleType(), True)])

# Batch-read every CSV file produced by the streaming file sink.
data_from_folder = (
    spark.read
    .format('csv')
    .schema(output_schema)
    .option("header", "true")
    .load('/content/outpath/*.csv')
)

In [None]:
# Preview the first 3 rows read back from the CSV output.
data_from_folder.show(3)

+-----+----------+------------+------------+------------+------------+------------+------+------------------+
|   ID|      Date|        Open|        High|         Low|       Close|    dj Close|Volume|              diff|
+-----+----------+------------+------------+------------+------------+------------+------+------------------+
|120.0|2000-06-20|22817.900391|23102.199219|21680.599609|22320.300781|21092.632813| 34466|1421.5996099999975|
|121.0|2000-06-21|21893.800781|22675.699219|21680.599609|22675.699219|21428.484375| 68651| 995.0996099999975|
|122.0|2000-06-22|23386.599609|23386.599609|     22462.5|23031.099609|21764.335938| 97209| 924.0996090000008|
+-----+----------+------------+------------+------------+------------+------------+------+------------------+
only showing top 3 rows



In [None]:
# Print the schema of the DataFrame read back from the CSV files, to compare
# against the schema of the streamed data.
data_from_folder.printSchema()

root
 |-- ID: double (nullable = true)
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- dj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- diff: double (nullable = true)



### Sort the dataframe based on the ID

In [None]:
# Order the rows read back from disk by ascending ID, then print the first rows.
finalDFSorted = data_from_folder.orderBy('ID')
finalDFSorted.show()

+----+----------+------------+------------+------------+------------+------------+------+------------------+
|  ID|      Date|        Open|        High|         Low|       Close|    dj Close|Volume|              diff|
+----+----------+------------+------------+------------+------------+------------+------+------------------+
| 0.0|2000-01-04|22817.900391|25696.800781|22817.900391|24879.300781|23510.880859|108745|2878.9003900000025|
| 1.0|2000-01-05|24523.900391|26229.900391|23670.900391|24417.300781|23074.294922|175990|            2559.0|
| 2.0|2000-01-06|24381.699219|24666.099609|22746.800781|22817.900391|21562.865234| 71746| 1919.298827999999|
| 3.0|2000-01-07|     22036.0|24879.300781|     22036.0|23884.199219|22570.513672|120984|2843.3007810000017|
| 4.0|2000-01-10|24879.300781|25519.099609|23813.099609|24061.900391|22738.439453|151371|            1706.0|
| 5.0|2000-01-11|     24168.5|     25021.5|23955.199219|24239.599609|22906.365234| 95943|1066.3007810000017|
| 6.0|2000-01-12|  