In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Reuse the active Spark session if one exists, otherwise create it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# read from socket and write to console

In [3]:
# Streaming DataFrame fed by a TCP socket at localhost:12345.
df = (
    spark.readStream
    .format('socket')
    .option('port', 12345)
    .option('host', 'localhost')
    .load()
)

In [4]:
# Configure a console sink in append mode; the stream is not started here.
writer = (
    df.writeStream
    .format('console')
    .outputMode('append')
)

In [5]:
# Start the stream, then wait up to 10 seconds. The cell's displayed value is
# the result of awaitTermination.
query = writer.start()
query.awaitTermination(10)

False

In [6]:
# Stop the streaming query held in `query` (socket -> console).
query.stop()

In [8]:
# Streaming DataFrame of raw input for the word count, read from the
# socket at localhost:12345.
df_lines = (
    spark.readStream
    .format('socket')
    .option('port', 12345)
    .option('host', 'localhost')
    .load()
)

In [9]:
from pyspark.sql.functions import explode, split, col, count 

In [12]:
# Split the `value` column on single spaces into an array column
# named 'Lines_Split'.
df_lines_split = df_lines.select(
    split(col('value'), ' ').alias('Lines_Split')
)

In [13]:
# Flatten the 'Lines_Split' arrays: one output row per array element,
# in a column named 'Words'.
df_words = df_lines_split.select(
    explode(col('Lines_Split')).alias('Words')
)

In [14]:
# Count how many times each word occurs.
# The alias is applied to the aggregate column *inside* agg() so the output
# column is named 'WordCounts'. Calling .alias() on the result of agg() would
# only alias the DataFrame and leave the column with its auto-generated name.
df_WordCounts = df_words.groupBy('Words').agg(
    count('Words').alias('WordCounts')
)

In [15]:
# Console sink for the running word counts: complete output mode, a
# 1-second processing-time trigger, and checkpoints under 'chkpoint_1'.
writer = (
    df_WordCounts.writeStream
    .outputMode('complete')
    .format('console')
    .trigger(processingTime='1 second')
    .option('checkpointLocation', 'chkpoint_1')
)

In [16]:
# Start the word-count stream and wait up to 10 seconds; the cell displays
# the result of awaitTermination.
query = writer.start()
query.awaitTermination(10)

False

In [17]:
# Stop the streaming query held in `query` (word count -> console).
query.stop()

# read from text file and write to console

In [43]:
# Streaming DataFrame over the text files in the course data directory.
df = (
    spark.readStream
    .format('text')
    .load('/Users/Ahmed Eltabakh/Spark-Course/data/text')
)

In [44]:
# Console sink in append mode with truncation disabled and numRows set to 10.
writer = (
    df.writeStream
    .format('console')
    .outputMode('append')
    .option('numRows', 10)
    .option('truncate', False)
)

In [45]:
# Start the text-file stream and wait up to 10 seconds; the cell displays
# the result of awaitTermination.
query = writer.start()
query.awaitTermination(10)

False

In [46]:
# Stop the streaming query held in `query` (text files -> console).
query.stop()

# read from csv and write to console

In [49]:
from pyspark.sql.types import *

In [50]:
# Schema for the CSV stream: a nullable string "Date" column followed by
# nullable double columns, in file column order.
schema = StructType(
    [StructField("Date", StringType(), True)]
    + [
        StructField(column_name, DoubleType(), True)
        for column_name in (
            "Open",
            "High",
            "Low",
            "Close",
            "Adjusted Close",
            "Volume",
        )
    ]
)


In [53]:
# Streaming DataFrame over the CSV directory, parsed with the explicit schema.
df = (
    spark.readStream
    .schema(schema)
    .format("csv")
    .load("/Users/Ahmed Eltabakh/Spark-Course/data/csv")
)

In [54]:
# Console sink in append mode with truncation disabled and numRows set to 10.
writer = (
    df.writeStream
    .format('console')
    .outputMode('append')
    .option('numRows', 10)
    .option('truncate', False)
)

In [55]:
# Start the CSV-to-console stream and wait up to 10 seconds; the cell
# displays the result of awaitTermination.
query = writer.start()
query.awaitTermination(10)

False

In [56]:
# Stop the streaming query held in `query` (CSV -> console).
query.stop()

# read from socket and write to directory

In [58]:
# Streaming DataFrame fed by a TCP socket at localhost:12345.
df = (
    spark.readStream
    .format('socket')
    .option('port', 12345)
    .option('host', 'localhost')
    .load()
)

In [59]:
# Text-file sink in append mode: output goes to the course `data/out`
# directory, checkpoints to 'chkpnt_4'.
writer = (
    df.writeStream
    .format('text')
    .outputMode('append')
    .option('checkpointLocation', 'chkpnt_4')
    .option('path', "/Users/Ahmed Eltabakh/Spark-Course/data/out")
)

In [60]:
# Start the socket-to-text-file stream and let it run for up to 10 seconds.
query = writer.start()
try:
    terminated = query.awaitTermination(timeout=10)
finally:
    # This section has no separate stop cell, so stop the query here.
    # Otherwise it keeps running in the background while the next section
    # starts another file-sink query that writes to the same output path.
    query.stop()
# Display whether the query had already terminated when the wait ended.
terminated

False

# read from csv files and write to csv files

In [91]:
from pyspark.sql.types import *

In [92]:
# Schema for the CSV stream: a nullable string "Date" column followed by
# nullable double columns, in file column order.
schema = StructType(
    [StructField("Date", StringType(), True)]
    + [
        StructField(column_name, DoubleType(), True)
        for column_name in (
            "Open",
            "High",
            "Low",
            "Close",
            "Adjusted Close",
            "Volume",
        )
    ]
)


In [93]:
# Streaming DataFrame over the CSV directory, parsed with the explicit schema.
df = (
    spark.readStream
    .schema(schema)
    .format("csv")
    .load("/Users/Ahmed Eltabakh/Spark-Course/data/csv")
)

In [94]:
# CSV file sink in append mode: output goes to the course `data/out`
# directory, checkpoints to 'chkpnt_5'.
writer = (
    df.writeStream
    .format('csv')
    .outputMode('append')
    .option('checkpointLocation', 'chkpnt_5')
    .option('path', "/Users/Ahmed Eltabakh/Spark-Course/data/out")
)

In [95]:
# Start the CSV-to-CSV stream and wait up to 10 seconds; the cell displays
# the result of awaitTermination.
query = writer.start()
query.awaitTermination(10)

False

In [90]:
# Stop the streaming query held in `query` (CSV -> CSV files).
query.stop()