In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col

# Baseline word count: split every line on single spaces and count each token.

# Create (or reuse) a SparkSession
spark = SparkSession.builder \
    .appName("WordCount") \
    .getOrCreate()

# Read the text file; the lines are exposed through the "value" column
text_df = spark.read.text("text.txt")

# Split the lines into words, one output row per token.
# Splitting on a single space produces empty-string tokens for blank lines and
# for runs of consecutive spaces; drop them so they are not counted as a word.
words_df = text_df \
    .select(explode(split(col("value"), " ")).alias("word")) \
    .filter(col("word") != "")

# Count the occurrences of each word
word_counts = words_df.groupBy("word").count()

# Show the word counts (this is the action that actually runs the query)
word_counts.show()

# Stop the SparkSession
spark.stop()
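# In summary:

# Total Jobs: Creating the SparkSession does not trigger a job. Jobs are triggered
# by actions; the only action here is word_counts.show(), so it launches at least
# one job (the Spark UI may show more, depending on the Spark version and adaptive
# query execution).

# Total Stages: Stages are separated by shuffle boundaries, not by individual
# transformations. Reading the file, splitting lines and exploding into words are
# narrow operations that are pipelined into a single stage; groupBy("word").count()
# requires a shuffle, which adds a second stage. Typically that is two stages in total.

# Total Tasks: Each stage runs one task per partition, so the exact number of tasks
# depends on how many partitions the input is read into and on the number of
# shuffle partitions used for the aggregation. Spark determines this at run time.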

+-----------+-----+
|       word|count|
+-----------+-----+
|        Sed|    1|
|     cillum|    1|
|      velit|    1|
|     fugiat|    1|
|        non|    1|
|    eiusmod|    1|
|         ex|    1|
|    nostrud|    1|
|        sit|    1|
|consectetur|    1|
|    laboris|    1|
|       Duis|    1|
|   occaecat|    1|
|         in|    3|
|       aute|    1|
|       enim|    1|
|  pariatur.|    1|
|    commodo|    1|
|       sunt|    1|
|        qui|    1|
+-----------+-----+
only showing top 20 rows



In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, explode

# Word count that treats ',' and ':' as word separators in addition to whitespace.

# Create (or reuse) a SparkSession
spark = SparkSession.builder \
    .appName("WordCount") \
    .getOrCreate()

# Read the text file; the lines are exposed through the "value" column
text_df = spark.read.text("text.txt")

# Replace the ',' and ':' delimiters with a space so that the whitespace split
# below also separates words on them
cleaned_text_df = text_df.withColumn("cleaned_text", regexp_replace(col("value"), "[,:]", " "))

# Split the lines into words on any run of whitespace.
# The pattern is a raw string: "\s" in a normal Python string literal is an
# invalid escape sequence and raises a warning on recent Python versions.
# Blank lines and leading/trailing whitespace still yield empty-string tokens,
# so drop them before counting.
words_df = cleaned_text_df \
    .select(explode(split(col("cleaned_text"), r"\s+")).alias("word")) \
    .filter(col("word") != "")

# Count the occurrences of each word
word_counts = words_df.groupBy("word").count()

# Show the word counts (this is the action that actually runs the query)
word_counts.show()

# Stop the SparkSession
spark.stop()

+-----------+-----+
|       word|count|
+-----------+-----+
|        Sed|    1|
|     veniam|    1|
|     cillum|    1|
|      velit|    1|
|     fugiat|    1|
|        non|    1|
|    eiusmod|    1|
|         ex|    1|
|    nostrud|    1|
|        sit|    1|
|consectetur|    1|
|    laboris|    1|
|       Duis|    1|
|   occaecat|    1|
|         in|    3|
|       aute|    1|
|       enim|    1|
|  pariatur.|    1|
|    commodo|    1|
|       sunt|    1|
+-----------+-----+
only showing top 20 rows

