In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col

# Create a SparkSession
spark = SparkSession.builder \
    .appName("WordCount") \
    .getOrCreate()

# Read the text file
text_df = spark.read.text("text.txt")

# Split the lines into words
words_df = text_df.select(\
    explode(split(col("value"), " ")).alias("word"))

# Count the occurrences of each word
word_counts = words_df.groupBy("word").count()

# Show the word counts
word_counts.show()

# Stop the SparkSession
spark.stop()
# In summary:

# Total Jobs: One job is triggered during the creation of SparkSession.

# Total Stages: At least three stages are created: one for reading the text file, 
# one for splitting lines into words, and one for the word count operation.

# Total Tasks: The exact number of tasks depends on the number of partitions 
# in the data and the size of the DataFrames after each transformation. Spark dynamically determines the number of tasks.

+-----------+-----+
|       word|count|
+-----------+-----+
|        Sed|    1|
|     cillum|    1|
|      velit|    1|
|     fugiat|    1|
|        non|    1|
|    eiusmod|    1|
|         ex|    1|
|    nostrud|    1|
|        sit|    1|
|consectetur|    1|
|    laboris|    1|
|       Duis|    1|
|   occaecat|    1|
|         in|    3|
|       aute|    1|
|       enim|    1|
|  pariatur.|    1|
|    commodo|    1|
|       sunt|    1|
|        qui|    1|
+-----------+-----+
only showing top 20 rows



In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, explode

# Create a SparkSession
spark = SparkSession.builder \
    .appName("WordCount") \
    .getOrCreate()

# Sample text (lorem ipsum)
text_data = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."]

# Create an RDD from the text data
rdd = spark.sparkContext.parallelize(text_data)

# Convert all words to lowercase and split each line into words
words_rdd = rdd.flatMap(lambda line: line.lower().split())

# Remove punctuation from each word
import re
words_cleaned_rdd = words_rdd.map(lambda word: re.sub(r'[^\w\s]', '', word))

# \w: Matches any word character (equivalent to [a-zA-Z0-9_]).
# \s: Matches any whitespace character (spaces, tabs, newlines).
# ^: When used inside square brackets, it negates the character class. So,
# [^\w\s] matches any character that is not a word character or a whitespace character.

# Create pairs (word, 1) using map
word_pairs = words_cleaned_rdd.map(lambda word: (word, 1))

# Reduce by key (word) to count frequency
word_counts = word_pairs.reduceByKey(lambda a, b: a + b)

# Sort the result by word frequency
sorted_word_counts = word_counts.sortBy(lambda x: x[1], ascending=False)

# Collect and display the word counts
for word, count in sorted_word_counts.collect():
    print(f"{word}: {count}")

# Stop the Spark session
spark.stop()


lorem: 2
ipsum: 2
dolor: 2
adipiscing: 2
sit: 2
consectetur: 2
amet: 2
elit: 2
dolore: 1
incididunt: 1
ut: 1
magna: 1
aliqua: 1
eiusmod: 1
et: 1
labore: 1
sed: 1
do: 1
tempor: 1


In [2]:
# Sample text (lorem ipsum)
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("WordFrequency").getOrCreate()

# Sample text (lorem ipsum)
text_data = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."]

# Create an RDD from the text data
rdd = spark.sparkContext.parallelize(text_data)

# Convert all words to lowercase and split each line into words
words_rdd = rdd.flatMap(lambda line: line.lower().split())

# Remove punctuation from each word
import re
words_cleaned_rdd = words_rdd.map(lambda word: re.sub(r'[^\w\s]', '', word))

# \w: Matches any word character (equivalent to [a-zA-Z0-9_]).
# \s: Matches any whitespace character (spaces, tabs, newlines).
# ^: When used inside square brackets, it negates the character class. So,
# [^\w\s] matches any character that is not a word character or a whitespace character.

# Create pairs (word, 1) using map
word_pairs = words_cleaned_rdd.map(lambda word: (word, 1))

# Reduce by key (word) to count frequency
word_counts = word_pairs.reduceByKey(lambda a, b: a + b)

# Sort the result by word frequency
sorted_word_counts = word_counts.sortBy(lambda x: x[1], ascending=False)

# Collect and display the word counts
for word, count in sorted_word_counts.collect():
    print(f"{word}: {count}")

# Stop the Spark session
spark.stop()


# Create an RDD from the text data
rdd = spark.sparkContext.parallelize(text_data)

# Convert all words to lowercase and split each line into words
words_rdd = rdd.flatMap(lambda line: line.lower().split())

# Remove punctuation from each word
import re
words_cleaned_rdd = words_rdd.map(lambda word: re.sub(r'[^\w\s]', '', word))

# \w: Matches any word character (equivalent to [a-zA-Z0-9_]).
# \s: Matches any whitespace character (spaces, tabs, newlines).
# ^: When used inside square brackets, it negates the character class. So,
# [^\w\s] matches any character that is not a word character or a whitespace character.

# Create pairs (word, 1) using map
word_pairs = words_cleaned_rdd.map(lambda word: (word, 1))

# Reduce by key (word) to count frequency
word_counts = word_pairs.reduceByKey(lambda a, b: a + b)

# Sort the result by word frequency
sorted_word_counts = word_counts.sortBy(lambda x: x[1], ascending=False)

# Collect and display the word counts
for word, count in sorted_word_counts.collect():
    print(f"{word}: {count}")

# Stop the Spark session
spark.stop()


24/09/23 12:24:16 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

lorem: 2
ipsum: 2
dolor: 2
adipiscing: 2
sit: 2
consectetur: 2
amet: 2
elit: 2
dolore: 1
incididunt: 1
ut: 1
magna: 1
aliqua: 1
eiusmod: 1
et: 1
labore: 1
sed: 1
do: 1
tempor: 1


In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, explode

# Initialize Spark session
spark = SparkSession.builder.appName("WordFrequency").getOrCreate()

# Sample text (lorem ipsum)
text_data = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."]

# Create DataFrame with a specified column name
text_df = spark.createDataFrame([(line,) for line in text_data], ["value"])

# Replace multiple delimiters (',', ':', and '.') with a single space
cleaned_text_df = text_df.withColumn("cleaned_text", regexp_replace(col("value"), "[,:.]", " "))

# Split the lines into words
words_df = cleaned_text_df.select(explode(split(col("cleaned_text"), "\s+")).alias("word"))

# Count the occurrences of each word
word_counts = words_df.groupBy("word").count()

# Show the word counts
word_counts.show()

# Stop the SparkSession
spark.stop()

+-----------+-----+
|       word|count|
+-----------+-----+
|        sit|    2|
|consectetur|    2|
|      dolor|    2|
|      Lorem|    2|
|      ipsum|    2|
|           |    3|
|       elit|    2|
|       amet|    2|
| adipiscing|    2|
|        Sed|    1|
|    eiusmod|    1|
|     aliqua|    1|
|      magna|    1|
| incididunt|    1|
|         do|    1|
|     labore|    1|
|     dolore|    1|
|     tempor|    1|
|         et|    1|
|         ut|    1|
+-----------+-----+

