In [5]:
from pyspark.sql import SparkSession

import re

# Matches any single character that is neither a word character nor
# whitespace; compiled once so it is not re-resolved for every word.
PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')

# Sample text (lorem ipsum)
text_data = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."]


def clean_word(word):
    """Return *word* with every punctuation character removed."""
    return PUNCTUATION_PATTERN.sub('', word)


def tokenize(line):
    """Split *line* into lowercase words with punctuation stripped.

    Tokens that consist only of punctuation (for example ``--``) become
    empty after cleaning and are dropped, so they are never counted as an
    empty-string "word".
    """
    stripped = (clean_word(token) for token in line.lower().split())
    return [word for word in stripped if word]


def frequency_order(pair):
    """Sort key for a ``(word, count)`` pair.

    Orders by count descending, then by word ascending so that words with
    equal counts are listed in a deterministic order.
    """
    word, count = pair
    return (-count, word)


# Guard the job so the helpers above can be imported without starting Spark.
# In a notebook cell or when run as a script, __name__ is "__main__" and the
# job executes as before.
if __name__ == "__main__":
    # Initialize Spark session
    spark = SparkSession.builder.appName("WordFrequency").getOrCreate()
    try:
        # Create an RDD from the text data
        rdd = spark.sparkContext.parallelize(text_data)

        # Lowercase, split into words, strip punctuation, drop empty tokens
        words_cleaned_rdd = rdd.flatMap(tokenize)

        # Create pairs (word, 1) using map
        word_pairs = words_cleaned_rdd.map(lambda word: (word, 1))

        # Reduce by key (word) to count frequency
        word_counts = word_pairs.reduceByKey(lambda a, b: a + b)

        # Sort by frequency (highest first), breaking ties alphabetically
        sorted_word_counts = word_counts.sortBy(frequency_order)

        # Collect and display the word counts
        for word, count in sorted_word_counts.collect():
            print(f"{word}: {count}")
    finally:
        # Stop the Spark session even if one of the stages above failed
        spark.stop()


[['lorem', 'ipsum', 'dolor', 'sit', 'amet,', 'consectetur', 'adipiscing', 'elit.'], ['lorem', 'ipsum', 'dolor', 'sit', 'amet,', 'consectetur', 'adipiscing', 'elit.'], ['sed', 'do', 'eiusmod', 'tempor', 'incididunt', 'ut', 'labore', 'et', 'dolore', 'magna', 'aliqua.']]
