In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, explode

# Create a SparkSession
spark = SparkSession.builder \
    .appName("WordCount") \
    .getOrCreate()

# Sample text (lorem ipsum)
text_data = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."]

# Create an RDD from the text data
rdd = spark.sparkContext.parallelize(text_data)

# Convert all words to lowercase and split each line into words
words_rdd = rdd.flatMap(lambda line: line.lower().split())

# Remove punctuation from each word
import re
words_cleaned_rdd = words_rdd.map(lambda word: re.sub(r'[^\w\s]', '', word))

# \w: Matches any word character (equivalent to [a-zA-Z0-9_]).
# \s: Matches any whitespace character (spaces, tabs, newlines).
# ^: When used inside square brackets, it negates the character class. So,
# [^\w\s] matches any character that is not a word character or a whitespace character.

# Create pairs (word, 1) using map
word_pairs = words_cleaned_rdd.map(lambda word: (word, 1))

# Reduce by key (word) to count frequency
word_counts = word_pairs.reduceByKey(lambda a, b: a + b)

# Sort the result by word frequency
sorted_word_counts = word_counts.sortBy(lambda x: x[1], ascending=False)

# Collect and display the word counts
for word, count in sorted_word_counts.collect():
    print(f"{word}: {count}")



lorem: 2
ipsum: 2
dolor: 2
adipiscing: 2
sit: 2
consectetur: 2
amet: 2
elit: 2
dolore: 1
incididunt: 1
ut: 1
magna: 1
aliqua: 1
eiusmod: 1
et: 1
labore: 1
sed: 1
do: 1
tempor: 1


24/11/28 07:50:12 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [17]:
# Sample text (lorem ipsum)
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("WordFrequency").getOrCreate()

# Sample text (lorem ipsum)
text_data = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."]

# Create an RDD from the text data
rdd = spark.sparkContext.parallelize(text_data).repartition(2)

# Convert all words to lowercase and split each line into words
words_rdd = rdd.flatMap(lambda line: line.lower().split())

# Remove punctuation from each word
import re
words_cleaned_rdd = words_rdd.map(lambda word: re.sub(r'[^\w\s]', '', word))

# \w: Matches any word character (equivalent to [a-zA-Z0-9_]).
# \s: Matches any whitespace character (spaces, tabs, newlines).
# ^: When used inside square brackets, it negates the character class. So,
# [^\w\s] matches any character that is not a word character or a whitespace character.

# Create pairs (word, 1) using map
word_pairs = words_cleaned_rdd.map(lambda word: (word, 1))

# Reduce by key (word) to count frequency
word_counts = word_pairs.reduceByKey(lambda a, b: a + b).cache()

# Sort the result by word frequency
sorted_word_counts = word_counts.sortBy(lambda x: x[1], ascending=False)
# sorted_word_counts

# Collect and display the word counts
for word, count in sorted_word_counts.collect():
    print(f"{word}: {count}")



lorem: 2
dolor: 2
adipiscing: 2
ipsum: 2
sit: 2
amet: 2
consectetur: 2
elit: 2
sed: 1
eiusmod: 1
incididunt: 1
ut: 1
et: 1
dolore: 1
magna: 1
aliqua: 1
do: 1
tempor: 1
labore: 1


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, explode

# Initialize Spark session
spark = SparkSession.builder.appName("WordFrequency").config("spark.default.parallelism", "4").getOrCreate()

# Sample text (lorem ipsum)
text_data = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
             "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."]

# Create DataFrame with a specified column name
text_df = spark.createDataFrame([(line,) for line in text_data], ["value"])

# Replace multiple delimiters (',', ':', and '.') with a single space
cleaned_text_df = text_df.withColumn("cleaned_text", regexp_replace(col("value"), "[,:.]", " "))

# Split the lines into words
words_df = cleaned_text_df.select(explode(split(col("cleaned_text"), "\s+")).alias("word"))

# Count the occurrences of each word
word_counts = words_df.groupBy("word").count()

# Show the word counts
# word_counts.show()
# save the data into a csv file
word_counts.write.csv("word_counts_test")


24/12/20 15:24:43 WARN Utils: Your hostname, BMMDVADHIKRAMM.local resolves to a loopback address: 127.0.0.1; using 192.168.1.5 instead (on interface en0)
24/12/20 15:24:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/20 15:24:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 0:>                                                          (0 + 4) / 4]

AnalysisException: [PATH_ALREADY_EXISTS] Path file:/Users/adhikram.m/personal/Study_Resources/Pyspark/resource/word_counts_test already exists. Set mode as "overwrite" to overwrite the existing path.

24/12/20 15:24:58 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [2]:
# Stop the SparkSession
spark.stop()