In [None]:
# Set up Spark SQL: install PySpark into the environment used by this notebook's kernel.
# Note: when running locally you also need a JVM (Java runtime), e.g. from
# https://www.oracle.com/java/technologies/downloads/
# Alternatively, consider running this notebook in https://colab.research.google.com/
%pip install pyspark

In [None]:
import os
import re
import shutil

from pyspark.sql import Row
from pyspark.sql import SparkSession

# Make sure a Java runtime can be located before starting Spark.
# Spark's launcher uses $JAVA_HOME/bin/java when JAVA_HOME is set and otherwise
# falls back to the `java` executable on PATH, so only fail when neither is
# available (a hosted environment may ship Java without exporting JAVA_HOME).
java_home = os.environ.get('JAVA_HOME')
if not java_home and shutil.which('java') is None:
    raise EnvironmentError(
        "JAVA_HOME environment variable is not set and no 'java' executable was found on PATH."
    )

# Configure a local Spark session that uses every available core (local[*]).
# The heartbeat interval (60s) is kept well below the network timeout (600s).
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .config("spark.network.timeout", "600s")
    .config("spark.executor.heartbeatInterval", "60s")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

sc = spark.sparkContext
# WARN keeps notebook cell output readable; switch to 'DEBUG' temporarily only
# when troubleshooting Spark itself, since it emits a very large volume of log lines.
sc.setLogLevel('WARN')

In [None]:
# Download the bible as txt
!curl -L "https://www.gutenberg.org/cache/epub/10/pg10.txt" > bible.txt

In [None]:
# Use SparkSQL to list all the words in the bible with their counts sorted descending by count.
# Turn in your code and a sample of your results (at least the top 20).
# Be sure to ignore case and punctuation and eliminate the blank lines.
def putBookIntoWordsTable(bookFileTxt, tableName):
    """Load a text file and register its words as a Spark SQL temp view.

    Each line is lower-cased and split on every character that is not in
    ``a-z``; the empty strings produced by the split (blank lines, runs of
    punctuation/whitespace/digits) are discarded. The surviving words become
    a one-column DataFrame (column ``word``, one row per occurrence) that is
    registered under ``tableName``.

    Relies on the module-level SparkContext ``sc``.

    Args:
        bookFileTxt: Path of the text file, passed to ``sc.textFile``.
        tableName: Name of the temporary view to create or replace.

    Raises:
        ValueError: If the file yields no words at all.
    """
    def splitIntoWords(rawLine):
        # Lower-case first so the a-z character class also ignores case.
        return [tok for tok in re.split('[^a-z]', rawLine.lower()) if tok != ""]

    tokens = sc.textFile(bookFileTxt).flatMap(splitIntoWords)

    # Fail loudly rather than registering an empty view.
    if tokens.isEmpty():
        raise ValueError("The words RDD is empty.")

    tokens.map(lambda tok: Row(word=tok)).toDF().createOrReplaceTempView(tableName)

# Validate file path: fail early with a clear message if the download cell
# was skipped or did not produce the file.
bookFileTxt = "bible.txt"
if not os.path.exists(bookFileTxt):
    raise FileNotFoundError(f"The file {bookFileTxt} does not exist.")

# Execute the function: registers the temp view "bible" with one row per word.
putBookIntoWordsTable(bookFileTxt, "bible")

# Run SQL query: word frequencies, most frequent first.
# Keep the DataFrame itself in `result`; DataFrame.show() only prints and
# returns None, so chaining it onto the assignment would leave `result` empty.
result = spark.sql(
    "SELECT word, COUNT(*) as count FROM bible WHERE word != '' GROUP BY word ORDER BY count DESC"
)
result.show(20)

In [None]:
# Want to try something else?