In [None]:
# Root directory of the input data. Its immediate sub-directories are later
# listed and treated as one directory per language.
# The path is relative, so it is resolved against the kernel's working directory.
ROOT_PATH = "data"

In [10]:
# Imports
import os, re
import findspark
from pyspark import SparkConf, SparkContext

In [11]:
# Initialize findspark for easier usage
# NOTE(review): findspark.init() is normally called *before* importing pyspark,
# because its job is to make pyspark importable. Here pyspark is imported in the
# previous cell, so this call presumably only matters when pyspark is not already
# on sys.path - confirm the intended cell/import order.
findspark.init()

In [5]:
# Setup Spark
# Run locally using all available cores ("local[*]").
conf = SparkConf().setMaster("local[*]").setAppName("LongestWordFinder")

# getOrCreate() returns the already running SparkContext if there is one, so
# re-running this cell in the same kernel does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [43]:
# Collect the path of every sub-directory found directly under ROOT_PATH;
# plain files in the root are ignored.
language_paths = [
    os.path.join(ROOT_PATH, entry_name)
    for entry_name in os.listdir(ROOT_PATH)
    if os.path.isdir(os.path.join(ROOT_PATH, entry_name))
]
print(language_paths)

['data/Dutch', 'data/German', 'data/Russian', 'data/Italian', 'data/English', 'data/French', 'data/Spanish', 'data/Ukrainian']


In [9]:
def find_longest_word(file_path) -> str:
    """Return the longest word found in the text file(s) at ``file_path``.

    The text is read with the notebook-level SparkContext ``sc`` via
    ``sc.textFile``, and every line is split into words with the regular
    expression ``\\w+`` (so digits and underscores count as word characters).

    Args:
        file_path: Path or glob pattern handed to ``sc.textFile``.

    Returns:
        The longest word as a string. If several words share the maximum
        length, the lexicographically greatest one is returned so the result
        does not depend on how the data is partitioned. Returns ``""`` when
        the input contains no words at all.
    """
    # Tokenize: one RDD element per word (r"..." is a raw string literal).
    words = sc.textFile(file_path).flatMap(lambda line: re.findall(r"\w+", line))

    def longer_of(left: str, right: str) -> str:
        # Total order on (length, word) keeps the operator commutative and
        # associative, which Spark requires for a deterministic result.
        return max(left, right, key=lambda word: (len(word), word))

    # fold() with "" as the neutral element also copes with an empty RDD,
    # where reduce() would raise ValueError.
    return words.fold("", longer_of)

In [None]:
results = {}    # language name -> value returned by find_longest_word
for language_path in language_paths:
    txt_glob = os.path.join(language_path, "*.txt")   # all txt files of this language
    language = os.path.basename(language_path)        # directory name is the language
    word = find_longest_word(txt_glob)

    # Keep the stored entry when it is already at least as long as the new one
    if language in results and len(word) <= len(results[language]):
        continue
    results[language] = word

In [None]:
# Order the languages by the length of their stored value, longest first
# (entries of equal length keep their original relative order)
languages_by_length = sorted(results, key=lambda name: len(results[name]), reverse=True)
sorted_results = {name: results[name] for name in languages_by_length}

In [None]:
# Report one "<language> - <word> - <length>" line per language,
# following the order of sorted_results
for language, word in sorted_results.items():
    print(language, word, len(word), sep=" - ")