In [10]:
# Imports
import os, re, glob
import findspark
from pyspark import SparkConf, SparkContext

In [11]:
# Initialize findspark for easier usage.
# NOTE(review): pyspark is already imported in the imports cell, before this call
# runs — confirm whether init() is still needed or should precede that import.
findspark.init()

In [5]:
# Setup Spark
# Run locally on all available cores under a recognisable application name.
conf = SparkConf().setMaster("local[*]").setAppName("LongestWordFinder")
# getOrCreate() reuses an already-running context, so re-executing this cell in a
# live kernel does not fail with "Cannot run multiple SparkContexts at once".
# Note: when a context already exists, `conf` is ignored and that context is returned.
sc = SparkContext.getOrCreate(conf=conf)

In [22]:
# Gather all txt file paths (searched recursively below root_path)
root_path = "data"
# glob returns matches in arbitrary order; sorting keeps the processing order
# (and therefore tie-breaking between equally long words) reproducible across runs.
txt_paths = sorted(glob.glob(os.path.join(root_path, '**', '*.txt'), recursive=True))

In [9]:
# Function to find the longest word in a text file using Spark
def find_longest_word(file_path) -> str:
    """Return the longest word found in the text file at ``file_path``.

    The file is read line by line through the module-level SparkContext ``sc``.
    A "word" is any match of the regex ``\\w+`` (letters, digits and underscore).

    Args:
        file_path: Path (or any URI accepted by ``sc.textFile``) of the file to scan.

    Returns:
        The longest word as a plain string. When several words share the maximum
        length, the one encountered later wins. An empty string is returned when
        the file contains no words at all.
    """
    # Tokenize every line into words (raw-string pattern avoids escaping the backslash)
    words = sc.textFile(file_path).flatMap(lambda line: re.findall(r"\w+", line))

    # fold() with "" as the neutral element behaves like reduce() but does not
    # raise on an empty RDD; the result is the word itself, not a (word, length) pair,
    # so callers can safely use len() on it.
    return words.fold("", lambda best, word: best if len(best) > len(word) else word)

In [None]:
# Longest word seen so far per language: {language: longest_word, ...}
results = {}
for txt_path in txt_paths:
    # The name of the directory holding the file is used as the language label
    language = os.path.basename(os.path.dirname(txt_path))
    word = find_longest_word(txt_path)

    # Keep the current entry unless this file produced a strictly longer word
    if language in results and len(word) <= len(results[language]):
        continue
    results[language] = word

In [40]:
# Order the languages by the length of their longest word, longest first
sorted_results = {
    lang_name: longest
    for lang_name, longest in sorted(
        results.items(),
        key=lambda pair: len(pair[1]),
        reverse=True,
    )
}

{'German': 'Donaudampfschifffahrtsgesellschaftskapitän', 'English': 'antidisestablishmentarianism', 'Italian': 'precipitevolissimevolmente', 'French': 'anticonstitutionnellement', 'Spanish': 'electroencefalografista'}


In [41]:
# Report each language with its longest word and that word's length,
# in the order established by sorted_results (longest word first)
for language in sorted_results:
    word = sorted_results[language]
    print(f"{language} - {word} - {len(word)}")

German: Donaudampfschifffahrtsgesellschaftskapitän - 42
English: antidisestablishmentarianism - 28
Italian: precipitevolissimevolmente - 26
French: anticonstitutionnellement - 25
Spanish: electroencefalografista - 23
