In [1]:
# Define the root path for the data
ROOT_PATH = "/Users/sophia1/Desktop/htw-longest-word/data"

In [2]:
# Imports
import os, re
import findspark
from pyspark import SparkConf, SparkContext

In [3]:
# Initialize findspark for easier ussage
findspark.init()

In [4]:
# Setup Spark
conf = SparkConf().setMaster("local[*]").setAppName("LongestWordFinder")
sc = SparkContext(conf=conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/11 14:10:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/11 14:10:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/01/11 14:10:40 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [5]:
# Gather all language paths
language_paths = []
for os_entry in os.listdir(ROOT_PATH):
    # Concatenate the root path with the current entry
    language_path = os.path.join(ROOT_PATH, os_entry)

    # Check if the current entry is a directory
    if os.path.isdir(language_path):
        language_paths.append(language_path)
print(language_paths)

['/Users/sophia1/Desktop/htw-longest-word/data/Dutch', '/Users/sophia1/Desktop/htw-longest-word/data/German', '/Users/sophia1/Desktop/htw-longest-word/data/Russian', '/Users/sophia1/Desktop/htw-longest-word/data/Italian', '/Users/sophia1/Desktop/htw-longest-word/data/English', '/Users/sophia1/Desktop/htw-longest-word/data/French', '/Users/sophia1/Desktop/htw-longest-word/data/Spanish', '/Users/sophia1/Desktop/htw-longest-word/data/Ukrainian']


In [6]:
# Function to find the longest word in an RDD
def find_longest_word(file_path) -> str:
    rdd = sc.textFile(file_path)
    longest_word = (
        rdd.flatMap(lambda t: re.findall(r"\w+", t))         # Tokenize words -> r in findall means raw string
        .map(lambda w: (w, len(w)))                          # Map: Pair each word with its length
        .reduce(lambda a, b: a if a[1] > b[1] else b)        # Reduce: Find the longest word
    )
    return longest_word

In [7]:
results = {}    # {language: longest_word, ...}
for language_path in language_paths:
    language = os.path.basename(language_path)                      # Extract language from path
    word = find_longest_word(os.path.join(language_path, "*.txt"))  # Find the longest word in txt files in the language path
    
    if language not in results or len(word) > len(results[language]):
        results[language] = word

                                                                                

In [8]:
# Sort results by the length of the longest word
sorted_results = dict(sorted(results.items(), key=lambda item: len(item[1]), reverse=True))

In [9]:
# Print the results starting with the longest word
for language, word in sorted_results.items():
    print(f"{language} - {word} - {len(word)}")

Dutch - ('landbouwgereedschappen', 22) - 2
German - ('eindusendsöbenhunnertuneiunsösstig', 34) - 2
Russian - ('засвидетельствованных', 21) - 2
Italian - ('quattrocentoquarantatremila', 27) - 2
English - ('Mekkamuselmannenmassenmenchenmoerdermohrenmuttermarmormonumentenmacher', 70) - 2
French - ('constitutionnellement', 21) - 2
Spanish - ('circunstanciadamente', 20) - 2
Ukrainian - ('благочестивомудренно', 20) - 2
