In [1]:
# Imports
import os           # module to access operating system 
import findspark    # find pyspark to make it importable
import re           # regular expressions
from pyspark import SparkConf, SparkContext

In [2]:
# Initialize
findspark.init()

In [3]:
conf = SparkConf().setMaster("local[*]").setAppName("LongestWordFinder")
sc = SparkContext(conf=conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/13 20:38:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [13]:
# Function to find the longest word in a folder
def find_longest_word(file_path):
    rdd = sc.textFile(file_path)                            # set RDD (= resilient distributed dataset)
    longest_word = (
        rdd.flatMap(lambda t: re.findall(r"\w+", t))         # Tokenize words (r = raw string)
        .map(lambda w: (w, len(w)))                          # Map: Pair each word (key) with its length (value)
        .reduce(lambda a, b: a if a[1] > b[1] else b)        # Reduce: Find the longest word by comparison
    )
    print(longest_word)
    return longest_word

In [5]:
# Load text files from this folder
parent_folder = "/Users/sophia1/Desktop/htw-longest-word/data"

In [14]:
results = []

# Loop through each language folder
for language_folder in os.listdir(parent_folder):
    language_path = os.path.join(parent_folder, language_folder)

    # if it's a directory (it is a language folder) -> get all txt files
    if os.path.isdir(language_path):
        text_files = os.path.join(language_path, "*.txt")
        print (text_files)
        # Find the longest word in this language and save it
        longest_word, length = find_longest_word(text_files)
        results.append((language_folder, longest_word, length))

/Users/sophia1/Desktop/htw-longest-word/data/Dutch/*.txt
('landbouwgereedschappen', 22)
/Users/sophia1/Desktop/htw-longest-word/data/German/*.txt


                                                                                

('eindusendsöbenhunnertuneiunsösstig', 34)
/Users/sophia1/Desktop/htw-longest-word/data/Russian/*.txt


                                                                                

('засвидетельствованных', 21)
/Users/sophia1/Desktop/htw-longest-word/data/Italian/*.txt


                                                                                

('quattrocentoquarantatremila', 27)
/Users/sophia1/Desktop/htw-longest-word/data/English/*.txt


                                                                                

('Mekkamuselmannenmassenmenchenmoerdermohrenmuttermarmormonumentenmacher', 70)
/Users/sophia1/Desktop/htw-longest-word/data/French/*.txt


                                                                                

('constitutionnellement', 21)
/Users/sophia1/Desktop/htw-longest-word/data/Spanish/*.txt


                                                                                

('circunstanciadamente', 20)
/Users/sophia1/Desktop/htw-longest-word/data/Ukrainian/*.txt




('благочестивомудренно', 20)


                                                                                

In [12]:
# Sort results by the length of the longest word
sorted_results = sorted(results, key=lambda x: x[2], reverse=True)
print(sorted_results)

[('English', 'Mekkamuselmannenmassenmenchenmoerdermohrenmuttermarmormonumentenmacher', 70), ('German', 'eindusendsöbenhunnertuneiunsösstig', 34), ('Italian', 'quattrocentoquarantatremila', 27), ('Dutch', 'landbouwgereedschappen', 22), ('Russian', 'засвидетельствованных', 21), ('French', 'constitutionnellement', 21), ('Spanish', 'circunstanciadamente', 20), ('Ukrainian', 'благочестивомудренно', 20)]


In [11]:
for language, word, length in sorted_results:
    print(f"{language} – {word} – {length}")

English – Mekkamuselmannenmassenmenchenmoerdermohrenmuttermarmormonumentenmacher – 70
German – eindusendsöbenhunnertuneiunsösstig – 34
Italian – quattrocentoquarantatremila – 27
Dutch – landbouwgereedschappen – 22
Russian – засвидетельствованных – 21
French – constitutionnellement – 21
Spanish – circunstanciadamente – 20
Ukrainian – благочестивомудренно – 20
