# PART A

### CONFIGURATION

In [1]:
from pyspark.sql import SparkSession

spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.113:7077") \
        .appName("EricJonsson_A")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",2)\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()


In [2]:
spark_context = spark_session.sparkContext

### Questions A.1
Note: I cleaned up all prints and verification steps which were not specifically requested and merged some steps into single code boxes. 

In [3]:
# 1.1, 1.2, 1.3, 1.4

# Load Textfiles
textfile_eng = spark_context.textFile("hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.en")
textfile_swe = spark_context.textFile("hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.sv")

# Count and print number of lines and number of partitions
print('Total Number of Lines: \nEnglish Transcript: ', textfile_eng.count(), '\nSwedish Transcript: ', textfile_swe.count())
print('\nPartitions: \nEnglish Transcript: ', textfile_eng.getNumPartitions(), '\nSwedish Transcript: ', textfile_swe.getNumPartitions() )

Total Number of Lines: 
English Transcript:  1862234 
Swedish Transcript:  1862234

Partitions: 
English Transcript:  2 
Swedish Transcript:  3


### Questions A.2

In [4]:
# 2.1, 2.2, 2.3

# Function to preprocess
def pre_process(rdd):
    # Lowercase
    lowercase_rdd = rdd.map(lambda x: x.lower())
    # Split on space
    return lowercase_rdd.map(lambda x: x.split(' '))

# Apply Function to english and swedish RDD
preprocessed_eng = pre_process(textfile_eng.map(lambda x: x.lower()))
preprocessed_swe = pre_process(textfile_swe.map(lambda x: x.lower()))

# Print 10 lines from each, and the total number of lines for each text. 
print("English: \n", preprocessed_eng.take(10))
print("\nSwedish: \n", preprocessed_swe.take(10))
print('\nTotal Number of Lines: \nEnglish Transcript: ', preprocessed_eng.count(), '\nSwedish Transcript: ', preprocessed_swe.count())

English: 
 [['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', 'behal

### Questions A.3

In [5]:
# 3.1, 3.2

from operator import add

def wordcountMap(line):
    for word in line:
        return (word, 1)

# MAP
wordcount_mapstage_eng = preprocessed_eng.map(wordcountMap)
wordcount_mapstage_swe = preprocessed_swe.map(wordcountMap)
# REDUCE
wordcount_eng = wordcount_mapstage_eng.reduceByKey(add)
wordcount_swe = wordcount_mapstage_swe.reduceByKey(add)
# SORT IN DESCENDING ORDER
wordcount_eng_sorted = wordcount_eng.sortBy(ascending = False, keyfunc = lambda el: el[1]).take(10)
wordcount_swe_sorted = wordcount_swe.sortBy(ascending = False, keyfunc = lambda el: el[1]).take(10)

# PRINT
print("10 Most Common Words: English")
print(wordcount_eng_sorted, '\n')
print("10 Most Common Words: Swedish")
print(wordcount_swe_sorted)

10 Most Common Words: English
[('the', 235844), ('i', 201870), ('we', 129603), ('it', 105177), ('in', 91596), ('this', 82526), ('mr', 56025), ('that', 36863), ('as', 33289), ('\xa0\xa0', 30200)] 

10 Most Common Words: Swedish
[('jag', 207086), ('det', 198923), ('vi', 133076), ('i', 66383), ('detta', 51184), ('för', 48686), ('herr', 46659), ('den', 43543), ('de', 34602), ('men', 30438)]


### Questions A.4

In [6]:
# 4.1, 4.2
rdd_eng_1 = preprocessed_eng.zipWithIndex().map(lambda keyval: (keyval[1], keyval[0]))
rdd_swe_1 = preprocessed_swe.zipWithIndex().map(lambda keyval: (keyval[1], keyval[0]))
# 4.3
rdd_combined_1 = rdd_eng_1.join(rdd_swe_1)

# Function to check if both strings are exclusively alphanumeric
def alphanumericCheck(line):
    alpha = False
    for word in line[1][0]:
        if word.isalpha():
            alpha = True    
    for word in line[1][1]:
        if word.isalpha():
            alpha = True
    return alpha

# Apply all filters 
#         (Checks: Must contain a matching line, Lines contain less than 5 words, 
#          Both are non-empty, No empty words, must be alphanumeric, same number of words in both lines)
rdd_combined_2 = rdd_combined_1\
                .filter(lambda line: len(line[1]) == 2)\
                .filter(lambda line: len(line[1][0]) < 5)\
                .filter(lambda line: len(line[1][0]) != 0)\
                .filter(lambda line: len(line[1][1]) != 0)\
                .filter(lambda line: len(line[1][0][0]) != 0)\
                .filter(lambda line: len(line[1][1][0]) != 0)\
                .filter(lambda line: alphanumericCheck(line))\
                .filter(lambda line: len(line[1][0]) == len(line[1][1]))
rdd_combined_2.take(10)

rdd_combined_3 = rdd_combined_2.map(lambda pair: list(zip(pair[1][0],pair[1][1]))[0]).map(lambda x: ((x[0],x[1]),1))
#rdd_combined_3.sortByKey().take(10)

rdd_combined_count = rdd_combined_3.reduceByKey(add)
#rdd_combined_count.take(10)
rdd_combined_count_sorted = rdd_combined_count.sortBy(ascending = False, keyfunc = lambda el: el[1])
print("10 Most Common Translations: ")
print(rdd_combined_count_sorted.take(10))

10 Most Common Translations: 
[(('the', 'jag'), 1324), (('the', 'debatten'), 1186), (('written', 'skriftliga'), 847), (('that', 'det'), 787), (('i', 'jag'), 611), (('we', 'vi'), 533), (('this', 'detta'), 482), (('it', 'det'), 453), (('applause', 'applåder'), 429), (('this', 'det'), 321)]


A.10

The translations are unreliable but accurate enough that I believe the parsing is correct. 