# Part A - Working with the RDD API

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Build (or reuse) the Spark session for this notebook. The master URL, app
# name and resource settings are passed through to Spark unchanged.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.70:7077")
    .appName("derrick-adjei_app-test")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.cores.max", 4)
    .getOrCreate()
)



In [3]:
def add(a, b):
    """Return ``a + b``.

    Addition is associative and commutative, which is what a distributed
    reduce needs from its combining function.
    """
    total = a + b
    return total

# Smoke test of the cluster: sum the squares of the even integers in [0, 10**7).
rdd = spark_session.sparkContext.parallelize(range(10 ** 7))

result = (
    rdd.filter(lambda n: n % 2 == 0)
       .map(lambda n: n * n)
       .reduce(add)
)

print(result)


[Stage 0:>                                                          (0 + 2) / 2]

166666616666670000000




## A.1.1  Read the English transcripts with Spark, and count the number of lines.

In [5]:
# English side of the Swedish-English Europarl corpus, one RDD element per line.
rdd_en = spark_session.sparkContext.textFile(
    "hdfs://192.168.2.70:9000/europarl/europarl-v7.sv-en.en",
    use_unicode=True,
)

## A.1.2 Do the same with the other language (so that you have a separate lineage of RDDs for each).

In [6]:
# Swedish side of the Swedish-English Europarl corpus, read as its own RDD lineage.
rdd_sv = spark_session.sparkContext.textFile(
    "hdfs://192.168.2.70:9000/europarl/europarl-v7.sv-en.sv",
    use_unicode=True,
)

In [4]:
# Peek at the first 10 English lines to confirm the file was read as text.
rdd_en.take(10)

                                                                                

['Resumption of the session',
 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.',
 "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.",
 'You have requested a debate on this subject in the course of the next few days, during this part-session.',
 "In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.",
 "Please rise, then, for this minute' s silence.",
 "(The House rose and observed a minute' s silence)",
 'Madam President, on a point of order.',
 'You will be aware from the press and television that there have be

In [6]:
# Peek at the first 10 Swedish lines to confirm the file was read as text.
rdd_sv.take(10)

                                                                                

['Återupptagande av sessionen',
 'Jag förklarar Europaparlamentets session återupptagen efter avbrottet den 17 december. Jag vill på nytt önska er ett gott nytt år och jag hoppas att ni haft en trevlig semester.',
 'Som ni kunnat konstatera ägde "den stora år 2000-buggen" aldrig rum. Däremot har invånarna i ett antal av våra medlemsländer drabbats av naturkatastrofer som verkligen varit förskräckliga.',
 'Ni har begärt en debatt i ämnet under sammanträdesperiodens kommande dagar.',
 'Till dess vill jag att vi, som ett antal kolleger begärt, håller en tyst minut för offren för bl.a. stormarna i de länder i Europeiska unionen som drabbats.',
 'Jag ber er resa er för en tyst minut.',
 '(Parlamentet höll en tyst minut.)',
 'Fru talman! Det gäller en ordningsfråga.',
 'Ni känner till från media att det skett en rad bombexplosioner och mord i Sri Lanka.',
 'En av de personer som mycket nyligen mördades i Sri Lanka var Kumar Ponnambalam, som besökte Europaparlamentet för bara några månader se

## A.1.3 Verify the line counts are the same for the two languages

In [10]:
# Count the lines of each corpus; each count() runs a Spark job over the file.
rdd_sv_count = rdd_sv.count()
rdd_en_count = rdd_en.count()

print(f'English: {rdd_en_count}, Swedish: {rdd_sv_count}')

# Enforce the check rather than relying on reading the printed numbers:
# the translation mining further down joins the two corpora by line number,
# which is only meaningful when both files have the same number of lines.
assert rdd_en_count == rdd_sv_count, (
    f'line count mismatch: English={rdd_en_count}, Swedish={rdd_sv_count}'
)


English: 1862234, Swedish: 1862234


## A.1.4 Count the number of partitions.

In [11]:
# Number of partitions of the English RDD
rdd_en.getNumPartitions()

2

In [12]:
# Number of partitions of the Swedish RDD
rdd_sv.getNumPartitions()

3

## A.2.1 Pre-process text function

In [13]:
import string
# Translation table mapping every character in string.punctuation (ASCII
# punctuation only) to a single space. Built once so that each call is a
# single pass over the text instead of one str.replace pass per character.
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation)
)


def remove_punctuation(input_string):
    """Replace ASCII punctuation with spaces and trim surrounding whitespace.

    Each punctuation character becomes a space (rather than being deleted) so
    that words joined by punctuation, e.g. ``"2000-buggen"``, are split into
    separate tokens by a later ``str.split()``.

    Args:
        input_string: The text to clean.

    Returns:
        The text with every character of ``string.punctuation`` replaced by a
        space and leading/trailing whitespace removed. Non-ASCII punctuation
        is left untouched.
    """
    return input_string.translate(_PUNCTUATION_TO_SPACE).strip()

def pre_process(lines):
    """Tokenise every line of an RDD of text lines.

    Each line is lower-cased, passed through ``remove_punctuation`` and split
    on whitespace. The result has one list of tokens per input line, so the
    number of elements is unchanged.

    Args:
        lines: An RDD (or any object with a compatible ``map``) of strings.

    Returns:
        The mapped collection of token lists.
    """
    def tokenize(line):
        cleaned = remove_punctuation(line.lower())
        return cleaned.split()

    return lines.map(tokenize)

In [14]:
# A.2.1 Pre-process both corpora: one list of lower-cased tokens per line
rdd_sv_preprocessed = pre_process(rdd_sv)
rdd_en_preprocessed = pre_process(rdd_en)

## A.2.2 Inspect 10 entries from each RDD to verify your pre-processing

In [17]:
# Show the first 2 pre-processed Swedish lines (the section heading asks for 10).
rdd_sv_preprocessed.take(2)

[['återupptagande', 'av', 'sessionen'],
 ['jag',
  'förklarar',
  'europaparlamentets',
  'session',
  'återupptagen',
  'efter',
  'avbrottet',
  'den',
  '17',
  'december',
  'jag',
  'vill',
  'på',
  'nytt',
  'önska',
  'er',
  'ett',
  'gott',
  'nytt',
  'år',
  'och',
  'jag',
  'hoppas',
  'att',
  'ni',
  'haft',
  'en',
  'trevlig',
  'semester']]

In [18]:
# Show the first 2 pre-processed English lines (the section heading asks for 10).
rdd_en_preprocessed.take(2)

[['resumption', 'of', 'the', 'session'],
 ['i',
  'declare',
  'resumed',
  'the',
  'session',
  'of',
  'the',
  'european',
  'parliament',
  'adjourned',
  'on',
  'friday',
  '17',
  'december',
  '1999',
  'and',
  'i',
  'would',
  'like',
  'once',
  'again',
  'to',
  'wish',
  'you',
  'a',
  'happy',
  'new',
  'year',
  'in',
  'the',
  'hope',
  'that',
  'you',
  'enjoyed',
  'a',
  'pleasant',
  'festive',
  'period']]

## A.2.3 Verify line counts

In [19]:
# pre_process maps each line to one token list, so this should equal the raw English line count.
rdd_en_preprocessed.count()

1862234

In [18]:
# pre_process maps each line to one token list, so this should equal the raw Swedish line count.
rdd_sv_preprocessed.count()

                                                                                

1862234

## A.3.1 Compute the 10 most frequently occurring words in the English language corpus

In [20]:
# English word frequencies: flatten the token lists, emit (word, 1) and sum per word.
word_counts_en = (
    rdd_en_preprocessed
    .flatMap(lambda tokens: tokens)
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [21]:
# The 10 most frequent English words, highest count first.
# top() selects the 10 largest entries by count directly, so the whole
# vocabulary does not have to be sorted just to read 10 rows.
word_counts_en.top(10, key=lambda w: w[1])

[('the', 3505660),
 ('of', 1662880),
 ('to', 1545223),
 ('and', 1320012),
 ('in', 1099304),
 ('that', 839167),
 ('a', 776796),
 ('is', 774956),
 ('for', 538470),
 ('we', 526440)]

In [22]:
# Swedish word frequencies: flatten the token lists, emit (word, 1) and sum per word.
word_counts_sv = (
    rdd_sv_preprocessed
    .flatMap(lambda tokens: tokens)
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [23]:
# The 10 most frequent Swedish words, highest count first.
# top() selects the 10 largest entries by count directly, so the whole
# vocabulary does not have to be sorted just to read 10 rows.
word_counts_sv.top(10, key=lambda w: w[1])

[('att', 1709733),
 ('och', 1350995),
 ('i', 1054584),
 ('det', 952598),
 ('som', 917546),
 ('för', 915040),
 ('av', 740741),
 ('är', 701795),
 ('en', 636518),
 ('vi', 545683)]

## A.3.2 Verify that your results are reasonable.

In [24]:
# Sanity check: count every token in the English corpus independently of
# word_counts_en, then compare it with the sum of the per-word counts.
# Every token is keyed by the same constant so the reduce yields one pair.
total_words_en = (
    rdd_en_preprocessed
    .flatMap(lambda tokens: tokens)
    .map(lambda _token: ('count', 1))
    .reduceByKey(lambda left, right: left + right)
)

[('count', 46029484)]

In [26]:
# Materialise the single ('count', total) pair for the English corpus.
total_words_en.take(1)

[('count', 46029484)]

Total words from English Corpus is **46,029,484**

In [29]:
# Sum of all per-word counts in word_counts_en, reduced under one constant key.
word_counts_en_total_rdd = (
    word_counts_en
    .map(lambda word_count: ('count', word_count[1]))
    .reduceByKey(lambda left, right: left + right)
)

In [30]:
# Materialise the single ('count', total) pair derived from word_counts_en.
word_counts_en_total_rdd.take(1)

[('count', 46029484)]

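Total words from our word count RDD is **46,029,484**, which matches the total counted directly from the corpus, so the per-word aggregation neither lost nor double-counted any words.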

## A.4.1 Mine some translations in the form of word pairs, for the two languages

In [64]:
# Fresh pre-processed RDDs for the translation mining, two per language.
sv_1 = pre_process(rdd_sv)
sv_2 = pre_process(rdd_sv)
en_1 = pre_process(rdd_en)
en_2 = pre_process(rdd_en)

### 1. Key lines by their line number 

In [65]:
# zipWithIndex pairs each token list with its position in the RDD: (tokens, line_number).
# NOTE(review): sv_1/en_1 are rebound in place, so re-running this cell on its own
# would index the already-indexed pairs a second time; re-run the previous cell first.
sv_1 = sv_1.zipWithIndex()
en_1 = en_1.zipWithIndex()

### 2. Swap key and values, so line number is key

In [67]:
# Flip each (first, second) pair to (second, first) so the line number becomes the key.
sv_1 = sv_1.map(lambda pair: (pair[1], pair[0]))
en_1 = en_1.map(lambda pair: (pair[1], pair[0]))

In [68]:
# Verify the swap: the first element should now be (line_number, tokens).
en_1.take(1)

[(0, ['resumption', 'of', 'the', 'session'])]

### 3 & 4 & 5 Join the two RDDs together according to the line number key, so you have pairs of matching lines
### Filter to exclude line pairs that have an empty/missing “corresponding” sentence.
### Filter to leave only pairs of sentences with a small number of words per sentence, this should give a more reliable translation

In [69]:
def _is_short_sentence(entry):
    """Return True when a (line_number, tokens) entry has between 1 and 6 tokens."""
    return 1 <= len(entry[1]) <= 6


# Keep only non-empty, short sentences on both sides, then inner-join on the
# line number; lines missing from either side drop out of the join.
translation = sv_1.filter(_is_short_sentence).join(en_1.filter(_is_short_sentence))

### 6. Filter to leave only pairs of sentences that have the same number of words in both languages

In [70]:
def _same_length(entry):
    """Return True when both sentences of a (line_number, (sv, en)) entry have equal length."""
    sv_tokens, en_tokens = entry[1]
    return len(sv_tokens) == len(en_tokens)


# Only equally long sentence pairs can be paired word by word.
translation = translation.filter(_same_length)

In [76]:
# Verify the filters: each entry is (line_number, (swedish_tokens, english_tokens)).
translation.take(3)

[(50, (['arbetsplan'], ['agenda'])),
 (185,
  (['transport', 'av', 'farligt', 'gods', 'på', 'väg'],
   ['transport', 'of', 'dangerous', 'goods', 'by', 'road'])),
 (255,
  (['jag', 'förklarar', 'debatten', 'avslutad'],
   ['the', 'debate', 'is', 'closed']))]

### 7. For each pair, map to pair each word (in order) with the two sentences.

In [72]:
# Pair the words of the two sentences position by position; the line number is dropped.
trans_pairs = translation.map(lambda entry: list(zip(*entry[1])))

In [73]:
# Verify the word pairs: one list of (swedish_word, english_word) tuples per sentence pair.
trans_pairs.take(3)

[[('arbetsplan', 'agenda')],
 [('transport', 'transport'),
  ('av', 'of'),
  ('farligt', 'dangerous'),
  ('gods', 'goods'),
  ('på', 'by'),
  ('väg', 'road')],
 [('jag', 'the'),
  ('förklarar', 'debate'),
  ('debatten', 'is'),
  ('avslutad', 'closed')]]

### 8. Count the number of occurrences of the word translation pairs

In [74]:
# Emit ((swedish_word, english_word), 1) for every word pair and sum per pair.
trans_pairs_count = (
    trans_pairs
    .flatMap(lambda sentence_pairs: [(word_pair, 1) for word_pair in sentence_pairs])
    .reduceByKey(lambda left, right: left + right)
)

### 9. Print some of the most frequently occurring pairs of words

In [75]:
# The 20 most frequent word pairs, highest count first.
# top() selects the 20 largest entries by count directly, so the full set of
# pairs does not have to be sorted just to read 20 rows.
trans_pairs_count.top(20, key=lambda p: p[1])

[(('är', 'is'), 6201),
 (('applåder', 'applause'), 3346),
 (('avslutad', 'closed'), 2994),
 (('vi', 'we'), 2361),
 (('det', 'that'), 2127),
 (('detta', 'this'), 1866),
 (('jag', 'i'), 1858),
 (('det', 'it'), 1601),
 (('inte', 'not'), 1371),
 (('jag', 'the'), 1340),
 (('debatten', 'is'), 1326),
 (('förklarar', 'debate'), 1318),
 (('debatten', 'the'), 1266),
 (('härmed', 'is'), 1248),
 (('är', 'debate'), 1219),
 (('en', 'a'), 1199),
 (('det', 'this'), 1024),
 (('är', 'are'), 967),
 (('det', 'there'), 941),
 (('artikel', 'rule'), 899)]

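Most of the pairs are plausible translations (e.g. `är`/`is`, `vi`/`we`, `inte`/`not`). The wrong ones, such as `jag`/`the`, `förklarar`/`debate` and `debatten`/`is`, come from pairing words purely by position: a frequent sentence pair like "jag förklarar debatten avslutad" / "the debate is closed" has the same number of words in both languages but a different word order.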