In [1]:
# Widen the notebook container so long outputs use the full browser width.
# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
from pyspark.sql import SparkSession

# New API: build (or reuse) a session against the standalone cluster master.
# The options are the same as before, written as one parenthesised chain.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.113:7077")
    .appName("alexistubulekasA3_partA")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .config("spark.cores.max", 6)
    .getOrCreate()
)

# Old API (RDD): the SparkContext that belongs to the session.
spark_context = spark_session.sparkContext

spark_context.setLogLevel("INFO")

In [3]:
#sv = spark_session.sparkContext.textFile('hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.sv').cache()
#en = spark_session.sparkContext.textFile('hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.en').cache()

In [2]:
#from pyspark.sql import SparkSession


# New API        
#
#spark_session = SparkSession\
#        .builder\
#        .master("spark://192.168.2.113:7077")  \
#        .appName("alexistubulekasA3_partA")\
#        .config("spark.dynamicAllocation.enabled", True)\
#        .config("spark.shuffle.service.enabled", True)\
#        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
#        .config("spark.executor.cores",2)\
#        .config("spark.cores.max",2)\
#        .getOrCreate()

In [3]:
# Load the English corpus through the Hadoop TextInputFormat, one record per
# '\n'-delimited line, and cache it because several later cells reuse it.
rdd_en = spark_context.newAPIHadoopFile(
    path='hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.en',
    inputFormatClass='org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    keyClass='org.apache.hadoop.io.LongWritable',
    valueClass='org.apache.hadoop.io.Text',
    conf={'textinputformat.record.delimiter': '\n'},
).cache()  # Keep this RDD in memory!

# A1.1: number of lines in the English corpus
rdd_en.count()

1862234

In [3]:
# A1.4: number of partitions of the English RDD
rdd_en.getNumPartitions()

2

In [4]:
# Load the Swedish corpus through the Hadoop TextInputFormat, one record per
# '\n'-delimited line, and cache it because several later cells reuse it.
rdd_sv = spark_context.newAPIHadoopFile(
    path='hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.sv',
    inputFormatClass='org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    keyClass='org.apache.hadoop.io.LongWritable',
    valueClass='org.apache.hadoop.io.Text',
    conf={'textinputformat.record.delimiter': '\n'},
).cache()  # Keep this RDD in memory!

# A1.2: number of lines in the Swedish corpus
rdd_sv.count()

1862234

In [5]:
# A1.4: number of partitions of the Swedish RDD
rdd_sv.getNumPartitions()

3

In [4]:
# Peek at the first three records of the English RDD to see the record layout.
rdd_en_take3 = rdd_en.take(3)
print(rdd_en_take3)

[(0, 'Resumption of the session'), (26, 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.'), (234, "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.")]


In [5]:
#A2.1
def lowercase_and_split(rdd_input):
    """Lowercase and tokenize the text of every record in an RDD.

    Each record is expected to be indexable, with the text in position 1
    (the second element of the tuple). The text is lowercased and split on
    single space characters, so an empty line yields [''] and consecutive
    spaces yield empty-string tokens.

    Returns the result of ``rdd_input.map(...)``: one list of tokens per record.
    """
    def _tokenize(record):
        text = record[1]
        return text.lower().split(' ')

    return rdd_input.map(_tokenize)

# A.2.2 Inspect 10 entries from each of your RDDs to verify your pre-processing.
# Both corpora go through the same pre-processing function.
rdd_en_lower_split, rdd_sv_lower_split = (
    lowercase_and_split(corpus) for corpus in (rdd_en, rdd_sv)
)

#print(rdd_en_lower_split.take(10))
#print(rdd_sv_lower_split.take(10))

# A.2.3 Verify that the line counts still match after the pre-processing
#print(rdd_en_lower_split.count())
#print(rdd_sv_lower_split.count())

In [37]:

# A.3.1 Use Spark to compute the 10 most frequently occurring words in the English language corpus. Repeat for the other language.
# A.3.2 Verify that your results are reasonable.
from operator import add

# English text: flatten the token lists, emit (word, 1) per token, sum per word.
all_words_en = (
    rdd_en_lower_split
    .flatMap(lambda tokens: tokens)
    .map(lambda token: (token, 1))
)
word_counts_en = all_words_en.reduceByKey(add)
# Negating the count makes takeOrdered return the largest counts first.
print(word_counts_en.takeOrdered(10, key=lambda pair: -pair[1]))

# Swedish text: same pipeline on the Swedish tokens.
all_words_sv = (
    rdd_sv_lower_split
    .flatMap(lambda tokens: tokens)
    .map(lambda token: (token, 1))
)
word_counts_sv = all_words_sv.reduceByKey(add)
print(word_counts_sv.takeOrdered(10, key=lambda pair: -pair[1]))

[('the', 3498375), ('of', 1659758), ('to', 1539760), ('and', 1288401), ('in', 1085993), ('that', 797516), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522849)]
[('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924866), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]


In [None]:
# A.4.1 
# Use this parallel corpus to mine some translations in the form of word pairs, for the two languages. Do this by pairing words found on short lines with 
# the same number of words respectively. We (incorrectly) assume the words stay in the same order when translated.

#Follow this approach. Work with the pair of RDDs you created in question A.2. Hint: make a new pair of RDDs for each step, sv_1, en_1, sv_2, en_2, ...

# 1. Key the lines by their line number (hint: zipWithIndex()).
# 2. Swap the key and value - so that the line number is the key.
# 3. Join the two RDDs together according to the line number key, so you have pairs of matching lines.
# 4. Filter to exclude line pairs that have an empty/missing “corresponding” sentence.
# 5. Filter to leave only pairs of sentences with a small number of words per sentence, this should give a more reliable translation (you can experiment).
# 6. Filter to leave only pairs of sentences with the same number of words in each sentence.
# 7. For each sentence pair, map so that you pair each (in order) word in the two sentences. We no longer need the line numbers. 
# hint: use python’s built in zip() function
# 8. Use reduce to count the number of occurrences of the word-translation-pairs.
# 9. Print some of the most frequently occurring pairs of words.

In [6]:
# 1. Key the lines by their line number: zipWithIndex() pairs each token list
#    with its position, giving (words, line_number) records.
en_1 = rdd_en_lower_split.zipWithIndex()
sv_1 = rdd_sv_lower_split.zipWithIndex()

#print(en_1.take(2))
#print(sv_1.take(2))

range(1, 2, 10)


In [7]:
#2. Swap the key and value - so that the line number is the key
en_2 = en_1.map(lambda w: (w[1], w[0]))
sv_2 = sv_1.map(lambda w: (w[1], w[0]))


# Work on a small sub-sample (the first 5 line numbers) so the join and filter
# logic can be checked cheaply before running on the full corpus.
list_range = range(5)
en_2_sub = en_2.filter(lambda w: w[0] in list_range)
sv_2_sub = sv_2.filter(lambda w: w[0] in list_range)

join_sub = en_2_sub.join(sv_2_sub)
# The sentences were produced with str.split(' '), so an empty line is [''],
# never []. Comparing against [] would not exclude anything.
join_sub = join_sub.filter(lambda w: w[1][0] != [''])\
                   .filter(lambda w: w[1][1] != [''])

#join_sub.collect()

In [8]:
# 3. Join on key: join the two line-number-keyed RDDs, giving
#    (line_number, (en_words, sv_words)) records.
joined_en_sv_3 = en_2.join(sv_2)
#joined_en_sv_3.take(3)


In [85]:
# Inspect a few joined (line_number, (en_words, sv_words)) records.
joined_en_sv_3.take(3)

[(565,
  (['i',
    'mean',
    'the',
    'attitude',
    'where',
    'a',
    'person',
    'wants',
    'to',
    'get',
    'on',
    'in',
    'life,',
    'whether',
    'he',
    'or',
    'she',
    'is',
    'an',
    'employee,',
    'the',
    'owner',
    'of',
    'a',
    'business',
    'or',
    'an',
    'official.'],
   ['jag',
    'menar',
    'den',
    'inställningen',
    'att',
    'en',
    'människa',
    'vill',
    'gå',
    'framåt',
    'i',
    'sitt',
    'liv',
    'oavsett',
    'om',
    'hon',
    'är',
    'arbetare,',
    'företagare',
    'eller',
    'tjänsteman.'])),
 (695,
  (['this',
    'being',
    'the',
    'case,',
    'there',
    'are',
    'considerable',
    'disparities',
    'between',
    'states,',
    'which',
    'may',
    'be',
    'measured',
    'in',
    'various',
    'ways,',
    'such',
    'as,',
    'for',
    'example,',
    'as',
    'a',
    'percentage',
    'of',
    'added',
    'value',
    'and',
    'per',
   

In [11]:
# 4. Filter to exclude line pairs that have an empty/missing “corresponding” sentence.
# An empty sentence is [''], so a pair is kept only when neither side equals [''].
joined_en_sv_4 = joined_en_sv_3.filter(
    lambda kv: kv[1][0] != [''] and kv[1][1] != ['']
)

#joined_en_sv_4.count()

In [13]:
#5. Keep only pairs in which both sentences are short (fewer than 4 words each)
joined_en_sv_5 = joined_en_sv_4.filter(
    lambda kv: all(len(words) < 4 for words in kv[1])
)

joined_en_sv_5.take(10)

[(982955, (['there', 'are', 'solutions.'], ['det', 'finns', 'lösningar.'])),
 (1381225, (['6.'], ['6.'])),
 (1425665, (['(applause)'], ['(applåder)'])),
 (1550685, (['probably', 'not!'], ['troligtvis', 'inte!'])),
 (1036910,
  (['voting', 'time', '(continuation)'], ['omröstning', '(fortsättning)'])),
 (1051495, (['10.'], ['10.'])),
 (1157290, (['10.'], ['10.'])),
 (1176380, (['(applause)'], ['(applåder)'])),
 (1176685, (['honduras', '(debate)'], ['honduras', '(debatt)'])),
 (1253985, (['thank', 'you', 'all.'], ['tack', 'alla.']))]

In [14]:
#6. Filter to leave only pairs of sentences with the same number of words in each sentence
def _has_equal_word_counts(keyed_pair):
    """Return True when both sentences of a (line_number, (en_words, sv_words)) record have the same length."""
    _, (en_words, sv_words) = keyed_pair
    return len(en_words) == len(sv_words)


joined_en_sv_6 = joined_en_sv_5.filter(_has_equal_word_counts)

joined_en_sv_6.take(10)

[(210510, (['vote', '(continuation)'], ['omröstning', '(fortsättning)'])),
 (667265, (['\xa0\xa0', '.'], ['\xa0\xa0', '.'])),
 (860085, (['(applause)'], ['(applåder)'])),
 (644370, (['arms', 'are', 'dangerous.'], ['vapen', 'är', 'farliga.'])),
 (873825, (['(applause)'], ['(applåder)'])),
 (710235, (['\xa0\xa0', '.'], ['\xa0\xa0', '.'])),
 (891970, (['(applause)'], ['(applåder)'])),
 (226235, (['monitoring', 'of', 'bse'], ['övervakning', 'av', 'bse-krisen'])),
 (389585, (['.'], ['.'])),
 (400350, (['.'], ['.']))]

In [76]:
# Stop the SparkContext to release the cores for another application.
# Cells that use spark_context or the RDDs cannot be run again after this
# without re-creating the session.
spark_context.stop()