In [1]:
from pyspark.sql import SparkSession
from operator import add

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.119:7077") \
        .appName("xiaoxia_a3a")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",2)\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/14 13:45:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/14 13:45:41 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [2]:
# A1
# Load files from HDFS

en_1 = spark_context\
       .textFile("hdfs://host-192-168-2-119-de1:9000/europarl/europarl-v7.sv-en.en")\
       .cache()
sv_1 = spark_context\
       .textFile("hdfs://host-192-168-2-119-de1:9000/europarl/europarl-v7.sv-en.sv")\
       .cache()

print(en_1.first())
print(sv_1.first())

                                                                                

Resumption of the session


[Stage 1:>                                                          (0 + 1) / 1]

Återupptagande av sessionen


                                                                                

In [3]:
# A1
lines_num_en = en_1.count()
print(f'The number of lines (en): {lines_num_en}')
print(en_1.getNumPartitions()) 

lines_num_sv = sv_1.count()
print(f'The number of lines (sv): {lines_num_sv}')
print(sv_1.getNumPartitions()) 

                                                                                

The number of lines (en): 1862234
2




The number of lines (sv): 1862234
3


                                                                                

In [4]:
# A2 Lowercase the text, and Tokenize the text (split on space)
def func_a2(lines):
    lines = lines.lower()
    lines = lines.split(' ')
    return lines

en_2 = en_1.map(func_a2)

print(en_2.count())
en_2.take(10)

[Stage 4:>                                                          (0 + 2) / 2]

1862234


                                                                                

[['resumption', 'of', 'the', 'session'],
 ['i',
  'declare',
  'resumed',
  'the',
  'session',
  'of',
  'the',
  'european',
  'parliament',
  'adjourned',
  'on',
  'friday',
  '17',
  'december',
  '1999,',
  'and',
  'i',
  'would',
  'like',
  'once',
  'again',
  'to',
  'wish',
  'you',
  'a',
  'happy',
  'new',
  'year',
  'in',
  'the',
  'hope',
  'that',
  'you',
  'enjoyed',
  'a',
  'pleasant',
  'festive',
  'period.'],
 ['although,',
  'as',
  'you',
  'will',
  'have',
  'seen,',
  'the',
  'dreaded',
  "'millennium",
  "bug'",
  'failed',
  'to',
  'materialise,',
  'still',
  'the',
  'people',
  'in',
  'a',
  'number',
  'of',
  'countries',
  'suffered',
  'a',
  'series',
  'of',
  'natural',
  'disasters',
  'that',
  'truly',
  'were',
  'dreadful.'],
 ['you',
  'have',
  'requested',
  'a',
  'debate',
  'on',
  'this',
  'subject',
  'in',
  'the',
  'course',
  'of',
  'the',
  'next',
  'few',
  'days,',
  'during',
  'this',
  'part-session.'],
 ['in',
  

In [5]:
# A2_sv
sv_2 = sv_1.map(func_a2)

print(sv_2.count())
sv_2.take(10)



1862234


                                                                                

[['återupptagande', 'av', 'sessionen'],
 ['jag',
  'förklarar',
  'europaparlamentets',
  'session',
  'återupptagen',
  'efter',
  'avbrottet',
  'den',
  '17',
  'december.',
  'jag',
  'vill',
  'på',
  'nytt',
  'önska',
  'er',
  'ett',
  'gott',
  'nytt',
  'år',
  'och',
  'jag',
  'hoppas',
  'att',
  'ni',
  'haft',
  'en',
  'trevlig',
  'semester.'],
 ['som',
  'ni',
  'kunnat',
  'konstatera',
  'ägde',
  '"den',
  'stora',
  'år',
  '2000-buggen"',
  'aldrig',
  'rum.',
  'däremot',
  'har',
  'invånarna',
  'i',
  'ett',
  'antal',
  'av',
  'våra',
  'medlemsländer',
  'drabbats',
  'av',
  'naturkatastrofer',
  'som',
  'verkligen',
  'varit',
  'förskräckliga.'],
 ['ni',
  'har',
  'begärt',
  'en',
  'debatt',
  'i',
  'ämnet',
  'under',
  'sammanträdesperiodens',
  'kommande',
  'dagar.'],
 ['till',
  'dess',
  'vill',
  'jag',
  'att',
  'vi,',
  'som',
  'ett',
  'antal',
  'kolleger',
  'begärt,',
  'håller',
  'en',
  'tyst',
  'minut',
  'för',
  'offren',
  'f

In [31]:
# A3 The 10 most frequently according words in the English language corpus

all_words_en = en_1\
            .map(lambda s: s.lower())\
            .flatMap(lambda t: t.split(' '))\
            .flatMap(lambda w: w.split('\n'))

all_words_count_en = all_words_en\
            .map(lambda w: w.strip())\
            .map(lambda w: (w,1))

word_counts_en = all_words_count_en.reduceByKey(add)

print('The 10 most frequently according words in the English language corpus: \n')
print(word_counts_en.takeOrdered(10, key=lambda x: -x[1]))

The 10 most frequently according words in the English language corpus: 





[('the', 3498452), ('of', 1659758), ('to', 1539760), ('and', 1288402), ('in', 1085994), ('that', 797519), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522851)]


                                                                                

In [32]:
# A3 The 10 most frequently according words in the Swedish language corpus

all_words_sv = sv_1\
            .map(lambda s: s.lower())\
            .flatMap(lambda t: t.split(' '))\
            .flatMap(lambda w: w.split('\n'))

all_words_count_sv = all_words_sv\
            .map(lambda w: w.strip())\
            .map(lambda w: (w,1))

word_counts_sv = all_words_count_sv.reduceByKey(add)

print('The 10 most frequently according words in the Swedish language corpus: \n')
print(word_counts_sv.takeOrdered(10, key=lambda x: -x[1]))

The 10 most frequently according words in the Swedish language corpus: 





[('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924868), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]


                                                                                

In [38]:
# A4
# Key the lines by their line number

en_index = en_1.zipWithIndex()
print(en_index.take(5))

sv_index = sv_1.zipWithIndex()
print(sv_index.take(5))

                                                                                

[('Resumption of the session', 0), ('I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.', 1), ("Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.", 2), ('You have requested a debate on this subject in the course of the next few days, during this part-session.', 3), ("In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.", 4)]




[('Återupptagande av sessionen', 0), ('Jag förklarar Europaparlamentets session återupptagen efter avbrottet den 17 december. Jag vill på nytt önska er ett gott nytt år och jag hoppas att ni haft en trevlig semester.', 1), ('Som ni kunnat konstatera ägde "den stora år 2000-buggen" aldrig rum. Däremot har invånarna i ett antal av våra medlemsländer drabbats av naturkatastrofer som verkligen varit förskräckliga.', 2), ('Ni har begärt en debatt i ämnet under sammanträdesperiodens kommande dagar.', 3), ('Till dess vill jag att vi, som ett antal kolleger begärt, håller en tyst minut för offren för bl.a. stormarna i de länder i Europeiska unionen som drabbats.', 4)]


                                                                                

In [39]:
# Swap the key and value
en_swap = en_index.map(lambda x: (x[1], x[0]))
print(en_swap.take(5))

print("")

sv_swap = sv_index.map(lambda x: (x[1], x[0]))
print(sv_swap.take(5))

[(0, 'Resumption of the session'), (1, 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.'), (2, "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful."), (3, 'You have requested a debate on this subject in the course of the next few days, during this part-session.'), (4, "In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.")]

[(0, 'Återupptagande av sessionen'), (1, 'Jag förklarar Europaparlamentets session återupptagen efter avbrottet den 17 december. Jag vill på nytt önska er ett gott nytt år och jag hoppas att ni

In [42]:
# Join the two RDDs together
en_sv = en_swap.join(sv_swap).sortBy(lambda x: x[0])
print(en_sv.take(5))
print(en_sv.count())

                                                                                

[(0, ('Resumption of the session', 'Återupptagande av sessionen')), (1, ('I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.', 'Jag förklarar Europaparlamentets session återupptagen efter avbrottet den 17 december. Jag vill på nytt önska er ett gott nytt år och jag hoppas att ni haft en trevlig semester.')), (2, ("Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.", 'Som ni kunnat konstatera ägde "den stora år 2000-buggen" aldrig rum. Däremot har invånarna i ett antal av våra medlemsländer drabbats av naturkatastrofer som verkligen varit förskräckliga.')), (3, ('You have requested a debate on this subject in the course of the next few days, during this part-session.', 'Ni har begärt en debatt 



1862234


                                                                                

In [65]:
# Filter, select the sentences that have less than six words
def func_a4(lines):
    lines = lines.lower()
    lines = lines.split(' ')
    num = len(lines)
    return num

en_sv_filter = en_sv\
    .map(lambda x: x[1])\
    .filter(lambda y: (func_a4(y[0]) < 6) and (func_a4(y[0]) > 0))\
    .filter(lambda z: (func_a4(z[1]) < 6) and (func_a4(z[1]) > 0))\
    .filter(lambda w: (func_a4(w[0]) == func_a4(w[1])))

print(en_sv_filter.take(3))
print(en_sv_filter.count())

                                                                                

[('Agenda', 'Arbetsplan'), ('That did not happen.', 'Så blev inte fallet.'), ('This is an important matter.', 'Detta är en viktig fråga.')]




38016


                                                                                

In [77]:
en_sv_filter_en = en_sv_filter.flatMap(lambda x: func_a2(x[0]))
en_sv_filter_sv = en_sv_filter.flatMap(lambda y: func_a2(y[1]))

print(en_sv_filter_en.take(10))
print(en_sv_filter_sv.take(10))

                                                                                

['agenda', 'that', 'did', 'not', 'happen.', 'this', 'is', 'an', 'important', 'matter.']


[Stage 214:>                                                        (0 + 1) / 1]

['arbetsplan', 'så', 'blev', 'inte', 'fallet.', 'detta', 'är', 'en', 'viktig', 'fråga.']


                                                                                

In [78]:
en_sv_zip = en_sv_filter_en.zip(en_sv_filter_sv)
print(en_sv_zip.take(10))

[Stage 217:>                                                        (0 + 1) / 1]

[('agenda', 'arbetsplan'), ('that', 'så'), ('did', 'blev'), ('not', 'inte'), ('happen.', 'fallet.'), ('this', 'detta'), ('is', 'är'), ('an', 'en'), ('important', 'viktig'), ('matter.', 'fråga.')]


                                                                                

In [80]:
en_sv_pairs = en_sv_zip.map(lambda w: (w,1))
en_sv_pairs_counts = en_sv_pairs.reduceByKey(add)

# The 10 most frequently occurring pairs of words
print('The 10 most frequently occurring pairs of words: \n')
print(en_sv_pairs_counts.takeOrdered(10, key=lambda x: -x[1]))

The 10 most frequently occurring pairs of words: 





[(('is', 'är'), 4699), (('closed.', 'avslutad.'), 2951), (('(applause)', '(applåder)'), 2546), (('', '.'), 2223), (('.', '.'), 2084), (('that', 'det'), 1494), (('we', 'vi'), 1443), (('the', 'jag'), 1336), (('is', 'debatten'), 1327), (('debate', 'förklarar'), 1319)]


                                                                                

In [81]:
# release the cores for another application!
spark_context.stop()