In [1]:
from operator import add
import re
from collections import OrderedDict
from operator import itemgetter 
import itertools
from pyspark.sql import SparkSession

# (8 cores, 16gb per machine) x 5 = 40 cores

# New API
# Build (or reuse) the session that talks to the standalone cluster master.
spark_session = (
    SparkSession.builder
    .master("spark://ben-spark-master:7077")
    .config('spark.executor.memory', '2g')
    # A maxResultSize of 0 removes the cap on results collected to the driver.
    .config('spark.driver.maxResultSize', 0)
    # .config('spark.executor.cores', 2)
    .appName("common_crawl")
    .getOrCreate()
)

# Old API (RDD)
spark_context = spark_session.sparkContext

# (*/*) - out of memory
# ~6.4mins for 39496 files. (...00000/)  (takes 1 minute with 40 partitions)
# ~5 secs for 10 files (...00000/0*) 
# ~20 secs for 11110 files (...00000/1*) 

# Read every matching file as one record; the code below uses element [1] of each
# record as the file content. cache() only marks the RDD to be kept in memory.
rdd = spark_context.wholeTextFiles(
    '/mnt/nfs/ben-spark-master/teaching/crawl/CC-MAIN-2018-03/splits/CC-MAIN-20180317035630-20180317055630-00000.warc.wet/1*',
    minPartitions=40,  # Get 40 partitions here.
).cache()

# First action on the RDD: this is what actually runs the read.
rdd.count()
# Only one job (previous .cache() did not trigger a job)

11110

In [None]:
# Show how many partitions the loaded RDD ended up with
# (minPartitions=40 was requested in the wholeTextFiles call).
rdd.getNumPartitions()

In [None]:
# Print the address of the Spark web UI for this context, to watch jobs and stages.
print(spark_context.uiWebUrl)

In [2]:
## Example #1 - Filter by TLD and compute most common words ##

# Host suffix to keep. Try '.ac.uk', '.ru', '.se', '.com'
TLD_SUFFIX = '.ac.uk'

# The pattern is built from raw strings so the regex escapes reach `re` intact, and
# re.escape() makes the dots in the suffix literal (a bare '.' matches any character).
# The suffix is tied to the end of the host part of the URI (optionally followed by a
# port, path, query or fragment), so a look-alike inside the path or query string no
# longer lets a document through the filter.
p = re.compile(
    r'WARC-Target-URI: [a-z][a-z0-9+.-]*://(?:[^/\s@]*@)?[^/\s:?#]*'
    + re.escape(TLD_SUFFIX)
    + r'(?=[:/?#\s]|$)',
    re.IGNORECASE,
)

# Pipeline: keep matching documents -> drop the WARC header (everything up to the
# first blank line) -> split into tokens on spaces and newlines -> count tokens ->
# return the 100 most frequent.
# No intermediate .cache(): the tokenised RDD is consumed by this single job only,
# so caching it would just occupy executor memory with data nobody reads again.
(
    rdd
    .filter(lambda doc: bool(p.search(doc[1])))
    .map(lambda filename_content: filename_content[1].partition('\r\n\r\n')[2])
    .flatMap(lambda t: t.split(' '))
    .flatMap(lambda w: w.split('\n'))
    .map(lambda w: w.strip())
    .map(lambda w: (w, 1))
    .reduceByKey(add)
    .takeOrdered(100, key=lambda x: -x[1])
)

# .cache().take(5)

# See: http://spark.apache.org/docs/2.3.0/api/python/index.html

[('and', 245),
 ('of', 152),
 ('the', 149),
 ('to', 91),
 ('Diploma', 78),
 ('University', 74),
 ('Intermediate', 72),
 ('Research', 67),
 ('', 60),
 ('Design', 56),
 ('in', 53),
 ('a', 48),
 ('for', 47),
 ('at', 45),
 ('&', 45),
 ('|', 42),
 ('Engineering', 38),
 ('About', 36),
 ('by', 34),
 ('Browse', 34),
 ('Cambridge', 33),
 ('Department', 30),
 ('The', 30),
 ('PhD', 26),
 ('on', 25),
 ('students', 25),
 ('-', 24),
 ('Contact', 24),
 ('with', 24),
 ('research', 23),
 ('Publications', 23),
 ('your', 23),
 ('International', 22),
 ('Media', 21),
 ('News', 21),
 ('Bristol', 21),
 ('Courses', 20),
 ('Search', 19),
 ('Business', 19),
 ('How', 19),
 ('more', 19),
 ('Undergraduate', 19),
 ('Us', 19),
 ('Study', 19),
 ('Student', 19),
 ('Current', 19),
 ('you', 18),
 ('2', 18),
 ('Alumni', 18),
 ('pp.', 18),
 ('School', 18),
 ('study', 17),
 ('Warwick', 17),
 ('our', 17),
 ('Information', 17),
 ('Find', 16),
 ('Events', 16),
 ('Services', 16),
 ('2017', 16),
 ('be', 15),
 ('Boero,', 15),
 (

In [None]:
## Example #2 - Group by TLD and compute most common words for each ##

# Sample record header for trying the pattern out. The lines are joined with explicit
# CRLF; backslash continuations inside a string literal would glue them together with
# no separator at all.
ex = ("WARC-Type: conversion\r\n"
      "WARC-Target-URI: http://news.bbc.co.uk/2/hi/africa/3414345.stm\r\n"
      "WARC-Date: 2014-08-02T09:52:13Z")

# Captures the last label of the *host* in the WARC-Target-URI header:
#   scheme://  [userinfo@]  host-labels  .  TLD  then port/path/query/fragment/space/end
# Anchoring on the host stops a dotted path segment (".../v1.ab/...") from being
# reported as the TLD, and the lookahead accepts URIs with no trailing slash.
# NOTE: only 2-3 letter TLDs are recognised (same restriction as before); documents
# on longer TLDs such as .info yield None.
p = re.compile(
    r'WARC-Target-URI: [a-z][a-z0-9+.-]*://(?:[^/\s@]*@)?[^/\s:?#]+'
    r'\.([a-z]{2,3})(?=[:/?#\s]|$)',
    re.IGNORECASE,
)
# print(p.search(ex).group(1))
# uk

def get_tld(content):
    """Return the lower-cased TLD of the document's target URI, or None.

    Parameters:
        content: full text of one record (header plus body) as a string.

    Returns:
        The 2-3 letter top-level domain found in the ``WARC-Target-URI`` header,
        lower-cased so that ``COM`` and ``com`` fall into the same group, or
        ``None`` when no such header / TLD is found.
    """
    match = p.search(content)
    if match is None:
        return None
    return match.group(1).lower()

# Build (tld, token) pairs: tag each document body with its TLD, discard documents
# whose TLD could not be determined, then break the body into stripped tokens.
words_by_tld_rdd = (
    rdd
    .map(lambda name_and_text: (
        get_tld(name_and_text[1]),
        name_and_text[1].partition('\r\n\r\n')[2],
    ))
    .filter(lambda pair: pair[0] is not None)
    .flatMapValues(lambda body: body.split(' '))
    .flatMapValues(lambda chunk: chunk.split('\n'))
    .mapValues(lambda token: token.strip())
)

# Peek at the first few pairs.
print(words_by_tld_rdd.take(10))

# Number of tokens per TLD, ordered from the largest count to the smallest.
tlds = OrderedDict(
    sorted(
        words_by_tld_rdd.countByKey().items(),
        key=lambda tld_and_count: tld_and_count[1],
        reverse=True,
    )
)
print(tlds)

# The ten TLDs with the most tokens.
top_tlds = dict(list(tlds.items())[:10])

print(top_tlds)

print("Results:")

# One job per TLD: keep that TLD's tokens, count them, take the 20 most frequent.
for tld in top_tlds:
    print(tld)
    top_words_for_tld = (
        words_by_tld_rdd
        .filter(lambda pair: pair[0] == tld)
        .map(lambda pair: (pair[1], 1))
        .reduceByKey(add)
        .takeOrdered(20, key=lambda word_count: -word_count[1])
    )
    print(top_words_for_tld)

In [None]:
#file_content = rdd.take(1)[0][1]
#print(file_content.partition('\r\n\r\n')[2])
from operator import add
import re

# Sample record header. The lines are joined with explicit CRLF; backslash
# continuations inside a string literal would concatenate them with no separator.
ex = ("WARC-Type: conversion\r\n"
      "WARC-Target-URI: http://news.bbc.co.uk/2/hi/africa/3414345.stm\r\n"
      "WARC-Date: 2014-08-02T09:52:13Z")

# Group 1 captures the TLD together with an optional second-level label ("co.uk").
# - Each quantifier is `{2,3}`; a doubled closing brace would be a literal '}' that
#   no URL contains, so the pattern could never match.
# - `\S+?` is lazy: a greedy `\S+` always swallows the second-level label itself,
#   which leaves the optional group permanently empty.
# - Raw string, so the regex escapes are passed to `re` unchanged.
p = re.compile(r'WARC-Target-URI: \S+?\.(([a-zA-Z]{2,3}\.)?[a-zA-Z]{2,3})/', re.IGNORECASE)

print(p.search(ex))

#print(bool(p.search('\nWARC-Target-URI:\n')))

#rdd\
#.filter(lambda doc: bool(p.search(doc[1])))\
#.map(lambda filename_content: filename_content[1].partition('\r\n\r\n')[2])\
#.flatMap(lambda t: t.split(' '))\
#.flatMap(lambda w: w.split('\n'))\
#.map(lambda w: w.strip())\
#.map(lambda w: (w,1))\
#.reduceByKey(add)\
#.takeOrdered(100, key=lambda x: -x[1])
#.take(100)
#.take(10)
#.flatMap(lambda text: text.split(' ')).take(100)

In [None]:
# Shut the session down when finished with the notebook.
spark_session.stop()