## Run this Notebook with Spark 2 1G kernel

#### Users MUST shutdown kernels!!!

In [1]:
#Ensure we are using the right kernel
print (sc.version)

2.2.0.cloudera1


In [2]:
import time
import os
import sh
import shutil
from itertools import islice

## File / directory management from Jupyter

#### You can operate with files directories using Notebook's connection to Linux and HDFS

In [3]:
!ls -l "/home/kadochnikov/data/gutenberg/"

total 416
drwxrwxr-x 2 kadochnikov kadochnikov   4096 Nov  4  2016 tom_sawyer_bgs
-rwxr-xr-x 1 kadochnikov kadochnikov 421884 May  8  2015 tom_sawyer.txt


In [4]:
#Using Linux
!rm -r "/home/kadochnikov/data/gutenberg/tom_sawyer_wc"
!ls -lr "/home/kadochnikov/data/gutenberg"

rm: cannot remove ‘/home/kadochnikov/data/gutenberg/tom_sawyer_wc’: No such file or directory
total 416
-rwxr-xr-x 1 kadochnikov kadochnikov 421884 May  8  2015 tom_sawyer.txt
drwxrwxr-x 2 kadochnikov kadochnikov   4096 Nov  4  2016 tom_sawyer_bgs


#### Or you can use Python functionality, which allows using variables to encode path names

In [5]:
dir_src = "/home/kadochnikov/data/gutenberg/"
dir_out = "/home/kadochnikov/data/gutenberg/tom_sawyer_wc"

os.listdir(dir_src)

['tom_sawyer.txt', 'tom_sawyer_bgs']

In [6]:
#Using Python
shutil.rmtree(dir_out, ignore_errors=True)
os.listdir(dir_src)

['tom_sawyer.txt', 'tom_sawyer_bgs']

#### Also choice between shell and Python in HDFS

In [7]:
#Using Linux
!hadoop fs -ls -R "/user/kadochnikov/gutenberg"

-rw-r--r--   3 kadochnikov kadochnikov     421884 2018-04-30 14:49 /user/kadochnikov/gutenberg/tom_sawyer.txt
drwxr-xr-x   - kadochnikov kadochnikov          0 2018-04-30 14:49 /user/kadochnikov/gutenberg/tom_sawyer_bgs
drwxr-xr-x   - kadochnikov kadochnikov          0 2018-04-30 15:09 /user/kadochnikov/gutenberg/tom_sawyer_wc
-rw-r--r--   3 kadochnikov kadochnikov          0 2018-04-30 15:09 /user/kadochnikov/gutenberg/tom_sawyer_wc/_SUCCESS
-rw-r--r--   3 kadochnikov kadochnikov      73218 2018-04-30 15:09 /user/kadochnikov/gutenberg/tom_sawyer_wc/part-00000
-rw-r--r--   3 kadochnikov kadochnikov     143941 2018-04-30 15:09 /user/kadochnikov/gutenberg/tom_sawyer_wc/part-00001


In [8]:
hdfs_dir_src = "hdfs:///user/kadochnikov/gutenberg"
hdfs_dir_out = "hdfs:///user/kadochnikov/gutenberg/tom_sawyer_wc"

In [9]:
#Using Python
try:
    print(sh.hdfs('dfs','-ls',hdfs_dir_src))
except:
    print(hdfs_dir_src+' *** Does not exist ***')

Found 3 items
-rw-r--r--   3 kadochnikov kadochnikov     421884 2018-04-30 14:49 hdfs:///user/kadochnikov/gutenberg/tom_sawyer.txt
drwxr-xr-x   - kadochnikov kadochnikov          0 2018-04-30 14:49 hdfs:///user/kadochnikov/gutenberg/tom_sawyer_bgs
drwxr-xr-x   - kadochnikov kadochnikov          0 2018-04-30 15:09 hdfs:///user/kadochnikov/gutenberg/tom_sawyer_wc



In [10]:
try:
    sh.hdfs('dfs','-rmr',hdfs_dir_out)
except:
    print(hdfs_dir_out+' *** Does not exist ***')

## RDDs with text data

#### Read Tom Sawyer Book

In [11]:
!head "/home/kadochnikov/data/gutenberg/tom_sawyer.txt"

﻿
The Project Gutenberg EBook of The Adventures of Tom Sawyer, Complete by
Mark Twain (Samuel Clemens)

This eBook is for the use of anyone anywhere at no cost and with almost
no restrictions whatsoever. You may copy it, give it away or re-use
it under the terms of the Project Gutenberg License included with this
eBook or online at www.gutenberg.net

Title: The Adventures of Tom Sawyer, Complete


In [12]:
ts = sc.textFile("file:///home/kadochnikov/data/gutenberg/tom_sawyer.txt")
ts.cache()

file:///home/kadochnikov/data/gutenberg/tom_sawyer.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [13]:
ts.take(10)

['',
 'The Project Gutenberg EBook of The Adventures of Tom Sawyer, Complete by',
 'Mark Twain (Samuel Clemens)',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with almost',
 'no restrictions whatsoever. You may copy it, give it away or re-use',
 'it under the terms of the Project Gutenberg License included with this',
 'eBook or online at www.gutenberg.net',
 '',
 'Title: The Adventures of Tom Sawyer, Complete']

In [14]:
ts.map(lambda line: len(line.split())).take(10)

[0, 12, 4, 0, 15, 12, 12, 5, 0, 7]

In [15]:
#Split with space treats empty lines as one-word lines.  Go figure...
ts.map(lambda line: len(line.split(" "))).take(10)

[1, 12, 4, 1, 15, 12, 12, 5, 1, 7]

In [16]:
ts.map(lambda line: len(line.split())).\
reduce(lambda a, b: a if (a > b) else b)

18

In [17]:
wordCounts = ts.flatMap(lambda line: line.split()).\
map(lambda word: (word, 1)).\
reduceByKey(lambda a, b: a+b)

In [18]:
type(wordCounts)

pyspark.rdd.PipelinedRDD

#### Combine the flatMap, map, and the reduceByKey functions to do a word count of each word in the Tom Sawyer book

##### Note the familiar "parts"

In [19]:
wordCounts = ts.flatMap(lambda line: line.split()) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a+b)
wordCounts.take(5)

[('The', 414), ('Project', 78), ('EBook', 1), ('of', 1554), ('Tom', 453)]

In [20]:
#Sort in Ascending Order
wordCounts.sortByKey(ascending=True).take(5)

[('"\'Bout', 1),
 ('"\'Deed', 1),
 ('"\'My', 1),
 ('"\'Nuff!"', 1),
 ('"\'Tain\'t', 3)]

In [21]:
#Sort in Descending Order
wordCounts.sortByKey(ascending=False).take(5)

[('zephyr', 1),
 ('zenith', 1),
 ('zebras--all', 1),
 ('zeal', 1),
 ('youthful', 1)]

#### Happy with sorted results?

In [22]:
wordCountsSorted = wordCounts.map(lambda x:(x[1],x[0])).sortByKey(ascending=False)
wordCountsSorted.take(10)

[(3481, 'the'),
 (2921, 'and'),
 (1768, 'a'),
 (1750, 'to'),
 (1554, 'of'),
 (1123, 'was'),
 (935, 'in'),
 (849, 'he'),
 (787, 'that'),
 (774, 'his')]

#### Saving results into HDFS

In [23]:
wordCountsSorted.saveAsTextFile(hdfs_dir_out) 
#Won't work due to a "feature" in our Spark configuration
#wordCounts.saveAsTextFile("/home/kadochnikov/data/gutenberg/") 

In [24]:
try:
    print(sh.hdfs('dfs','-ls',hdfs_dir_out))
except:
    print(hdfs_dir_out+' *** Does not exist ***')

Found 3 items
-rw-r--r--   3 kadochnikov kadochnikov          0 2018-04-30 15:44 hdfs:///user/kadochnikov/gutenberg/tom_sawyer_wc/_SUCCESS
-rw-r--r--   3 kadochnikov kadochnikov      73218 2018-04-30 15:44 hdfs:///user/kadochnikov/gutenberg/tom_sawyer_wc/part-00000
-rw-r--r--   3 kadochnikov kadochnikov     143941 2018-04-30 15:44 hdfs:///user/kadochnikov/gutenberg/tom_sawyer_wc/part-00001



In [25]:
word_counts_saved = sc.textFile(hdfs_dir_out)
word_counts_saved.take(10)

["(3481, 'the')",
 "(2921, 'and')",
 "(1768, 'a')",
 "(1750, 'to')",
 "(1554, 'of')",
 "(1123, 'was')",
 "(935, 'in')",
 "(849, 'he')",
 "(787, 'that')",
 "(774, 'his')"]

## Some NLP Operations.  Examples of Word and N-Grams

#### The first problem is that values in each partition of our initial RDD describe lines from the file rather than sentences. Sentences may be split over multiple lines. The glom() RDD method is used to create a single entry for each document containing the list of all lines, we can then join the lines up, then resplit them into sentences using "." as the separator, using flatMap so that every object in our RDD is now a sentence. 

In [26]:
sentences = sc.textFile("file:///home/kadochnikov/data/gutenberg/tom_sawyer.txt") \
    .glom() \
    .map(lambda x: " ".join(x)) \
    .flatMap(lambda x: x.split("."))

#### Now we have isolated each sentence we can split it into a list of words and extract the word bigrams from it. Our new RDD contains tuples containing the word bigram (itself a tuple containing the first and second word) as the first value and the number 1 as the second value. 

In [27]:
bigrams = sentences.map(lambda x:x.split()) \
    .flatMap(lambda x: [((x[i],x[i+1]),1) for i in range(0,len(x)-1)])

#### Finally we can apply the same reduceByKey and sort steps that we used in the wordcount example, to count up the bigrams and sort them in order of descending frequency. In reduceByKey the key is not an individual word but a bigram. 

In [28]:
freq_bigrams = bigrams.reduceByKey(lambda x,y:x+y) \
    .map(lambda x:(x[1],x[0])) \
    .sortByKey(ascending=False)
freq_bigrams.take(10)

[(382, ('of', 'the')),
 (309, ('in', 'the')),
 (178, ('to', 'the')),
 (176, ('and', 'the')),
 (112, ('with', 'a')),
 (112, ('was', 'a')),
 (110, ('he', 'was')),
 (110, ('it', 'was')),
 (101, ('and', 'then')),
 (101, ('in', 'a'))]

#### Creating bigrams, trigrams and fourgrams in Spark

In [29]:
sentences = sc.textFile("file:///home/kadochnikov/data/gutenberg/tom_sawyer.txt") \
    .glom() \
    .map(lambda x: " ".join(x)) \
    .map(lambda x: x.replace(',',' ').replace('.',' ').replace('-',' ').lower()) \
    .map(lambda x: x.replace('gutenberg','').replace('project','').replace('literary','').replace('archive','').replace('foundation','')) \
    .flatMap(lambda x: x.split("."))
    
sentences.cache()


bigrams = sentences.map(lambda x:x.split()).\
    flatMap(lambda x: [((x[i],x[i+1]),1) for i in range(0,len(x)-1)]).\
    reduceByKey(lambda x,y:x+y).\
    map(lambda x:(x[1],x[0])).sortByKey(ascending=False)
    
trigrams = sentences.map(lambda x:x.split()).\
    flatMap(lambda x: [((x[i],x[i+1],x[i+2]),1) for i in range(0,len(x)-2)]).\
    reduceByKey(lambda x,y:x+y).\
    map(lambda x:(x[1],x[0])).sortByKey(ascending=False)
    
fourgrams = sentences.map(lambda x:x.split()).\
    flatMap(lambda x: [((x[i],x[i+1],x[i+2],x[i+3]),1) for i in range(0,len(x)-3)]).\
    reduceByKey(lambda x,y:x+y).\
    map(lambda x:(x[1],x[0])).sortByKey(ascending=False)

In [30]:
bigrams.take(10)

[(385, ('of', 'the')),
 (319, ('in', 'the')),
 (187, ('and', 'the')),
 (181, ('to', 'the')),
 (173, ('it', 'was')),
 (148, ('he', 'was')),
 (125, ('and', 'then')),
 (117, ('was', 'a')),
 (117, ('he', 'had')),
 (116, ('there', 'was'))]

In [31]:
trigrams.take(10)

[(44, ('there', 'was', 'a')),
 (31, ('by', 'and', 'by')),
 (25, ('there', 'was', 'no')),
 (22, ('out', 'of', 'the')),
 (18, ('it', 'was', 'a')),
 (17, ('he', 'did', 'not')),
 (16, ('he', 'had', 'been')),
 (15, ('to', 'be', 'a')),
 (15, ('out', 'of', 'his')),
 (14, ('and', 'began', 'to'))]

In [32]:
fourgrams.take(10)

[(9, ('the', 'rest', 'of', 'the')),
 (9, ('terms', 'of', 'this', 'agreement')),
 (8, ('the', 'terms', 'of', 'this')),
 (7, ('the', 'middle', 'of', 'the')),
 (7, ('from', 'time', 'to', 'time')),
 (6, ('a', 'moment', 'and', 'then')),
 (6, ('but', 'there', 'was', 'no')),
 (6, ('then', 'there', 'was', 'a')),
 (6, ('at', 'the', 'end', 'of')),
 (6, ('i', "don't", 'want', 'to'))]