Andrew Hall

# Create a Python Spark program that does the following:

In [1]:
# Install a headless Java 8 JDK and PySpark for this notebook.
# Output is kept quiet: apt-get runs with -qq and its stdout is sent to
# /dev/null, and pip runs with -q.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q pyspark

import os

# Name of the Python executable PySpark should use.
os.environ["PYSPARK_PYTHON"]="python3"
# Java 8 OpenJDK directory (presumably where the apt-get install in this cell
# places the JDK -- confirm if the base image changes).
os.environ["JAVA_HOME"]="/usr/lib/jvm/java-8-openjdk-amd64/"

import pyspark

from pyspark import SparkConf, SparkContext

# Run Spark in local mode ("local[*]") with the executor memory set to 1g.
conf = SparkConf().setMaster("local[*]").set("spark.executor.memory", "1g")
# getOrCreate() returns the already-running SparkContext if there is one, so
# re-running this cell in a live kernel does not raise the
# "Cannot run multiple SparkContexts at once" error that SparkContext(conf=...)
# would.
sc = SparkContext.getOrCreate(conf=conf)

### 1. Loads each line of the text file 16-0.txt as an entry in an RDD:
https://www.gutenberg.org/cache/epub/16328/pg16328.txt

Hint: you can use this in your Colab notebook to download the file automatically:

`!wget -q -O 16-0.txt https://www.gutenberg.org/files/16/16-0.txt`

In [2]:
# Quietly download the text and save it locally as 16-0.txt.
!wget -q -O 16-0.txt https://www.gutenberg.org/files/16/16-0.txt
# Alternative source URL from the assignment text, left disabled (the test cell
# below notes that the old dataset does not work).
# !wget -q -O 16-0.txt https://www.gutenberg.org/cache/epub/16328/pg16328.txt

In [3]:
# Load the downloaded file as an RDD in which each entry is one line of text.
file_lines_rdd = sc.textFile("16-0.txt")

### 2. Maps each RDD entry so that each entry contains a tuple of (lowercase set of letters, the original line) (hint: use Python's set() together with the lower() and isalpha() string methods). For example, the following line:


`"{The famous race of Spear-Danes.}"`

would be represented as the tuple

`({'a', 'c', 'd', 'e', 'f', 'h', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u'}, "{The famous race of Spear-Danes.}")`

In [4]:
def set_of_lower_letters(line: str) -> set:
  """Return the set of distinct letters in `line`, lowercased.

  Characters for which `str.isalpha()` is false (digits, spaces,
  punctuation, ...) are ignored.

  Args:
    line: The text to extract letters from.

  Returns:
    A set holding the lowercase form of every alphabetic character in `line`.
  """
  alphabetic_chars = filter(str.isalpha, line)
  return set(map(str.lower, alphabetic_chars))

# Remove duplicate lines first, then pair every remaining line with the set of
# lowercase letters it contains: (letter_set, original_line).
unique_lines_rdd = file_lines_rdd.distinct()
letters_to_line_rdd = unique_lines_rdd.map(
    lambda text: (set_of_lower_letters(text), text)
)


### 3. Finds and prints the 5 lines in the file with the highest Jaccard Similarity to your full name. Recall that Jaccard Similarity is the size of the intersection / size of the union of two sets. For example using my name "James Atlas" the output should be:

`['States.', '1859. Alliterative measures.', 'BATTLE-SARK.--Armor.', '{The gleeman sings}', '          Than the head and the handle handsome with jewels;']`

In [5]:
# Full name that every line of the file is compared against.
MY_NAME = "Andrew Hall"

# Set of lowercase letters in the name, computed once and reused for every line.
my_name_set = set_of_lower_letters(MY_NAME)

In [6]:
def jaccard_similarity(set1: set, set2: set) -> float:
  """Return the Jaccard similarity of two sets.

  The similarity is the size of the intersection divided by the size of the
  union, giving a value between 0.0 (no shared elements) and 1.0 (identical
  sets).

  Args:
    set1: The first set.
    set2: The second set.

  Returns:
    len(intersection) / len(union). When both sets are empty the union is
    empty too; two empty sets are identical, so 1.0 is returned instead of
    raising ZeroDivisionError.
  """
  union_size = len(set1.union(set2))
  if union_size == 0:
    # Both inputs are empty (e.g. a blank line compared with a name that has
    # no letters); avoid dividing by zero.
    return 1.0
  return len(set1.intersection(set2)) / union_size

In [7]:
### TEST ###
# This check fails because the datasets are different: the file from the
# original assignment doesn't work, so a different text is loaded above and the
# assignment's expected lines are not in it.
reference_name_letters = set_of_lower_letters("James Atlas")
expected = [
    'States.',
    '1859. Alliterative measures.',
    'BATTLE-SARK.--Armor.',
    '{The gleeman sings}',
    ' Than the head and the handle handsome with jewels;',
]
# Pair each expected line with its similarity to the reference name.
expected_to_sim = [
    (expected_line, jaccard_similarity(reference_name_letters, set_of_lower_letters(expected_line)))
    for expected_line in expected
]
print(expected_to_sim)
states_line_count = file_lines_rdd.filter(lambda text: text == 'States.').count()
print("Does the textfile contain 'States.':", states_line_count > 0)

[('States.', 0.5714285714285714), ('1859. Alliterative measures.', 0.5454545454545454), ('BATTLE-SARK.--Armor.', 0.5454545454545454), ('{The gleeman sings}', 0.5454545454545454), (' Than the head and the handle handsome with jewels;', 0.5384615384615384)]
Does the textfile contain 'States.': False


In [8]:
def _score_line_against_name(letters_and_line):
  """Turn a (letter_set, line) pair into (line, similarity to my_name_set)."""
  letters, text = letters_and_line
  return (text, jaccard_similarity(my_name_set, letters))

# One (line, Jaccard similarity to the name) pair per distinct line.
line_to_name_similarity = letters_to_line_rdd.map(_score_line_against_name)


In [9]:
# takeOrdered() returns the smallest elements by key, so negating the
# similarity yields the 5 most similar lines, best first.
results = line_to_name_similarity.takeOrdered(5, key=lambda line_and_score: -line_and_score[1])

print(results)
# Print only the line text, dropping the scores.
print([text for text, _score in results])

[('and leave her here to drown.”', 0.7272727272727273), ('children are.”', 0.7), ('“Left-hander?”', 0.7), ('“Then lead the way.”', 0.7), ('when the glass thing was withdrawn.', 0.6666666666666666)]
['and leave her here to drown.”', 'children are.”', '“Left-hander?”', '“Then lead the way.”', 'when the glass thing was withdrawn.']
