## Session Initialization

In [1]:
from pathlib import Path
import pyspark
from pyspark import SparkContext

# Obtain the SparkContext and SparkSession through getOrCreate() so this cell
# is safe to re-run: a bare SparkContext() raises
# "Cannot run multiple SparkContexts at once" when a context already exists
# in the kernel. The session builder reuses the context obtained on the
# previous line instead of constructing a second one.
sc = SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession.builder.getOrCreate()

## Load data into DataFrame

In [2]:
# Location of the chapter listing: <parent of working dir>/Files/bookcontents.csv
path_1 = Path.cwd().parent.joinpath("Files", "bookcontents.csv")
# Keep a plain-string form of the path for the CSV reader.
book = f"{path_1}"

In [3]:
# Read the chapter CSV into a DataFrame: the first line supplies the column
# names and Spark infers each column's type from the data.
bookChaptersDF = spark.read.csv(book, header=True, inferSchema=True)
bookChaptersDF.show()

+-------+--------------------+----+
|Chapter|                Name|Page|
+-------+--------------------+----+
|      1|        Introduction|  11|
|      2|Basic Engineering...|  19|
|      3|Advanced Engineer...|  28|
|      4|     Hands On Course|  60|
|      5|        Case Studies|  62|
|      6|Best Practices Cl...|  73|
|      7|130+ Data Sources...|  77|
|      8|1001 Interview Qu...|  82|
|      9|Recommended Books...|  87|
+-------+--------------------+----+



## Create RDD

In [4]:
# Expose the chapters DataFrame as an RDD through its .rdd attribute;
# each element is a Row carrying the Chapter, Name and Page fields.
bookRDD = bookChaptersDF.rdd

In [5]:
# Peek at the first five rows of the RDD
first_chapter_rows = bookRDD.take(5)
for row in first_chapter_rows:
    print(row)

Row(Chapter=1, Name='Introduction', Page=11)
Row(Chapter=2, Name='Basic Engineering Skills', Page=19)
Row(Chapter=3, Name='Advanced Engineering Skills', Page=28)
Row(Chapter=4, Name='Hands On Course', Page=60)
Row(Chapter=5, Name='Case Studies', Page=62)


In [6]:
# Pull every row of the RDD back to the driver and print it
all_chapter_rows = bookRDD.collect()
for row in all_chapter_rows:
    print(row)

Row(Chapter=1, Name='Introduction', Page=11)
Row(Chapter=2, Name='Basic Engineering Skills', Page=19)
Row(Chapter=3, Name='Advanced Engineering Skills', Page=28)
Row(Chapter=4, Name='Hands On Course', Page=60)
Row(Chapter=5, Name='Case Studies', Page=62)
Row(Chapter=6, Name='Best Practices Cloud Platforms', Page=73)
Row(Chapter=7, Name='130+ Data Sources Data Science', Page=77)
Row(Chapter=8, Name='1001 Interview Questions', Page=82)
Row(Chapter=9, Name='Recommended Books and Courses', Page=87)


## Modify RDD to create compound column

In [7]:
# Build (chapter, "page/name") pairs from the chapter rows
def _chapter_with_compound(row):
    """Return (first field, "<third field>/<second field>") for one row."""
    chapter, name, page = row[0], row[1], row[2]
    return (chapter, str(page) + "/" + name)


splitRDD = bookRDD.map(_chapter_with_compound)

In [8]:
# Print every (chapter, compound) pair
compound_pairs = splitRDD.collect()
for row in compound_pairs:
    print(row)

(1, '11/Introduction')
(2, '19/Basic Engineering Skills')
(3, '28/Advanced Engineering Skills')
(4, '60/Hands On Course')
(5, '62/Case Studies')
(6, '73/Best Practices Cloud Platforms')
(7, '77/130+ Data Sources Data Science')
(8, '82/1001 Interview Questions')
(9, '87/Recommended Books and Courses')


## Turn RDD back to DataFrame

In [9]:
# Create schema for DataFrame
# Import only the type classes used here rather than `import *`, so the
# notebook namespace is not flooded with every name in pyspark.sql.types.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Two columns, matching the (chapter, "page/name") tuples held in splitRDD.
compoundSchema = StructType([
    StructField("Chapter", IntegerType()),
    StructField("Compound", StringType()),
])

In [10]:
# Turn the (chapter, compound) RDD into a DataFrame using the explicit schema
compoundDF = spark.createDataFrame(splitRDD, schema=compoundSchema)

In [11]:
# Print the compound DataFrame as a text table (long strings are shown truncated).
compoundDF.show()

+-------+--------------------+
|Chapter|            Compound|
+-------+--------------------+
|      1|     11/Introduction|
|      2|19/Basic Engineer...|
|      3|28/Advanced Engin...|
|      4|  60/Hands On Course|
|      5|     62/Case Studies|
|      6|73/Best Practices...|
|      7|77/130+ Data Sour...|
|      8|82/1001 Interview...|
|      9|87/Recommended Bo...|
+-------+--------------------+



## Counting words

In [12]:
# Location of the section titles: <parent of working dir>/Files/sections_wordcount.csv
path_2 = Path.cwd().parent.joinpath("Files", "sections_wordcount.csv")
# Keep a plain-string form of the path for the text-file reader.
section = f"{path_2}"

In [13]:
# Read the sections file into an RDD of raw text lines; each element is one
# unparsed line such as "1,1.1,What is this Cookbook".
sectionsRDD = sc.textFile(section)

In [14]:
# Peek at the first five raw lines
section_lines_preview = sectionsRDD.take(5)
for row in section_lines_preview:
    print(row)

1,1.1,What is this Cookbook
1,1.2,Data Engineer vs Data Scientist
1,1.3,My Data Science Platform Blueprint
1,1.4,Who Companies Need
2,2.1,Learn To Code


In [15]:
# Split each row into its columns
def _parse_section_line(line):
    """Parse one CSV line into a list of column strings.

    The standard ``csv`` reader is used instead of ``str.split(",")`` so that
    a quoted field which itself contains a comma stays in a single column.
    For simple unquoted lines the result is the same list ``split(",")``
    would produce.
    """
    import csv  # imported here so the function is self-contained when shipped to workers

    # A single-line input yields exactly one parsed row; fall back to an
    # empty list rather than letting StopIteration escape into Spark.
    return next(csv.reader([line]), [])


playRDD = sectionsRDD.map(_parse_section_line)

In [16]:
# Peek at the first five split rows
split_preview = playRDD.take(5)
for row in split_preview:
    print(row)

['1', '1.1', 'What is this Cookbook']
['1', '1.2', 'Data Engineer vs Data Scientist']
['1', '1.3', 'My Data Science Platform Blueprint']
['1', '1.4', 'Who Companies Need']
['2', '2.1', 'Learn To Code']


In [17]:
# Keep only the third column of each row (the section title text)
def _third_column(fields):
    """Return the element at index 2 of a split row."""
    return fields[2]


selecttextRDD = playRDD.map(_third_column)

In [18]:
# Peek at the first five titles
title_preview = selecttextRDD.take(5)
for row in title_preview:
    print(row)

What is this Cookbook
Data Engineer vs Data Scientist
My Data Science Platform Blueprint
Who Companies Need
Learn To Code


In [19]:
# Flatten the titles into individual words (split on single spaces) ...
wordsRDD = selecttextRDD.flatMap(lambda title: title.split(" "))
# ... and pair every word with an initial count of 1
flatRDD = wordsRDD.map(lambda token: (token, 1))

In [20]:
# Peek at the first five (word, 1) pairs
pair_preview = flatRDD.take(5)
for row in pair_preview:
    print(row)

('What', 1)
('is', 1)
('this', 1)
('Cookbook', 1)
('Data', 1)


In [21]:
# Sum the per-word counts, then order the result by the word itself
countsRDD = flatRDD.reduceByKey(lambda left, right: left + right)
reducedRDD = countsRDD.sortByKey()

In [22]:
# Peek at the first twenty (word, count) pairs
count_preview = reducedRDD.take(20)
for row in count_preview:
    print(row)

('(AWS)', 1)
('(GCP)', 1)
('A', 2)
('API', 1)
('About', 1)
('Academic', 1)
('Agile', 1)
('Airbnb', 1)
('Amazon', 2)
('And', 6)
('Apache', 3)
('Articles', 1)
('Azure', 1)
('BMW', 1)
('Baidu', 1)
('Blackrock', 1)
('Blog', 1)
('Blueprint', 1)
('Booking.com', 1)
('Books', 2)


## Turn RDD back into DataFrame

In [23]:
# Create schema for Wordcount DataFrame
# Import only the type classes used here rather than `import *`, so the
# notebook namespace is not flooded with every name in pyspark.sql.types.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Two columns, matching the (word, count) tuples held in reducedRDD.
wordcountSchema = StructType([
    StructField("Word", StringType()),
    StructField("Count", IntegerType()),
])

In [24]:
# Turn the (word, count) RDD into a DataFrame using the explicit schema, then display it
wordcountDF = spark.createDataFrame(reducedRDD, schema=wordcountSchema)
wordcountDF.show()

+-----------+-----+
|       Word|Count|
+-----------+-----+
|      (AWS)|    1|
|      (GCP)|    1|
|          A|    2|
|        API|    1|
|      About|    1|
|   Academic|    1|
|      Agile|    1|
|     Airbnb|    1|
|     Amazon|    2|
|        And|    6|
|     Apache|    3|
|   Articles|    1|
|      Azure|    1|
|        BMW|    1|
|      Baidu|    1|
|  Blackrock|    1|
|       Blog|    1|
|  Blueprint|    1|
|Booking.com|    1|
|      Books|    2|
+-----------+-----+
only showing top 20 rows



In [25]:
# Show the words ordered by the Count column, largest count first
count_descending = wordcountDF["Count"].desc()
wordcountDF.orderBy(count_descending).show()

+-----------+-----+
|       Word|Count|
+-----------+-----+
|       Data|   48|
|    Science|   40|
|        And|    6|
|   Platform|    3|
| Processing|    3|
|     Apache|    3|
|        and|    3|
|       What|    2|
|     Amazon|    2|
|      Books|    2|
|    Courses|    2|
|         to|    2|
|       Nifi|    2|
|    Twitter|    2|
|   Security|    2|
|         To|    2|
|      Learn|    2|
|      Cloud|    2|
|          A|    2|
|Development|    2|
+-----------+-----+
only showing top 20 rows

