## 01-pyspark-rdd-wordcount-2.py

In [0]:
# 01-pyspark-rdd-wordcount-2.py
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName('PySparkExamples')
    .getOrCreate()
)

# Five proverbs; each string becomes one record of the RDD.
data = [
    "Better safe than sorry",
    "Blood is thicker than water",
    "All that glitters is not gold",
    "Charity begins at home",
    "Beggars can not be choosers",
]
rdd = spark.sparkContext.parallelize(data)

# collect() returns the records to the driver as a list so they can be printed.
for proverb in rdd.collect():
    print("Proverb ->", proverb)

Proverb -> Better safe than sorry
Proverb -> Blood is thicker than water
Proverb -> All that glitters is not gold
Proverb -> Charity begins at home
Proverb -> Beggars can not be choosers


In [0]:
# flatMap: split every proverb on single spaces and flatten the resulting
# per-proverb word lists into one RDD of individual words.
rdd2 = rdd.flatMap(lambda x: x.split(" "))

# Collect once and reuse the list: each collect() call launches its own
# Spark job, so calling it twice would recompute the same result.
words = rdd2.collect()
print(words)
for element in words:
    print(element)

['Better', 'safe', 'than', 'sorry', 'Blood', 'is', 'thicker', 'than', 'water', 'All', 'that', 'glitters', 'is', 'not', 'gold', 'Charity', 'begins', 'at', 'home', 'Beggars', 'can', 'not', 'be', 'choosers']
Better
safe
than
sorry
Blood
is
thicker
than
water
All
that
glitters
is
not
gold
Charity
begins
at
home
Beggars
can
not
be
choosers


In [0]:
# map: one output record per input record, so each proverb becomes its own
# list of words (unlike flatMap, the lists are not flattened).
rdd3 = rdd.map(lambda x: x.split(" "))

# Collect once and reuse the result; every collect() call runs a separate
# Spark job.
word_lists = rdd3.collect()
print(word_lists)
for element in word_lists:
    print(element)

print("")

# rdd3 is rebound here: from this point on it holds (word, 1) pairs built
# from the flattened words in rdd2, which is what the reduceByKey cell uses.
rdd3 = rdd2.map(lambda x: (x, 1))
pairs = rdd3.collect()
print(pairs)
for element in pairs:
    print(element)

[['Better', 'safe', 'than', 'sorry'], ['Blood', 'is', 'thicker', 'than', 'water'], ['All', 'that', 'glitters', 'is', 'not', 'gold'], ['Charity', 'begins', 'at', 'home'], ['Beggars', 'can', 'not', 'be', 'choosers']]
['Better', 'safe', 'than', 'sorry']
['Blood', 'is', 'thicker', 'than', 'water']
['All', 'that', 'glitters', 'is', 'not', 'gold']
['Charity', 'begins', 'at', 'home']
['Beggars', 'can', 'not', 'be', 'choosers']

[('Better', 1), ('safe', 1), ('than', 1), ('sorry', 1), ('Blood', 1), ('is', 1), ('thicker', 1), ('than', 1), ('water', 1), ('All', 1), ('that', 1), ('glitters', 1), ('is', 1), ('not', 1), ('gold', 1), ('Charity', 1), ('begins', 1), ('at', 1), ('home', 1), ('Beggars', 1), ('can', 1), ('not', 1), ('be', 1), ('choosers', 1)]
('Better', 1)
('safe', 1)
('than', 1)
('sorry', 1)
('Blood', 1)
('is', 1)
('thicker', 1)
('than', 1)
('water', 1)
('All', 1)
('that', 1)
('glitters', 1)
('is', 1)
('not', 1)
('gold', 1)
('Charity', 1)
('begins', 1)
('at', 1)
('home', 1)
('Beggars', 1)
('can', 1)
('not', 1)
('be', 1)
('choosers', 1)

In [0]:
# reduceByKey: add up the 1s emitted for each word, giving (word, count) pairs.
rdd4 = rdd3.reduceByKey(lambda left, right: left + right)

# Print all pairs on a single line, each followed by ", ".
word_counts = rdd4.collect()
for pair in word_counts:
    print(pair, end=", ")

('Better', 1), ('water', 1), ('glitters', 1), ('that', 1), ('choosers', 1), ('than', 2), ('begins', 1), ('at', 1), ('home', 1), ('sorry', 1), ('All', 1), ('safe', 1), ('Charity', 1), ('be', 1), ('is', 2), ('thicker', 1), ('gold', 1), ('Beggars', 1), ('Blood', 1), ('not', 2), ('can', 1), 

In [0]:
# map + sortByKey: flip each (word, count) pair to (count, word) so the count
# becomes the key, then sort the pairs by that key.
swapped = rdd4.map(lambda pair: (pair[1], pair[0]))
rdd5 = swapped.sortByKey()

# Print all pairs on a single line, each followed by ", ".
sorted_pairs = rdd5.collect()
for pair in sorted_pairs:
    print(pair, end=", ")

(1, 'Better'), (1, 'water'), (1, 'glitters'), (1, 'that'), (1, 'choosers'), (1, 'begins'), (1, 'at'), (1, 'home'), (1, 'sorry'), (1, 'All'), (1, 'safe'), (1, 'Charity'), (1, 'be'), (1, 'thicker'), (1, 'gold'), (1, 'Beggars'), (1, 'Blood'), (1, 'can'), (2, 'than'), (2, 'is'), (2, 'not'), 

In [0]:
# filter: keep only the (count, word) pairs whose word contains a lowercase 'a'
# (the substring test is case-sensitive).
rdd6 = rdd5.filter(lambda pair: 'a' in pair[1])

matches = rdd6.collect()
for pair in matches:
    print(pair)

(1, 'water')
(1, 'that')
(1, 'at')
(1, 'safe')
(1, 'Charity')
(1, 'Beggars')
(1, 'can')
(2, 'than')


In [0]:
from pyspark.sql.functions import col, expr
# Sample rows: a 'yyyy-MM-dd' date string and the number of months to add to it.
data = [("2019-01-23", 1), ("2019-06-24", 2), ("2019-09-20", 3)]

# Build the two-column DataFrame once and reuse it for both displays.
df = spark.createDataFrame(data).toDF("date", "increment")
df.show()

# Parse the string column as a date and shift it forward by `increment` months;
# the SQL expression casts `increment` to int before passing it to add_months.
inc_date = expr(
    "add_months(to_date(date, 'yyyy-MM-dd'), cast(increment as int))"
).alias("inc_date")
df.select(col("date"), col("increment"), inc_date).show()

+----------+---------+
|      date|increment|
+----------+---------+
|2019-01-23|        1|
|2019-06-24|        2|
|2019-09-20|        3|
+----------+---------+

+----------+---------+----------+
|      date|increment|  inc_date|
+----------+---------+----------+
|2019-01-23|        1|2019-02-23|
|2019-06-24|        2|2019-08-24|
|2019-09-20|        3|2019-12-20|
+----------+---------+----------+

