In [None]:
# Last amended: 26th August, 2020
# My folder: /home/ashok/Documents/spark/1.basics
# Ref:
# Databricks:
#     https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html
# Medium
#     https://towardsdatascience.com/pyspark-and-sparksql-basics-6cb4bf967e53
# Github
#     https://github.com/pinarersoy/PySpark_SparkSQL_MLib/blob/master/PySpark%20and%20SparkSQL.ipynb    

In [None]:
# 1.0 Call libraries
from pyspark.sql import *

### More memory to Spark

In [None]:
# Display the SparkSession object. To check the memory allocated,
# click on the Spark_UI hyperlink shown in the output.
# See :  https://stackoverflow.com/a/62737941
#
# NOTE(review): 'spark' is not created anywhere in this notebook;
# presumably it is pre-defined by the pyspark shell that launches
# the notebook -- confirm for your setup.

# To allocate more memory to spark, start 
# this notebook as (and not as pysparknb):
# 
#     pyspark --driver-memory 3g 


spark

In [None]:
# 1.1 Display outputs from multiple commands
#     With "all", every expression statement in a cell displays its
#     value (IPython's default shows only the last expression).
#     Cells below that list several bare expressions rely on this.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# 1.2 Before using Row class, have a look at its help
#     (Row comes from the 'from pyspark.sql import *' in cell 1.0)
help(Row)

In [None]:
# 1.3 Row class
# Refer: https://spark.apache.org/docs/1.1.1/api/python/pyspark.sql.Row-class.html
# Build one standalone Row per student using keyword arguments
# (field names are written without quotes). The Row objects are
# independent of one another; no DataFrame exists at this point.
# None stands for a missing value.
student1  = Row(rollno='001', name='rajeev',  age=20,   income=80)
student2  = Row(rollno='002', name='divakar', age=40,   income=None)   # Missing value
student3  = Row(rollno='003', name='smitha',  age=80,   income=70)
student4  = Row(rollno=None,  name='murali',  age=20,   income=80)     # Missing value
student5  = Row(rollno='005', name='mist',    age=40,   income=90)
student6  = Row(rollno='006', name='ragini',  age=80,   income=None)   # Missing value
student7  = Row(rollno='007', name='ravi',    age=None, income=None)   # Missing values
student8  = Row(rollno='008', name='pujari',  age=47,   income=30)
student9  = Row(rollno='009', name='puneet',  age=47,   income=60)
student10 = Row(rollno='010', name='gautam',  age=47,   income=55)
student11 = Row(rollno='011', name='fauji',   age=36,   income=21)
# student12 repeats rollno '010' / name 'gautam' of student10,
# but with a different age and income
student12 = Row(rollno='010', name='gautam',  age=36,   income=76)
student13 = Row(rollno='012', name='major',   age=36,   income=32)
student14 = Row(rollno='013', name=None,      age=66,   income=44)     # Missing value
student15 = Row(rollno='014', name='rehman',  age=66,   income=43)

In [None]:
# 1.4 Row class
# Row("field1", "field2") called with quoted field names and no
# '=' sign gives a reusable row-maker; calling that maker with
# positional values produces a Row. Table of disciplines.
disc = Row("disc_name", "faculty")
dept1, dept2, dept3 = (
    disc(discipline, head)
    for discipline, head in [
        ('QT', 'Dr. Vibhu'),
        ('IT', 'Dr. Nath'),
        ('HR', 'Dr. Siva'),
    ]
)

In [1]:
# 1.4.1 Rows can be nested: every section Row carries a list of
#       student Rows and a list of department Rows.
#       Row, student1..student11 and dept1..dept3 must already be
#       defined (cells 1.0, 1.3 and 1.4) before this cell is run.
section1 = Row(
    students=[student1, student2, student3, student7, student8, student9],
    depts=[dept1, dept2]
)
section2 = Row(
    students=[student1, student3, student4, student5, student6, student10],
    depts=[dept2, dept3]
)
section3 = Row(
    students=[student2, student4, student6, student8, student10, student11],
    depts=[dept1, dept3]
)

NameError: name 'Row' is not defined

In [None]:
# 1.5 Extract data from Rows
#     Syntax: rowname[key]  or  rowname.key
#     Every expression below is displayed on its own because
#     ast_node_interactivity was set to "all" in 1.1
section1                        # the complete nested Row
section2.students               # field access by attribute
section3['students']            # field access by key
section3['students'][0].name    # 'name' field of the first student Row of section3
type(student1)                  # class of a Row object

### Create dataframe with list of rows

In [None]:
# 2.0 Method createDataFrame
#help(spark.createDataFrame)

In [None]:
# 2.1 Hand a list of Row objects (student1 .. student11)
#     directly to createDataFrame to get a spark DataFrame
students1 = spark.createDataFrame(
    [student1, student2, student3, student4, student5, student6,
     student7, student8, student9, student10, student11]
)

In [None]:
# 2.1.1 A second, smaller DataFrame built from student13 .. student15
students2 = spark.createDataFrame([student13, student14, student15])


In [None]:
# 2.2 Class of the object returned by createDataFrame
type(students1)
# 2.3 Print the contents of both student DataFrames
students1.show()
students2.show()

In [None]:
# 2.4 Two more DataFrames:
#     departments from dept1 and dept2,
#     sections from the three nested section Rows
departments = spark.createDataFrame([dept1, dept2])
sections = spark.createDataFrame([section1, section2, section3])

In [None]:
# 2.4.1 Print the departments and sections DataFrames
departments.show()
sections.show()

In [None]:
# 3.0 Creating DataFrame from dictionary
#     But all dicts must be collected in a list.
#     Here each dict is one record with keys 'a' and 'b'.
#     The result is not assigned; it is only displayed.

spark.createDataFrame([{ 'a' : 1, 'b' : 3},
                       { 'a': 4, 'b' : 7}])

In [None]:
# 3.1 This creates a DataFrame. Schema can be inferred.
#     The list holds a single dict whose values are lists.
spark.createDataFrame([{ 'a': [1,4], 'b' : [3,7]}])
# 3.1.1 This does not create a DataFrame. Schema cannot be inferred
#       (kept commented out so that the cell runs without error)
# spark.createDataFrame([1,2,3])


In [None]:
# 3.2 This also does not create dataframe. 
#      Schema cannot be inferred
# spark.createDataFrame({ 'a': [1,4], 'b' : [3,7]})


In [None]:
# 4.0
# Stack vertically spark dataframes. Use Union
# The rows of students2 are appended to the rows of students1.
# NOTE: per the PySpark docs, union() resolves columns by position,
#       not by name; both frames here are built from Rows created
#       with the same keyword arguments (see 1.3).
students = students1.union(students2)
students.show()

In [None]:
# 4.1
# Ref: Spark sql functions:
#       https://spark.apache.org/docs/latest/api/sql/index.html
from pyspark.sql.functions import explode

In [None]:
# 4.1.1 Apply explode() to the 'students' column of sections and
#       name the resulting column 'e'; then print the first 2 rows
#       with column truncation switched off
explodeDF = sections.select(
    explode("students").alias("e")
)
explodeDF.show(n=2, truncate=False)

In [None]:
# 4.1.2 Same select as in the previous cell (explodeDF is rebuilt),
#       followed by a few inspections
explodeDF = sections.select(explode("students").alias("e"))
explodeDF.show()                # show() with its default arguments
type(explodeDF)                 # class of the object returned by select()
explodeDF.select("e").show()    # select the column 'e' by name


In [None]:
# 4.2 selectExpr takes 
#     'SQL expressions as a string'.
#     Print its help text:
help(explodeDF.selectExpr)

In [None]:
# 4.3 Pull the fields of column 'e' out into separate columns
#     flattenDF  : the four fields as they are
#     flattenDF1 : age and income given as SQL arithmetic expressions
explodeDF.show(2)
flattenDF = explodeDF.selectExpr(
    "e.rollno", "e.name", "e.age", "e.income"
)
flattenDF1 = explodeDF.selectExpr(
    "e.rollno", "e.name", "e.age /60 ", "e.income/100 "
)
flattenDF.show()
flattenDF1.show()

In [None]:
# 4.4 Another explode() example
#     Each Row has a list of mobile numbers and a name
ashok = Row(
    mobile=["8750", "9876", "3423", "2323"],
    name="ashok"
)
pradeep = Row(
    mobile=["18750", "19876", "13423", "12323"],
    name="pradeep"
)
first = spark.createDataFrame([ashok, pradeep])
first.show()
# explode the 'mobile' list column and print the result
first.select(explode("mobile")).show()

In [None]:
# 5.0 Filter conditions must be within inverted commas
#     (the whole condition is passed to filter() as one string)
# AND: both conditions must hold. With the data of 1.3 this prints
#      an empty table, because ragini's age is 80 and not 20.
flattenDF.filter("age == 20 and name = 'ragini'").show()
# OR: it is enough that one of the conditions holds
flattenDF.filter("age == 20 OR name = 'pujari'").show()
# Rows whose age is different from 20
flattenDF.filter("age != 20 ").show()

In [None]:
# 5.1 Another way to filter: index the DataFrame with a boolean
#     column expression. Combine conditions with & (and) / | (or);
#     each condition must sit inside its own parentheses.
flattenDF[
    (flattenDF["age"] == 20) & (flattenDF["name"] == 'rajeev')
].show()
flattenDF[
    (flattenDF["age"] == 20) | (flattenDF["name"] == 'pujari')
].show()

In [None]:
# 6.0 Using col function
from pyspark.sql.functions import col

In [None]:
# 6.0.1 Have a look at the help of col()
help(col)

In [None]:
# 6.1 Another way to write filter condition:
#     col("age") refers to the column by its name
flattenDF.filter(col("age") == 20).show()

In [None]:
# 6.2 Use of 'where' is the same as 'filter'
flattenDF.where(col("name") == "ragini").show()

In [None]:
# 6.3 Check where age is null
#     (the test is written as col("age").isNull())
flattenDF.filter(col("age").isNull()).show()

In [None]:
## 7.0 Data aggregation
# 7.1  Aggregation on all data
#      The dict maps column name -> aggregate function name:
#      sum of age and mean of income over the whole DataFrame
flattenDF.agg({"age": "sum", "income": "mean"}).show()

In [None]:
# 7.1.1 Aggregation per group
#       Keep only age and income, group by age, and then
#       print the mean income of every age group
grouped = flattenDF.select("age", "income").groupBy("age")
grouped.agg({"income" : "mean"}).show()

In [None]:
# 7.2 Some aggregation functions
#     'grouped' (from the previous cell) is grouped by age, so both
#     calls below aggregate the grouping column within its own group.
from pyspark.sql.functions import countDistinct, avg
grouped.agg(countDistinct("age")).show()    # distinct ages per age group
grouped.agg(avg("age")).show()              # average age per age group

In [None]:
# 7.3 Dataframe summary
#     First without arguments, then for column 'rollno' only
flattenDF.describe().show()
flattenDF.describe("rollno").show()

In [None]:
# 8.0 An easy way to write table schema:
#     Copy the following small file (actors.csv) 
#     to hdfs.
#     NOTE: the triple-quoted blocks below are plain Python string
#     literals. Nothing in them is executed by this cell; the hdfs
#     commands have to be run in a terminal.

"""

hdfs dfs -rm -r -f /user/ashok/data_files/actors
hdfs dfs -mkdir -p /user/ashok/data_files/actors
hdfs dfs -put /home/ashok/Documents/spark/1.basics/actors.csv  /user/ashok/data_files/actors
hdfs dfs -cat /user/ashok/data_files/actors/actors.csv


"""
# 8.1 File contents are:
#     (five comma-separated fields per line, no header line)

"""
salman,khan,9899762309,56,01-23-2001
kareena,khan,8995634675,45,12-31-2012
"""


In [None]:
# 8.2 Read the csv file
#     For date-pattern symbols (ie m, M, y, HH etc) see:
#     https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html

# 8.2.1
from pyspark.sql import *

# 8.2.2 hdfs folder that holds the file
URL_of_file = "hdfs://localhost:9000/user/ashok/data_files/actors/"

# 8.2.3 The schema is written as a single string of
#       'column-name type' pairs separated by commas
df = spark.read.csv(
    path=URL_of_file + "actors.csv",
    header=False,                 # the file has no header line (see 8.1)
    sep=",",
    schema="fname string, lname string, phone string, age double, travel date",
    dateFormat="MM-dd-yyyy"       # pattern of the 'travel' field, e.g. 01-23-2001
)

# 8.2.4 Data, (name, type) pairs and the full schema object
df.show()
df.dtypes
df.schema

In [None]:
################# I am done #################