In [1]:
!pip install findspark
!pip install pyspark
import findspark
import pyspark
from pyspark.sql.functions import *
from pyspark.sql import *
spark = SparkSession.builder.getOrCreate() 
findspark.init()
sc = pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)



#### DataFrame creation

In [2]:
# Calling Row with bare field names yields a reusable factory: invoking
# Employee(...) with positional values later produces a Row with these fields.
Employee = Row(
    "fisrtname",  # (sic) spelling is relied on by the later selectExpr("Emp.fisrtname", ...)
    "lastname",
    "email",
    "salary",
)

In [3]:
# Echo the Row factory to confirm the field names it was declared with.
Employee

<Row(fisrtname, lastname, email, salary)>

In [4]:
# Build five employee rows through the Employee factory. Values are positional:
# first name, last name, email, salary. Muriel has no last name, hence None.
employee1, employee2, employee3, employee4, employee5 = (
    Employee(*fields)
    for fields in (
        ('Basher', 'armbrust', 'bash@edureka.co', 100000),
        ('Daniel', 'meng', 'daniel@stanford.edu', 120000),
        ('Muriel', None, 'muriel@waterloo.edu', 140000),
        ('Rachel', 'wendell', 'rach_3@edureka.co', 160000),
        ('Zach', 'galifianakis', 'zach_g@edureka.co', 160000),
    )
)

In [5]:
# Spot-check one row: each positional value should be paired with its field name.
print(employee4)

Row(fisrtname='Rachel', lastname='wendell', email='rach_3@edureka.co', salary=160000)


In [6]:
# Department rows are built with keyword arguments, so the field names are given
# inline here instead of going through a Row factory as Employee does.
department1, department2, department3, department4 = (
    Row(id=dept_id, name=dept_name)
    for dept_id, dept_name in (
        ('123456', 'HR'),
        ('789012', 'OPS'),
        ('345678', 'FN'),
        ('901234', 'DEV'),
    )
)

In [7]:
# Spot-check one department row (id and name are both stored as strings).
print(department4)

Row(id='901234', name='DEV')


In [8]:
# Factory for a nested record: one department Row plus a list of employee Rows.
departmentWithEmployee = Row("department", "Employees")

# Pair each department with its staff. An employee may belong to more than one
# department (e.g. employee2 is listed under both HR and OPS).
(
    departmentWithEmployee_1,
    departmentWithEmployee_2,
    departmentWithEmployee_3,
    departmentWithEmployee_4,
) = (
    departmentWithEmployee(dept, staff)
    for dept, staff in (
        (department1, [employee1, employee2]),
        (department2, [employee2, employee3, employee4]),
        (department3, [employee4, employee1]),
        (department4, [employee3]),
    )
)

In [9]:
# Show the nesting: a department Row alongside a list of employee Rows.
print(departmentWithEmployee_2)

Row(department=Row(id='789012', name='OPS'), Employees=[Row(fisrtname='Daniel', lastname='meng', email='daniel@stanford.edu', salary=120000), Row(fisrtname='Muriel', lastname=None, email='muriel@waterloo.edu', salary=140000), Row(fisrtname='Rachel', lastname='wendell', email='rach_3@edureka.co', salary=160000)])


In [10]:
# df1 is built from the HR and OPS records; Spark infers the nested schema
# from the Row objects themselves.
departmentWithEmployee_df = [departmentWithEmployee_1, departmentWithEmployee_2]
df1 = spark.createDataFrame(data=departmentWithEmployee_df)

# The same list name is rebound for df2 (OPS and DEV). The OPS record is shared
# with df1 on purpose, which gives the later union/intersect cells an overlap.
departmentWithEmployee_df = [departmentWithEmployee_2, departmentWithEmployee_4]
df2 = spark.createDataFrame(data=departmentWithEmployee_df)

In [11]:
# Print df1 as a text table; long cell values (the Employees array) are truncated.
df1.show()

+-------------+--------------------+
|   department|           Employees|
+-------------+--------------------+
| [123456, HR]|[[Basher, armbrus...|
|[789012, OPS]|[[Daniel, meng, d...|
+-------------+--------------------+



In [12]:
# In this environment display() on a Spark DataFrame renders only its schema
# repr (column names and types), not the rows; use .show() to see data.
display(df2)

DataFrame[department: struct<id:string,name:string>, Employees: array<struct<fisrtname:string,lastname:string,email:string,salary:bigint>>]

In [13]:
# Print df2's rows; the empty slot in the DEV row is Muriel's missing last name.
df2.show()

+-------------+--------------------+
|   department|           Employees|
+-------------+--------------------+
|[789012, OPS]|[[Daniel, meng, d...|
|[901234, DEV]|[[Muriel,, muriel...|
+-------------+--------------------+



In [14]:
# collect() returns every row to the driver as a list of Row objects. That is
# fine for this two-row frame; avoid it on large data.
df1.collect()

[Row(department=Row(id='123456', name='HR'), Employees=[Row(fisrtname='Basher', lastname='armbrust', email='bash@edureka.co', salary=100000), Row(fisrtname='Daniel', lastname='meng', email='daniel@stanford.edu', salary=120000)]),
 Row(department=Row(id='789012', name='OPS'), Employees=[Row(fisrtname='Daniel', lastname='meng', email='daniel@stanford.edu', salary=120000), Row(fisrtname='Muriel', lastname=None, email='muriel@waterloo.edu', salary=140000), Row(fisrtname='Rachel', lastname='wendell', email='rach_3@edureka.co', salary=160000)])]

In [15]:
# union appends df2's rows to df1's and does not de-duplicate, so the OPS
# department (present in both inputs) shows up twice in the result.
unionDf = df1.union(df2)

unionDf.show()

+-------------+--------------------+
|   department|           Employees|
+-------------+--------------------+
| [123456, HR]|[[Basher, armbrus...|
|[789012, OPS]|[[Daniel, meng, d...|
|[789012, OPS]|[[Daniel, meng, d...|
|[901234, DEV]|[[Muriel,, muriel...|
+-------------+--------------------+



In [None]:
# intersect keeps only rows that occur in both frames. df1 and df2 were both
# built with departmentWithEmployee_2, so the OPS row is the expected overlap.
intersectionDf = df1.intersect(df2)

intersectionDf.show()

In [None]:
# Project just the Employees column: one array of employee structs per row.
unionDf.select('Employees').show()

In [None]:
# explode turns each element of the Employees array into its own output row.
# The resulting struct column is aliased to 'Emp' so its fields can be
# addressed as Emp.<field> afterwards.
explodeDf = unionDf.select(
    explode(col('Employees')).alias('Emp')
)

In [None]:
# One row per employee struct now, instead of one row per department.
explodeDf.show()

In [None]:
# selectExpr is select() taking SQL expression strings. Here it lifts each field
# of the Emp struct into a top-level column, flattening the nested record.
flatDf = explodeDf.selectExpr(
    "Emp.fisrtname",  # (sic) must match the field name declared on the Employee Row
    "Emp.lastname",
    "Emp.email",
    "Emp.salary",
)

In [None]:
# Flat table: one row per employee occurrence with four scalar columns.
flatDf.show()

#### Reading Data from CSV file and performing actions

In [None]:
# Load the housing data set from a CSV in the working directory.
#   header=True      -> the first line supplies the column names
#   inferSchema=True -> Spark infers column types from the data instead of
#                       reading every column as a string
house_df = spark.read.csv(
    "house_data.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Print the first 3 rows as a text table (a quick look at the loaded data).
house_df.show(3)

In [None]:
# take(3) returns the first 3 rows to the driver as a list of Row objects,
# whereas show(3) only prints them.
house_df.take(3)

In [None]:
# Print the column names and the types of the DataFrame as a tree.
house_df.printSchema() 

In [None]:
# Column names as a plain Python list.
house_df.columns  

In [None]:
# Number of rows; count() is an action, so it triggers a Spark job.
house_df.count()

In [None]:
# Number of columns; computed from the column-name list, no Spark job needed.
len(house_df.columns)

In [None]:
# Summary statistics (count, mean, stddev, min, max) for the num_rooms column.
house_df.describe('num_rooms').show()

In [None]:
# Confirm that the reader returned a Spark DataFrame.
type(house_df)

In [None]:
# Project only the lat and long columns and print the first 5 rows.
house_df.select('lat','long').show(5)

In [None]:
# Rows whose price is exactly 999000.
house_df.filter(col('price') == 999000).show()

In [None]:
# Houses built in either 2010 or 2018 (first 3 matches). isin() expresses the
# same "equals any of these values" test as chaining == comparisons with |.
house_df.filter(col('build_year').isin(2010, 2018)).show(3)

In [None]:
# Rows where num_rooms is missing; filter() and where() are aliases.
house_df.filter(col('num_rooms').isNull()).show()

In [None]:
# Build years of the houses that have exactly 4 rooms.
# Filter first, then project: the predicate needs num_rooms, so it must run
# while that column is still part of the frame. The previous order selected
# build_year first and then filtered on a column the select had dropped, which
# only works when Spark can recover the missing column from the parent plan.
house_df.where(col('num_rooms') == 4).select(col('build_year')).show()

In [None]:
# First 10 values of the num_rooms column.
house_df.select('num_rooms').show(10)

In [None]:
# The distinct values that occur in num_rooms.
house_df.select('num_rooms').distinct().show()

In [None]:
# How many houses have exactly 7 rooms.
house_df.filter(col('num_rooms') == 7).count()

In [None]:
# Houses with 3 to 6 rooms; between() is inclusive on both ends, matching
# the >= 3 and <= 6 bounds.
house_df.filter(house_df.num_rooms.between(3, 6)).show(5)

In [None]:
# orderBy sorts ascending unless told otherwise; here the sort is descending,
# so the 5 rows shown are the houses with the most rooms.
house_df.orderBy(house_df.num_rooms.desc()).show(5)

In [None]:
# Among houses built in 2015, show the 7 with the most rooms.
(
    house_df
    .filter(house_df.build_year == 2015)
    .orderBy(house_df.num_rooms.desc())
    .show(7)
)

In [None]:
# Same sort key without an explicit direction: ascending is the default, so
# these are the 5 rows with the fewest rooms.
house_df.orderBy(house_df.num_rooms).show(5)

In [None]:
# Number of houses built in 2015.
house_df.filter(house_df.build_year == 2015).count()

In [None]:
# Number of houses per build year: group on the column, then count each group.
house_df.groupBy('build_year').count().show()

In [None]:
# Houses per (build_year, num_rooms) combination, newest build years first.
(
    house_df
    .groupBy(col('build_year'), col('num_rooms'))
    .count()
    .orderBy(col('build_year').desc())
    .show(5)
)

In [None]:
# Sum of population_in_hectare over the houses built in 2020. The dict form of
# agg() maps a column name to the name of the aggregate function to apply.
house_df.filter(col('build_year')==2020).agg({'population_in_hectare':'sum'}).show()

#### Performing SQL Queries

In [None]:
# Expose the DataFrame to Spark SQL under the name 'house_table'.
# createOrReplaceTempView replaces the deprecated registerTempTable. The view is
# scoped to this SparkSession, and re-running the cell simply replaces it.
house_df.createOrReplaceTempView('house_table')

In [None]:
# Run SQL against the 'house_table' temp view through the SparkSession (the
# session that created house_df), not the deprecated SQLContext wrapper.
spark.sql('select * from house_table').show(5)

In [None]:
# Distinct room counts via SQL, issued through the SparkSession rather than the
# deprecated SQLContext.
spark.sql('select distinct(num_rooms) from house_table').show()

In [None]:
# Most recent build year in the data set, queried through the SparkSession
# rather than the deprecated SQLContext.
spark.sql('select max(build_year) from house_table').show()