In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that every later cell goes through.
session = (
    SparkSession.builder
    .appName("mysparkapp")
    .getOrCreate()
)

# Only column names are given here; the column types are left for Spark
# to infer from the Python values in each row.
schema = ['name', 'age', 'dept', 'sal']
data = [
    ('Ashutosh', 25, 'Hr', 2400),
    ('Nikita', 23, 'BA', 6700),
    ('Raju', 45, 'kr', 9089),
]

# Small in-memory employee table used by all the examples below.
df = session.createDataFrame(data, schema=schema)

In [2]:
# Print the rows of the freshly built DataFrame as a text table to check the data loaded as intended.
df.show()

                                                                                

+--------+---+----+----+
|    name|age|dept| sal|
+--------+---+----+----+
|Ashutosh| 25|  Hr|2400|
|  Nikita| 23|  BA|6700|
|    Raju| 45|  kr|9089|
+--------+---+----+----+



In [3]:
# Print each column's name, inferred type and nullability; no types were declared when `df` was created.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- dept: string (nullable = true)
 |-- sal: long (nullable = true)



In [4]:
from pyspark.sql.functions import col,sum

# Grand total of `sal` over the whole frame: an empty groupBy() means
# "one group containing every row", so the result is a single row.
t1 = df.groupBy().agg(sum(col('sal')))
t1.show()

+--------+
|sum(sal)|
+--------+
|   18189|
+--------+



In [6]:
# Salary total per department, exposed under the column name `total`.
t2 = (
    df.groupBy('dept')
      .agg(sum('sal').alias('total'))
)
t2.show()

+----+-----+
|dept|total|
+----+-----+
|  Hr| 2400|
|  BA| 6700|
|  kr| 9089|
+----+-----+



In [7]:
# Keep only the rows whose salary is strictly above 2400, then project
# down to the two columns of interest.
earns_above_2400 = col('sal') > 2400
t3 = df.where(earns_above_2400).select(['name', 'sal'])
t3.show()

+------+----+
|  name| sal|
+------+----+
|Nikita|6700|
|  Raju|9089|
+------+----+



In [8]:
# Derive a `bonus` column from `sal` and append it to the original columns.
# NOTE(review): the factor 10 makes the bonus ten times the salary —
# confirm this is intended rather than 10% (0.10).
bonus_expr = col('sal') * 10
t4 = df.withColumn('bonus', bonus_expr)
t4.show()

+--------+---+----+----+-----+
|    name|age|dept| sal|bonus|
+--------+---+----+----+-----+
|Ashutosh| 25|  Hr|2400|24000|
|  Nikita| 23|  BA|6700|67000|
|    Raju| 45|  kr|9089|90890|
+--------+---+----+----+-----+



In [15]:
from pyspark.sql.functions import count,max
# Count the `sal` entries across the whole frame; the single result
# column is renamed to `count`.
sal_count_expr = count(col('sal')).alias('count')
t5 = df.agg(sal_count_expr)
t5.show()

+-----+
|count|
+-----+
|    3|
+-----+



In [17]:
# Highest salary in the frame. `max` here must be the Spark aggregate
# imported from pyspark.sql.functions, not Python's builtin.
highest_sal_expr = max(col('sal'))
t6 = df.agg(highest_sal_expr)
t6.show()

+--------+
|max(sal)|
+--------+
|    9089|
+--------+



In [None]:
# Query the DataFrame through Spark SQL. The string is NOT an f-string:
# the `{df}` placeholder is handed to Spark as-is and resolved from the
# `df=` keyword argument.
# NOTE(review): this cell has no execution count and its saved output is a
# max(sal) table rather than every row of `df` — re-run to refresh it.
select_all_query = "select * from {df}"
sqldf = session.sql(sqlQuery=select_all_query, df=df)
sqldf.show()

+--------+
|max(sal)|
+--------+
|    9089|
+--------+



In [36]:
# Shut the Spark session down now that the notebook's work is finished.
session.stop()