In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Basic2')
    .getOrCreate()
)



In [5]:
# Read test2.csv, using its first line as column names and letting Spark infer the column types.
df = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Display the rows loaded from test2.csv as a text table
df.show()

+----+----+----------+------+
|name| age|experience|salary|
+----+----+----------+------+
|   A|   2|         5|1000.0|
|   B|   4|        10| 150.0|
|   C|   6|        20|2000.0|
|   D|null|        24| 123.0|
|   E|   8|      null| 300.2|
|null|   2|      null| 150.0|
+----+----+----------+------+



In [7]:
# People whose salary is less than or equal to 500 (condition given as a string expression;
# note the operator is '<=', so a salary of exactly 500 would also match)
df.filter('salary<=500').show()

+----+----+----------+------+
|name| age|experience|salary|
+----+----+----------+------+
|   B|   4|        10| 150.0|
|   D|null|        24| 123.0|
|   E|   8|      null| 300.2|
|null|   2|      null| 150.0|
+----+----+----------+------+



In [11]:
# Two conditions combined in one string expression: salary at most 500 AND age above 2.
# where() is an alias of filter().
df.where('salary<=500 and age>2').show()

+----+---+----------+------+
|name|age|experience|salary|
+----+---+----------+------+
|   B|  4|        10| 150.0|
|   E|  8|      null| 300.2|
+----+---+----------+------+



In [12]:
# Same kind of filter, but built from a Column expression instead of a string.
salary_below_500 = df['salary'] < 500
df.filter(salary_below_500).show()

+----+----+----------+------+
|name| age|experience|salary|
+----+----+----------+------+
|   B|   4|        10| 150.0|
|   D|null|        24| 123.0|
|   E|   8|      null| 300.2|
|null|   2|      null| 150.0|
+----+----+----------+------+



In [14]:
# Combine Column conditions with `&`; each comparison is built separately so the
# bitwise operator is applied to complete boolean Columns.
salary_below_500 = df['salary'] < 500
age_above_2 = df['age'] > 2
df.filter(salary_below_500 & age_above_2).show()

+----+---+----------+------+
|name|age|experience|salary|
+----+---+----------+------+
|   B|  4|        10| 150.0|
|   E|  8|      null| 300.2|
+----+---+----------+------+



In [16]:
# Negate a Column condition with `~` to keep the rows where salary is NOT below 500.
salary_below_500 = df['salary'] < 500
df.filter(~salary_below_500).show()

+----+---+----------+------+
|name|age|experience|salary|
+----+---+----------+------+
|   A|  2|         5|1000.0|
|   C|  6|        20|2000.0|
+----+---+----------+------+



In [17]:
# Indexing the DataFrame with a boolean Column selects the matching rows,
# giving the same result as the filter() call with the negated condition.
not_below_500 = ~(df['salary'] < 500)
df[not_below_500].show()

+----+---+----------+------+
|name|age|experience|salary|
+----+---+----------+------+
|   A|  2|         5|1000.0|
|   C|  6|        20|2000.0|
+----+---+----------+------+



In [18]:
# Indexing with the un-negated condition: rows whose salary is below 500.
salary_below_500 = df['salary'] < 500
df[salary_below_500].show()

+----+----+----------+------+
|name| age|experience|salary|
+----+----+----------+------+
|   B|   4|        10| 150.0|
|   D|null|        24| 123.0|
|   E|   8|      null| 300.2|
|null|   2|      null| 150.0|
+----+----+----------+------+



# Groupby and Aggregation

1. Group by

In [20]:
# Load test3.csv (header row as column names, types inferred) and inspect the schema.
# Note: this rebinds `df`, replacing the test2.csv DataFrame used in the cells above.
df = spark.read.csv(
    'test3.csv',
    header=True,
    inferSchema=True,
)
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- salary: integer (nullable = true)



In [21]:
# Display the rows loaded from test3.csv as a text table
df.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [25]:
df.groupBy('Name') # On its own this produces no result; it needs to be combined with an aggregation function
df.groupBy('Name').sum() # Note: sum() was called without naming a column; the result contains sum(salary), salary being the only integer column in the schema

DataFrame[Name: string, sum(salary): bigint]

In [24]:
# Total per person; sum() is called without column names and the result holds sum(salary).
totals_by_name = df.groupBy('Name').sum()
totals_by_name.show()

+---------+-----------+
|     Name|sum(salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Sunny|      12000|
|    Krish|      19000|
|   Mahesh|       7000|
+---------+-----------+



In [27]:
# Average salary per department (avg() is the same aggregation as mean()).
df.groupBy('Departments').avg().show()

+------------+-----------+
| Departments|avg(salary)|
+------------+-----------+
|         IOT|     7500.0|
|    Big Data|     3750.0|
|Data Science|    10750.0|
+------------+-----------+



In [28]:
# Number of rows in each department.
rows_per_department = df.groupBy('Departments').count()
rows_per_department.show()

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|    Big Data|    4|
|Data Science|    4|
+------------+-----+



In [45]:
# Dict form of agg(): each key is a column and each value is ONE aggregation name.
# Limitation noted by the author: a dict cannot request two operations on the same
# column (e.g. both sum and mean of Salary), and passing a list such as
# ['sum', 'mean'] or a nested dict as the value does not work.
agg_spec = {
    'Salary': 'sum',
    'Departments': 'count',
}
df.groupBy('Departments').agg(agg_spec).show()

+------------+-----------+------------------+
| Departments|sum(Salary)|count(Departments)|
+------------+-----------+------------------+
|         IOT|      15000|                 2|
|    Big Data|      15000|                 4|
|Data Science|      43000|                 4|
+------------+-----------+------------------+



In [46]:
# Column-function aggregations lift the dict limitation: several operations can be
# applied to the same column (here both the sum and the mean of Salary), and each
# result gets an explicit name via alias().
# `F` is pyspark.sql.functions, imported in the notebook's import cell.
df.groupBy('Departments').agg(
    F.sum('Salary').alias('sumSal'),
    F.mean('Salary').alias('meanSal'),
).show()

+------------+------+-------+
| Departments|sumSal|meanSal|
+------------+------+-------+
|         IOT| 15000| 7500.0|
|    Big Data| 15000| 3750.0|
|Data Science| 43000|10750.0|
+------------+------+-------+

