Today's Topics
----------------
1. How to find unique rows?
2. How to drop duplicate rows?
3. How to sort the data in ascending and descending order?
4. One simple PySpark practice question.

In [None]:
# Sample employee records as (id, Name, sal, mgr_id) tuples.
# Ids 15, 13 and 18 each appear twice as exact repeats. Id 14 also appears
# twice, but with different salaries (80000 vs 90000), so those two rows are
# not full-row duplicates.
data = [
    (10, 'Anil', 50000, 18),
    (11, 'Vikas', 75000, 16),
    (12, 'Nisha', 40000, 18),
    (13, 'Nidhi', 60000, 17),
    (14, 'Priya', 80000, 18),
    (15, 'Mohit', 45000, 18),
    (16, 'Rajesh', 90000, 10),
    (17, 'Raman', 55000, 16),
    (18, 'Sam', 65000, 17),
    (15, 'Mohit', 45000, 18),
    (13, 'Nidhi', 60000, 17),
    (14, 'Priya', 90000, 18),
    (18, 'Sam', 65000, 17),
]

# Column names, in the same order as the tuple fields above.
schema = ['id', 'Name', 'sal', 'mgr_id']

# Build the DataFrame used by the distinct / drop-duplicates / sort cells.
manager_df = spark.createDataFrame(data, schema=schema)


In [None]:
# Print the DataFrame; truncate=False shows full cell values instead of cutting long strings
manager_df.show(truncate=False)

+---+------+-----+------+
|id |Name  |sal  |mgr_id|
+---+------+-----+------+
|10 |Anil  |50000|18    |
|11 |Vikas |75000|16    |
|12 |Nisha |40000|18    |
|13 |Nidhi |60000|17    |
|14 |Priya |80000|18    |
|15 |Mohit |45000|18    |
|16 |Rajesh|90000|10    |
|17 |Raman |55000|16    |
|18 |Sam   |65000|17    |
|15 |Mohit |45000|18    |
|13 |Nidhi |60000|17    |
|14 |Priya |90000|18    |
|18 |Sam   |65000|17    |
+---+------+-----+------+



In [None]:
# Show distinct rows: all columns are considered when deciding whether two rows are duplicates
manager_df.distinct().show()

+---+------+-----+------+
| id|  Name|  sal|mgr_id|
+---+------+-----+------+
| 10|  Anil|50000|    18|
| 12| Nisha|40000|    18|
| 11| Vikas|75000|    16|
| 13| Nidhi|60000|    17|
| 15| Mohit|45000|    18|
| 14| Priya|80000|    18|
| 16|Rajesh|90000|    10|
| 17| Raman|55000|    16|
| 18|   Sam|65000|    17|
| 14| Priya|90000|    18|
+---+------+-----+------+



In [None]:
# Count the number of records before applying distinct (repeated rows are included)
manager_df.count()

Out[9]: 13

In [None]:
# Count the number of records after applying distinct (fully identical rows are counted once)
manager_df.distinct().count()

Out[44]: 10

In [None]:
# Apply distinct on specific columns only: select them first, then de-duplicate.
# The column is declared as 'Name' in the schema; referring to it as "name" relies on
# Spark resolving column names case-insensitively (its default setting).
manager_df.select("id","name").distinct().show()

+---+------+
| id|  name|
+---+------+
| 10|  Anil|
| 11| Vikas|
| 12| Nisha|
| 13| Nidhi|
| 15| Mohit|
| 14| Priya|
| 17| Raman|
| 16|Rajesh|
| 18|   Sam|
+---+------+



In [None]:
# Drop duplicate rows based on the listed columns and store the result in another DataFrame.
# All four columns are listed here, so a row is dropped only when it matches another row
# in every column (the same rows that distinct() keeps).
drop_duplicate_manager_df = manager_df.drop_duplicates(["id","name","sal","mgr_id"])

In [None]:
# Display the de-duplicated DataFrame
drop_duplicate_manager_df.show()

+---+------+-----+------+
| id|  Name|  sal|mgr_id|
+---+------+-----+------+
| 10|  Anil|50000|    18|
| 12| Nisha|40000|    18|
| 11| Vikas|75000|    16|
| 13| Nidhi|60000|    17|
| 15| Mohit|45000|    18|
| 14| Priya|80000|    18|
| 16|Rajesh|90000|    10|
| 17| Raman|55000|    16|
| 18|   Sam|65000|    17|
| 14| Priya|90000|    18|
+---+------+-----+------+



In [None]:
# Sort the records by the specified column (ascending order is the default).
# Import only the functions this notebook uses: a wildcard import from
# pyspark.sql.functions shadows Python builtins such as sum, min, max and round,
# and hides where col / expr / when come from.
from pyspark.sql.functions import col, expr, when

manager_df.sort(col("sal")).show()

+---+------+-----+------+
| id|  Name|  sal|mgr_id|
+---+------+-----+------+
| 12| Nisha|40000|    18|
| 15| Mohit|45000|    18|
| 15| Mohit|45000|    18|
| 10|  Anil|50000|    18|
| 17| Raman|55000|    16|
| 13| Nidhi|60000|    17|
| 13| Nidhi|60000|    17|
| 18|   Sam|65000|    17|
| 18|   Sam|65000|    17|
| 11| Vikas|75000|    16|
| 14| Priya|80000|    18|
| 14| Priya|90000|    18|
| 16|Rajesh|90000|    10|
+---+------+-----+------+



In [None]:
# Sort in descending order on two columns: salary first, then name to order
# rows that share the same salary.
(
    manager_df
    .sort(
        col("sal").desc(),
        col("name").desc(),
    )
    .show()
)

+---+------+-----+------+
| id|  Name|  sal|mgr_id|
+---+------+-----+------+
| 16|Rajesh|90000|    10|
| 14| Priya|90000|    18|
| 14| Priya|80000|    18|
| 11| Vikas|75000|    16|
| 18|   Sam|65000|    17|
| 18|   Sam|65000|    17|
| 13| Nidhi|60000|    17|
| 13| Nidhi|60000|    17|
| 17| Raman|55000|    16|
| 10|  Anil|50000|    18|
| 15| Mohit|45000|    18|
| 15| Mohit|45000|    18|
| 12| Nisha|40000|    18|
+---+------+-----+------+



In [None]:
# LeetCode problem: customer records as (id, name, referee_id) tuples.
# referee_id is None for customers 1, 2 and 4 (presumably: not referred by anyone).
leet_code_data = [
    (1, 'Will', None),
    (2, 'Jane', None),
    (3, 'Alex', 2),
    (4, 'Bill', None),
    (5, 'Zack', 1),
    (6, 'Mark', 2),
]

# Column names, in the same order as the tuple fields above.
schema_leetcode = ['id', 'name', 'referee_id']

customer_df = spark.createDataFrame(leet_code_data, schema=schema_leetcode)

In [None]:
# Display the customer DataFrame (None values are shown as null)
customer_df.show()

+---+----+----------+
| id|name|referee_id|
+---+----+----------+
|  1|Will|      null|
|  2|Jane|      null|
|  3|Alex|         2|
|  4|Bill|      null|
|  5|Zack|         1|
|  6|Mark|         2|
+---+----+----------+



In [None]:
# Find the names of customers who were not referred by customer id 2.
# Approach using expr(): the condition is written as a SQL expression string.
# The explicit "is null" check is required because `referee_id != 2` evaluates
# to NULL (not true) for rows whose referee_id is NULL, so they would be dropped.
(
    customer_df
    .where(expr("referee_id !=2 or referee_id is null"))
    .select("name")
    .show()
)

+----+
|name|
+----+
|Will|
|Jane|
|Bill|
|Zack|
+----+



In [None]:
# Same query using col() with filter(): keep a row when referee_id differs
# from 2 or when referee_id is NULL, then project the name column.
(
    customer_df
    .filter(
        (col("referee_id") != 2) | (col("referee_id").isNull())
    )
    .select("name")
    .show()
)

+----+
|name|
+----+
|Will|
|Jane|
|Bill|
|Zack|
+----+



In [None]:
# Variation: project the name column first, then filter.
# Note the filter still refers to referee_id, which is not in the select list;
# the recorded output below shows Spark accepts this ordering.
(
    customer_df
    .select("name")
    .filter(
        (col("referee_id") != 2) | (col("referee_id").isNull())
    )
    .show()
)

+----+
|name|
+----+
|Will|
|Jane|
|Bill|
|Zack|
+----+



In [None]:
# Another approach: add a helper column "referee" that is "Yes" when
# referee_id is NULL and "No" otherwise, then keep a row when its referee_id
# differs from 2 or when it is flagged "Yes".
(
    customer_df
    .withColumn(
        "referee",
        when(col("referee_id").isNull(), "Yes").otherwise("No"),
    )
    .filter((col("referee_id") != 2) | (col("referee") == 'Yes'))
    .select("name")
    .show()
)

+----+
|name|
+----+
|Will|
|Jane|
|Bill|
|Zack|
+----+



In [None]:
# Final approach: add a helper column "referee" that replaces a NULL
# referee_id with the placeholder 0, so a single `!= 2` comparison handles
# both the NULL rows and the non-NULL rows.
# The placeholder only needs to be a value different from 2.
(
    customer_df
    .withColumn(
        "referee",
        when(col("referee_id").isNull(), 0).otherwise(col("referee_id")),
    )
    .filter(col("referee") != 2)
    .select("name")
    .show()
)

+----+
|name|
+----+
|Will|
|Jane|
|Bill|
|Zack|
+----+

