In [0]:
from pyspark.sql import Window
from pyspark.sql.functions import col,count
from pyspark.sql.types import StructType,StructField,IntegerType,StringType

In [0]:
# Explicit schema for the employee rows: six columns, all declared nullable.
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("emp_id", IntegerType()),
            ("emp_name", StringType()),
            ("emp_gender", StringType()),
            ("emp_age", IntegerType()),
            ("emp_salary", IntegerType()),
            ("emp_manager", StringType()),
        )
    ]
)

# Rows in schema order: (emp_id, emp_name, emp_gender, emp_age, emp_salary, emp_manager).
# Three records occur more than once in the list (emp_id 1 three times,
# emp_id 3 and 4 twice each), so they are named once and reused below.
row_arjun = (1, "Arjun Patel", "Male", 30, 60000, "Aarav Sharma")
row_zara = (3, "Zara Singh", "Female", 35, 70000, "Arjun Patel")
row_priya = (4, "Priya Reddy", "Female", 32, 65000, "Aarav Sharma")

data = [
    row_arjun,
    (2, "Aarav Sharma", "Male", 28, 55000, "Zara Singh"),
    row_zara,
    row_priya,
    row_arjun,
    (6, "Naina Verma", "Female", 31, 72000, "Arjun Patel"),
    row_arjun,
    row_priya,
    (5, "Aditya Kapoor", "Male", 28, 58000, "Zara Singh"),
    (10, "Anaya Joshi", "Female", 27, 59000, "Aarav Sharma"),
    (11, "Rohan Malhotra", "Male", 36, 73000, "Zara Singh"),
    row_zara,
]

In [0]:
# Build the employee DataFrame from the in-memory rows with the explicit schema.
# NOTE(review): `spark` is not created in this notebook; presumably the runtime
# injects the SparkSession -- confirm before running elsewhere.
emp_df = spark.createDataFrame(data=data, schema=schema)

# Preview the first five rows.
emp_df.show(n=5)

+------+------------+----------+-------+----------+------------+
|emp_id|    emp_name|emp_gender|emp_age|emp_salary| emp_manager|
+------+------------+----------+-------+----------+------------+
|     1| Arjun Patel|      Male|     30|     60000|Aarav Sharma|
|     2|Aarav Sharma|      Male|     28|     55000|  Zara Singh|
|     3|  Zara Singh|    Female|     35|     70000| Arjun Patel|
|     4| Priya Reddy|    Female|     32|     65000|Aarav Sharma|
|     1| Arjun Patel|      Male|     30|     60000|Aarav Sharma|
+------+------------+----------+-------+----------+------------+
only showing top 5 rows



In [0]:
# Show the column names; the duplicate checks below group/partition on this full list.
emp_df.columns

Out[18]: ['emp_id', 'emp_name', 'emp_gender', 'emp_age', 'emp_salary', 'emp_manager']

In [0]:
# Approach 1 (groupBy): group on every column and count the rows per group.
# A group holding more than one row is a record that appears multiple times.
emp_df_group = emp_df.groupBy(*emp_df.columns).count()

# Keep only the repeated records, then remove the helper "count" column.
is_repeated = col("count") > 1
emp_df_answer = emp_df_group.filter(is_repeated).drop("count")

emp_df_answer.show()

+------+-----------+----------+-------+----------+------------+
|emp_id|   emp_name|emp_gender|emp_age|emp_salary| emp_manager|
+------+-----------+----------+-------+----------+------------+
|     1|Arjun Patel|      Male|     30|     60000|Aarav Sharma|
|     3| Zara Singh|    Female|     35|     70000| Arjun Patel|
|     4|Priya Reddy|    Female|     32|     65000|Aarav Sharma|
+------+-----------+----------+-------+----------+------------+



In [0]:
# Approach 2 (window function): partition on every column so that rows which
# are identical in all fields fall into the same window partition.
# `Window` is imported with the other imports in the first cell.
partition_ = Window.partitionBy(emp_df.columns)

In [0]:
# Attach to every row the size of its window partition, i.e. how many rows are
# identical to it in all columns.
#
# count("*") counts rows. count("emp_id") counts only non-NULL emp_id values,
# and the schema declares emp_id nullable: fully duplicated rows with a NULL
# id would get count_ = 0 and be missed by the later `count_ > 1` filter.
emp_df_count = emp_df.withColumn("count_", count("*").over(partition_))
emp_df_count.show()

+------+--------------+----------+-------+----------+------------+------+
|emp_id|      emp_name|emp_gender|emp_age|emp_salary| emp_manager|count_|
+------+--------------+----------+-------+----------+------------+------+
|     1|   Arjun Patel|      Male|     30|     60000|Aarav Sharma|     3|
|     1|   Arjun Patel|      Male|     30|     60000|Aarav Sharma|     3|
|     1|   Arjun Patel|      Male|     30|     60000|Aarav Sharma|     3|
|     2|  Aarav Sharma|      Male|     28|     55000|  Zara Singh|     1|
|     3|    Zara Singh|    Female|     35|     70000| Arjun Patel|     2|
|     3|    Zara Singh|    Female|     35|     70000| Arjun Patel|     2|
|     4|   Priya Reddy|    Female|     32|     65000|Aarav Sharma|     2|
|     4|   Priya Reddy|    Female|     32|     65000|Aarav Sharma|     2|
|     5| Aditya Kapoor|      Male|     28|     58000|  Zara Singh|     1|
|     6|   Naina Verma|    Female|     31|     72000| Arjun Patel|     1|
|    10|   Anaya Joshi|    Female|    

In [0]:
# Keep only the rows whose partition contains more than one row.
df_answer = emp_df_count.where(col("count_") > 1)

# Each repeated record is still present once per occurrence: collapse the
# identical rows to a single row, then remove the helper "count_" column.
df_final_answer = df_answer.distinct().drop("count_")

df_final_answer.show()

+------+-----------+----------+-------+----------+------------+
|emp_id|   emp_name|emp_gender|emp_age|emp_salary| emp_manager|
+------+-----------+----------+-------+----------+------------+
|     1|Arjun Patel|      Male|     30|     60000|Aarav Sharma|
|     3| Zara Singh|    Female|     35|     70000| Arjun Patel|
|     4|Priya Reddy|    Female|     32|     65000|Aarav Sharma|
+------+-----------+----------+-------+----------+------------+

