In [None]:
from pyspark.sql.functions import col, lit

# Where the input lives and which reader format to use
file_location = "/FileStore/tables/Data.csv"
file_type = "csv"

# Reader settings for CSV input: infer column types, treat the first row as
# the header, and split fields on commas.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","


# These options apply to CSV files; for other file types they will be ignored.
df = (
    spark.read.format(file_type)
    .options(
        inferSchema=infer_schema,
        header=first_row_is_header,
        sep=delimiter,
    )
    .load(file_location)
)

# Show the inferred schema first, then the loaded rows.
df.printSchema()
df.show()

root
 |-- Id: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)

+---+-----+---+
| Id| Name|Age|
+---+-----+---+
|  1|Nitin| 25|
|  2|Surya| 26|
|  3| Ravi| 23|
+---+-----+---+



In [None]:
# Append a copy of an existing row so that df contains a duplicate.
#
# The extra row is built with df's own schema. Without an explicit schema,
# createDataFrame names the columns _1/_2/_3 and infers Python ints as long,
# and because union() matches columns by position, the integer columns of df
# would be silently widened to match.
#
# NOTE: this cell reassigns df, so re-running it appends one more duplicate.

df1 = spark.createDataFrame([[1, 'Nitin', 25],], schema=df.schema)
df = df.union(df1)

df.show()

+---+-----+---+
| Id| Name|Age|
+---+-----+---+
|  1|Nitin| 25|
|  2|Surya| 26|
|  3| Ravi| 23|
|  1|Nitin| 25|
+---+-----+---+



#### 1. `distinct()`

In [None]:
# distinct() is called without arguments (it does not accept a column list),
# so duplicates are removed by comparing whole rows.
# The result is stored in df1; df itself is not reassigned here.
df1 = df.distinct()

df1.show()

+---+-----+---+
| Id| Name|Age|
+---+-----+---+
|  1|Nitin| 25|
|  2|Surya| 26|
|  3| Ravi| 23|
+---+-----+---+



#### 2. `dropDuplicates()`

In [None]:
# Append one more copy of the (1, 'Nitin', 25) row to df.
#
# The row reuses df's schema so that its column names and types match df
# exactly; without it the columns would be named _1/_2/_3 with ints inferred
# as long, and the positional union() would silently widen df's integer
# columns.
#
# NOTE: this cell reassigns df, so re-running it appends yet another copy.
df1 = spark.createDataFrame([[1, 'Nitin', 25],], schema=df.schema)
df = df.union(df1)

df.show()

+---+-----+---+
| Id| Name|Age|
+---+-----+---+
|  1|Nitin| 25|
|  2|Surya| 26|
|  3| Ravi| 23|
|  1|Nitin| 25|
|  1|Nitin| 25|
+---+-----+---+



In [None]:
# Removing duplicates: dropDuplicates() is called with no column list here.
# Only the de-duplicated result is displayed; df is not reassigned.

df.dropDuplicates().show()

+---+-----+---+
| Id| Name|Age|
+---+-----+---+
|  1|Nitin| 25|
|  2|Surya| 26|
|  3| Ravi| 23|
+---+-----+---+



In [None]:
# dropDuplicates() can be given the list of columns to compare on.
# To demonstrate that, append two rows that share Id and Name but differ in
# Age (25 and 30).
#
# The rows reuse df's schema so that column names and types match df exactly;
# without it the columns would be named _1/_2/_3 with ints inferred as long,
# and the positional union() would silently widen df's integer columns.
#
# NOTE: this cell reassigns df, so re-running it appends both rows again.

df1 = spark.createDataFrame([[1, 'Nitin', 25],[1, 'Nitin', 30]], schema=df.schema)
df = df.union(df1)

df.show()

+---+-----+---+
| Id| Name|Age|
+---+-----+---+
|  1|Nitin| 25|
|  2|Surya| 26|
|  3| Ravi| 23|
|  1|Nitin| 25|
|  1|Nitin| 25|
|  1|Nitin| 25|
|  1|Nitin| 30|
+---+-----+---+



In [None]:
# De-duplicate on Id and Name only; Age is not part of the comparison, so the
# (1, 'Nitin', 25) and (1, 'Nitin', 30) rows collapse into a single row.
# NOTE(review): which Age survives for such a group is presumably not
# guaranteed by Spark -- do not rely on the value shown in the output.
df.dropDuplicates(['Id', 'Name']).show()

+---+-----+---+
| Id| Name|Age|
+---+-----+---+
|  2|Surya| 26|
|  3| Ravi| 23|
|  1|Nitin| 25|
+---+-----+---+



In [None]:
# De-duplicate on Id, Name and Age. These are all three columns of df, so
# only fully identical rows are merged and the Age 25 and Age 30 rows for
# Nitin are both kept.
df.dropDuplicates(['Id', 'Name', 'Age']).show()

+---+-----+---+
| Id| Name|Age|
+---+-----+---+
|  1|Nitin| 25|
|  2|Surya| 26|
|  3| Ravi| 23|
|  1|Nitin| 30|
+---+-----+---+

