In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder \
            .master("local[3]") \
            .appName("Misc Transformations") \
            .getOrCreate()

data_list = [("Ravi", "28", "1", "2002"),
             ("Abdul", "23", "5", "81"),
             ("John", "12", "12", "6"),
             ("Rosy", "7", "8", "63"),
             ("Abdul", "23", "5", "81")
            ]

data_list1 = [("Ravi", 28, 1, 2002),
             ("Abdul", 23, 5, 81),
             ("John", 12, 12, 6),
             ("Rosy", 7, 8, 63),
             ("Abdul", 23, 5, 81)
            ]

1. Quick method to create dataframe

In [2]:
raw_df = spark.createDataFrame(data_list)
raw_df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: string (nullable = true)
 |-- _4: string (nullable = true)



However we infered schema automatically. so might incorrect for some columns. as well as we dont have meaningful column names. and we do have duplicate entries as well.

so how to get schema for dataframe namedtuple right. so define it and attach schema.

however we have quick method to do it use toDF method.

In [3]:
raw_df = spark.createDataFrame(data_list).toDF("name", "day", "month", "year")
raw_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: string (nullable = true)



2. How to add column with monotonically increasing id
withColumn can allow to create column as well. monotonically_increasing_id generates unique integer for every record

In [4]:
df1 = raw_df.withColumn("id", monotonically_increasing_id())
df1.show()

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|  81| 8589934592|
| John| 12|   12|   6| 8589934593|
| Rosy|  7|    8|  63|17179869184|
|Abdul| 23|    5|  81|17179869185|
+-----+---+-----+----+-----------+



3. How to use use case when Then
these are popular constructs in programming language
will use to avoid lengthy if else statements
lets use it to fix year digit problem. we have two digit year make it four digit.

In [5]:
df2 = df1.withColumn("year", expr("""
         case when year < 21 then year + 2000
         when year < 100 then year + 1900
         else year
         end"""))
df2.show()

+-----+---+-----+------+-----------+
| name|day|month|  year|         id|
+-----+---+-----+------+-----------+
| Ravi| 28|    1|  2002|          0|
|Abdul| 23|    5|1981.0| 8589934592|
| John| 12|   12|2006.0| 8589934593|
| Rosy|  7|    8|1963.0|17179869184|
|Abdul| 23|    5|1981.0|17179869185|
+-----+---+-----+------+-----------+



now year in decimal. cause datatye is in string and we are performing arithmetic operation so promoted to decimal and then again after demoted to string.

How to fix it?

3. How to cast your fields
two methods
inline cast: doesnot allow spark to promote and demote field automatically;

In [6]:
df3 = df1.withColumn("year", expr("""
         case when year < 21 then cast(year as int) + 2000
         when year < 100 then cast(year as int) + 1900
         else year
         end"""))
df3.show()

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|1981| 8589934592|
| John| 12|   12|2006| 8589934593|
| Rosy|  7|    8|1963|17179869184|
|Abdul| 23|    5|1981|17179869185|
+-----+---+-----+----+-----------+



Change Schema Method

In [7]:
df4 = df1.withColumn("year", expr("""
         case when year < 21 then year + 2000
         when year < 100 then year + 1900
         else year
         end""").cast(IntegerType()))
df4.show()
df4.printSchema()

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|1981| 8589934592|
| John| 12|   12|2006| 8589934593|
| Rosy|  7|    8|1963|17179869184|
|Abdul| 23|    5|1981|17179869185|
+-----+---+-----+----+-----------+

root
 |-- name: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- id: long (nullable = false)



In [8]:
df1.show()
df1.printSchema()

df5 = df1.withColumn("day", col("day").cast(IntegerType())) \
         .withColumn("month", col("month").cast(IntegerType())) \
         .withColumn("year", col("year").cast(IntegerType())) 

df5.printSchema()

df6 = df5.withColumn("year", expr("""
         case when year < 21 then year + 2000
         when year < 100 then year + 1900
         else year
         end"""))
df6.show()

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|  81| 8589934592|
| John| 12|   12|   6| 8589934593|
| Rosy|  7|    8|  63|17179869184|
|Abdul| 23|    5|  81|17179869185|
+-----+---+-----+----+-----------+

root
 |-- name: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: string (nullable = true)
 |-- id: long (nullable = false)

root
 |-- name: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- id: long (nullable = false)

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|1981| 8589934592|
| John| 12|   12|2006| 8589934593|
| Rosy|  7|    8|1963|17179869184|
|Abdul| 23|    5|1981|17179869185|
+-----+---+-----+----+-----------+


3. Alternative to case expression
column object expression for case expression.

In [9]:
df7 = df5.withColumn("year", \
                    when(col("year") < 21, col("year") + 2000) \
                    .when(col("year") < 100, col("year") + 1900) \
                    .otherwise(col("year")))
df7.show()

+-----+---+-----+----+-----------+
| name|day|month|year|         id|
+-----+---+-----+----+-----------+
| Ravi| 28|    1|2002|          0|
|Abdul| 23|    5|1981| 8589934592|
| John| 12|   12|2006| 8589934593|
| Rosy|  7|    8|1963|17179869184|
|Abdul| 23|    5|1981|17179869185|
+-----+---+-----+----+-----------+



4. Add remove column and duplicates

In [10]:
#use inside sql expression
df8 = df7.withColumn("dob", expr("to_date(concat(day,'/',month,'/',year), 'd/M/y')"))
df8.show()

+-----+---+-----+----+-----------+----------+
| name|day|month|year|         id|       dob|
+-----+---+-----+----+-----------+----------+
| Ravi| 28|    1|2002|          0|2002-01-28|
|Abdul| 23|    5|1981| 8589934592|1981-05-23|
| John| 12|   12|2006| 8589934593|2006-12-12|
| Rosy|  7|    8|1963|17179869184|1963-08-07|
|Abdul| 23|    5|1981|17179869185|1981-05-23|
+-----+---+-----+----+-----------+----------+



In [11]:
#or use it on column
df9 = df7.withColumn("dob", to_date(expr("concat(day,'/',month,'/',year)"), 'd/M/y'))
df9.show()

+-----+---+-----+----+-----------+----------+
| name|day|month|year|         id|       dob|
+-----+---+-----+----+-----------+----------+
| Ravi| 28|    1|2002|          0|2002-01-28|
|Abdul| 23|    5|1981| 8589934592|1981-05-23|
| John| 12|   12|2006| 8589934593|2006-12-12|
| Rosy|  7|    8|1963|17179869184|1963-08-07|
|Abdul| 23|    5|1981|17179869185|1981-05-23|
+-----+---+-----+----+-----------+----------+



In [12]:
# now day month year field useless for me. can we drop them
df10 = df9.drop("day", "month", "year")
df10.show()

+-----+-----------+----------+
| name|         id|       dob|
+-----+-----------+----------+
| Ravi|          0|2002-01-28|
|Abdul| 8589934592|1981-05-23|
| John| 8589934593|2006-12-12|
| Rosy|17179869184|1963-08-07|
|Abdul|17179869185|1981-05-23|
+-----+-----------+----------+



In [13]:
df10 = df9.dropDuplicates(["name", "dob"])
df10.show()

+-----+---+-----+----+-----------+----------+
| name|day|month|year|         id|       dob|
+-----+---+-----+----+-----------+----------+
| Rosy|  7|    8|1963|17179869184|1963-08-07|
| Ravi| 28|    1|2002|          0|2002-01-28|
|Abdul| 23|    5|1981| 8589934592|1981-05-23|
| John| 12|   12|2006| 8589934593|2006-12-12|
+-----+---+-----+----+-----------+----------+



In [14]:
# sort using dob in desc (as by default it is in asceneding )
df11=df10.sort(expr("dob desc"))
df11.show()

+-----+---+-----+----+-----------+----------+
| name|day|month|year|         id|       dob|
+-----+---+-----+----+-----------+----------+
| Rosy|  7|    8|1963|17179869184|1963-08-07|
|Abdul| 23|    5|1981| 8589934592|1981-05-23|
| Ravi| 28|    1|2002|          0|2002-01-28|
| John| 12|   12|2006| 8589934593|2006-12-12|
+-----+---+-----+----+-----------+----------+



In [15]:
spark.stop()