In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder \
.appName('DataTypes in Pyspark').getOrCreate()

25/06/10 16:05:04 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
data = [
    (1,"John Doe","Bangalore","2023-01-15","152.725",True),
    (2,"Jane Smith","Delhi","2024-01-15","172.725",True),
    (3,"John Depp","Mumbai","2023-06-15","122.725",True),
    (4,"Dharambeer Das","Kolkata","2023-07-15","132.725",True),
    (5,"Nishtha Pal","Ranchi","2023-01-17","158.725",True),
    (6,"Bharathi Vidya","Chennai","2023-04-25","142.725",True),
    (7,"Yogesh Raj","kocchi","2023-05-25","157.725",True),
    (8,"Soujanya Bhatt","Pune","2023-09-25","158.725",True),
    (9,"Sourabh Joshi","Hyderbad","2024-11-15","159.725",True),
]

columns = ["id","name","city","date","amount","is_active"]

df = spark.createDataFrame(data,schema=columns)

df.show()

                                                                                

+---+--------------+---------+----------+-------+---------+
| id|          name|     city|      date| amount|is_active|
+---+--------------+---------+----------+-------+---------+
|  1|      John Doe|Bangalore|2023-01-15|152.725|     true|
|  2|    Jane Smith|    Delhi|2024-01-15|172.725|     true|
|  3|     John Depp|   Mumbai|2023-06-15|122.725|     true|
|  4|Dharambeer Das|  Kolkata|2023-07-15|132.725|     true|
|  5|   Nishtha Pal|   Ranchi|2023-01-17|158.725|     true|
|  6|Bharathi Vidya|  Chennai|2023-04-25|142.725|     true|
|  7|    Yogesh Raj|   kocchi|2023-05-25|157.725|     true|
|  8|Soujanya Bhatt|     Pune|2023-09-25|158.725|     true|
|  9| Sourabh Joshi| Hyderbad|2024-11-15|159.725|     true|
+---+--------------+---------+----------+-------+---------+



In [4]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- is_active: boolean (nullable = true)



In [5]:
# How to handling integer column
from pyspark.sql.types import *
from pyspark.sql.functions import col

df = df.withColumn('id' , col('id').cast(IntegerType()))

In [6]:
df.show()

+---+--------------+---------+----------+-------+---------+
| id|          name|     city|      date| amount|is_active|
+---+--------------+---------+----------+-------+---------+
|  1|      John Doe|Bangalore|2023-01-15|152.725|     true|
|  2|    Jane Smith|    Delhi|2024-01-15|172.725|     true|
|  3|     John Depp|   Mumbai|2023-06-15|122.725|     true|
|  4|Dharambeer Das|  Kolkata|2023-07-15|132.725|     true|
|  5|   Nishtha Pal|   Ranchi|2023-01-17|158.725|     true|
|  6|Bharathi Vidya|  Chennai|2023-04-25|142.725|     true|
|  7|    Yogesh Raj|   kocchi|2023-05-25|157.725|     true|
|  8|Soujanya Bhatt|     Pune|2023-09-25|158.725|     true|
|  9| Sourabh Joshi| Hyderbad|2024-11-15|159.725|     true|
+---+--------------+---------+----------+-------+---------+



In [7]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- is_active: boolean (nullable = true)



In [8]:
# String COlumns
from pyspark.sql.functions import *
df = df.withColumn('name_upper',upper(df.name))
df.show()

+---+--------------+---------+----------+-------+---------+--------------+
| id|          name|     city|      date| amount|is_active|    name_upper|
+---+--------------+---------+----------+-------+---------+--------------+
|  1|      John Doe|Bangalore|2023-01-15|152.725|     true|      JOHN DOE|
|  2|    Jane Smith|    Delhi|2024-01-15|172.725|     true|    JANE SMITH|
|  3|     John Depp|   Mumbai|2023-06-15|122.725|     true|     JOHN DEPP|
|  4|Dharambeer Das|  Kolkata|2023-07-15|132.725|     true|DHARAMBEER DAS|
|  5|   Nishtha Pal|   Ranchi|2023-01-17|158.725|     true|   NISHTHA PAL|
|  6|Bharathi Vidya|  Chennai|2023-04-25|142.725|     true|BHARATHI VIDYA|
|  7|    Yogesh Raj|   kocchi|2023-05-25|157.725|     true|    YOGESH RAJ|
|  8|Soujanya Bhatt|     Pune|2023-09-25|158.725|     true|SOUJANYA BHATT|
|  9| Sourabh Joshi| Hyderbad|2024-11-15|159.725|     true| SOURABH JOSHI|
+---+--------------+---------+----------+-------+---------+--------------+



In [9]:
df.filter(df.city.startswith('B')).show()

+---+--------+---------+----------+-------+---------+----------+
| id|    name|     city|      date| amount|is_active|name_upper|
+---+--------+---------+----------+-------+---------+----------+
|  1|John Doe|Bangalore|2023-01-15|152.725|     true|  JOHN DOE|
+---+--------+---------+----------+-------+---------+----------+



In [10]:
# handle float column
df = df.withColumn('amount',col('amount').cast('float'))
df.printSchema()
df.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: float (nullable = true)
 |-- is_active: boolean (nullable = true)
 |-- name_upper: string (nullable = true)

+---+--------------+---------+----------+-------+---------+--------------+
| id|          name|     city|      date| amount|is_active|    name_upper|
+---+--------------+---------+----------+-------+---------+--------------+
|  1|      John Doe|Bangalore|2023-01-15|152.725|     true|      JOHN DOE|
|  2|    Jane Smith|    Delhi|2024-01-15|172.725|     true|    JANE SMITH|
|  3|     John Depp|   Mumbai|2023-06-15|122.725|     true|     JOHN DEPP|
|  4|Dharambeer Das|  Kolkata|2023-07-15|132.725|     true|DHARAMBEER DAS|
|  5|   Nishtha Pal|   Ranchi|2023-01-17|158.725|     true|   NISHTHA PAL|
|  6|Bharathi Vidya|  Chennai|2023-04-25|142.725|     true|BHARATHI VIDYA|
|  7|    Yogesh Raj|   kocchi|2023-05-25|157.725|     t

### Handling Date datatypes

In [19]:
data = [
    (1, "2025-06-10", "10/06/2025", "06/10/2025", "2025-06-10 15:42:17"),
    (2, "2025-07-15", "15/07/2025", "07/15/2025", "2025-07-15 08:30:00"),
    (3, "2025-08-20", "20/08/2025", "08/20/2025", "2025-08-20 22:11:45"),
    (4, "2025-13-40", "40/13/2025", "13/40/2025", "2025-13-40 13:05:30"),  # Invalid date
    (5, "2025-02-30", "30/02/2025", "02/30/2025", "2025-02-30 19:55:10")   # Invalid date
]

columns = ["id","date_iso","date_dmy","date_mdy","timestamp"]

df = spark.createDataFrame(data,schema=columns)

df.show()

+---+----------+----------+----------+-------------------+
| id|  date_iso|  date_dmy|  date_mdy|          timestamp|
+---+----------+----------+----------+-------------------+
|  1|2025-06-10|10/06/2025|06/10/2025|2025-06-10 15:42:17|
|  2|2025-07-15|15/07/2025|07/15/2025|2025-07-15 08:30:00|
|  3|2025-08-20|20/08/2025|08/20/2025|2025-08-20 22:11:45|
|  4|2025-13-40|40/13/2025|13/40/2025|2025-13-40 13:05:30|
|  5|2025-02-30|30/02/2025|02/30/2025|2025-02-30 19:55:10|
+---+----------+----------+----------+-------------------+



In [20]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- date_iso: string (nullable = true)
 |-- date_dmy: string (nullable = true)
 |-- date_mdy: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [21]:
df = df\
    .withColumn('parsed_date_iso', to_date(df.date_iso,'yyyy-mm-dd')) \
    .withColumn('parsed_date_dmy', to_date(df.date_dmy,'dd/mm/yyyy')) \
    .withColumn('parsed_date_mdy', to_date(df.date_mdy,'mm/dd/yyyy'))

In [22]:
df.show()

+---+----------+----------+----------+-------------------+---------------+---------------+---------------+
| id|  date_iso|  date_dmy|  date_mdy|          timestamp|parsed_date_iso|parsed_date_dmy|parsed_date_mdy|
+---+----------+----------+----------+-------------------+---------------+---------------+---------------+
|  1|2025-06-10|10/06/2025|06/10/2025|2025-06-10 15:42:17|     2025-01-10|     2025-01-10|     2025-01-10|
|  2|2025-07-15|15/07/2025|07/15/2025|2025-07-15 08:30:00|     2025-01-15|     2025-01-15|     2025-01-15|
|  3|2025-08-20|20/08/2025|08/20/2025|2025-08-20 22:11:45|     2025-01-20|     2025-01-20|     2025-01-20|
|  4|2025-13-40|40/13/2025|13/40/2025|2025-13-40 13:05:30|           NULL|           NULL|           NULL|
|  5|2025-02-30|30/02/2025|02/30/2025|2025-02-30 19:55:10|     2025-01-30|     2025-01-30|     2025-01-30|
+---+----------+----------+----------+-------------------+---------------+---------------+---------------+



In [24]:
# Handling timestamp
from pyspark.sql.functions import to_timestamp,year,dayofmonth , hour , minute

In [25]:
df = df.withColumn('parsed_timestamp',to_timestamp(df.timestamp))
df.show()
df.printSchema()

+---+----------+----------+----------+-------------------+---------------+---------------+---------------+-------------------+
| id|  date_iso|  date_dmy|  date_mdy|          timestamp|parsed_date_iso|parsed_date_dmy|parsed_date_mdy|   parsed_timestamp|
+---+----------+----------+----------+-------------------+---------------+---------------+---------------+-------------------+
|  1|2025-06-10|10/06/2025|06/10/2025|2025-06-10 15:42:17|     2025-01-10|     2025-01-10|     2025-01-10|2025-06-10 15:42:17|
|  2|2025-07-15|15/07/2025|07/15/2025|2025-07-15 08:30:00|     2025-01-15|     2025-01-15|     2025-01-15|2025-07-15 08:30:00|
|  3|2025-08-20|20/08/2025|08/20/2025|2025-08-20 22:11:45|     2025-01-20|     2025-01-20|     2025-01-20|2025-08-20 22:11:45|
|  4|2025-13-40|40/13/2025|13/40/2025|2025-13-40 13:05:30|           NULL|           NULL|           NULL|               NULL|
|  5|2025-02-30|30/02/2025|02/30/2025|2025-02-30 19:55:10|     2025-01-30|     2025-01-30|     2025-01-30|     

In [27]:
df = df.withColumn('year',year(df.parsed_timestamp))\
.withColumn('month',dayofmonth(df.parsed_timestamp))\
.withColumn('day',day(df.parsed_timestamp))\
.withColumn('hour',hour(df.parsed_timestamp))\
.withColumn('minute',minute(df.parsed_timestamp))

In [28]:
df.show()

+---+----------+----------+----------+-------------------+---------------+---------------+---------------+-------------------+----+-----+----+----+------+
| id|  date_iso|  date_dmy|  date_mdy|          timestamp|parsed_date_iso|parsed_date_dmy|parsed_date_mdy|   parsed_timestamp|year|month| day|hour|minute|
+---+----------+----------+----------+-------------------+---------------+---------------+---------------+-------------------+----+-----+----+----+------+
|  1|2025-06-10|10/06/2025|06/10/2025|2025-06-10 15:42:17|     2025-01-10|     2025-01-10|     2025-01-10|2025-06-10 15:42:17|2025|   10|  10|  15|    42|
|  2|2025-07-15|15/07/2025|07/15/2025|2025-07-15 08:30:00|     2025-01-15|     2025-01-15|     2025-01-15|2025-07-15 08:30:00|2025|   15|  15|   8|    30|
|  3|2025-08-20|20/08/2025|08/20/2025|2025-08-20 22:11:45|     2025-01-20|     2025-01-20|     2025-01-20|2025-08-20 22:11:45|2025|   20|  20|  22|    11|
|  4|2025-13-40|40/13/2025|13/40/2025|2025-13-40 13:05:30|           N

In [30]:
from pyspark.sql.functions import datediff
df = df.withColumn('days_difference', datediff(df.parsed_date_dmy,df.parsed_date_iso))

df.select('parsed_date_dmy','parsed_date_iso','days_difference').show()

+---------------+---------------+---------------+
|parsed_date_dmy|parsed_date_iso|days_difference|
+---------------+---------------+---------------+
|     2025-01-10|     2025-01-10|              0|
|     2025-01-15|     2025-01-15|              0|
|     2025-01-20|     2025-01-20|              0|
|           NULL|           NULL|           NULL|
|     2025-01-30|     2025-01-30|              0|
+---------------+---------------+---------------+



In [31]:
spark.stop()