In [0]:
from pyspark.sql import SparkSession

spark=SparkSession.builder\
    .appName("PySpark Data Handling")\
    .getOrCreate()

In [0]:
# Sample data

data1=[
    (1,"John Doe","Bangalore","2023-01-15","152.75","True"),
    (2,"Jane Smith","Delhi","2023-05-20","89.50","False"),
    (3,"Robert Brown","Mumbai","InvalidDate","200.00","True"),
    (4,"Linda White","Kolkata","2023-02-29",None,"yes"),
    (5,"Mike Green","Chennai","2023-08-10","NaN","1"),
    (6,"Sarah Blue","Hyderabad","InvalidDate","300.40","No"),
]

columns=["id","name","city","date","amount","is_active"]

df=spark.createDataFrame(data1,schema=columns)

df.show()

+---+------------+---------+-----------+------+---------+
| id|        name|     city|       date|amount|is_active|
+---+------------+---------+-----------+------+---------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.40|       No|
+---+------------+---------+-----------+------+---------+



In [0]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- is_active: string (nullable = true)



In [0]:
# handle Integer Column

df.filter(df.id>3).show()

+---+-----------+---------+-----------+------+---------+
| id|       name|     city|       date|amount|is_active|
+---+-----------+---------+-----------+------+---------+
|  4|Linda White|  Kolkata| 2023-02-29|  null|      yes|
|  5| Mike Green|  Chennai| 2023-08-10|   NaN|        1|
|  6| Sarah Blue|Hyderabad|InvalidDate|300.40|       No|
+---+-----------+---------+-----------+------+---------+



In [0]:
df.withColumn('id_double',df.id+2).show()

+---+------------+---------+-----------+------+---------+---------+
| id|        name|     city|       date|amount|is_active|id_double|
+---+------------+---------+-----------+------+---------+---------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|        3|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|        4|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|        5|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes|        6|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|        7|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.40|       No|        8|
+---+------------+---------+-----------+------+---------+---------+



In [0]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col

df=df.withColumn('id',col('id').cast(IntegerType()))

In [0]:
df.show()

+---+------------+---------+-----------+------+---------+
| id|        name|     city|       date|amount|is_active|
+---+------------+---------+-----------+------+---------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.40|       No|
+---+------------+---------+-----------+------+---------+



In [0]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- is_active: string (nullable = true)



In [0]:
# String Columns

from pyspark.sql.functions import *

df=df.withColumn('name_upper',upper(df.name))
df.show()

+---+------------+---------+-----------+------+---------+------------+
| id|        name|     city|       date|amount|is_active|  name_upper|
+---+------------+---------+-----------+------+---------+------------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|    JOHN DOE|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|  JANE SMITH|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|ROBERT BROWN|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes| LINDA WHITE|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|  MIKE GREEN|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.40|       No|  SARAH BLUE|
+---+------------+---------+-----------+------+---------+------------+



In [0]:
df.filter(df.city.startswith('B')).show()

+---+--------+---------+----------+------+---------+----------+
| id|    name|     city|      date|amount|is_active|name_upper|
+---+--------+---------+----------+------+---------+----------+
|  1|John Doe|Bangalore|2023-01-15|152.75|     True|  JOHN DOE|
+---+--------+---------+----------+------+---------+----------+



In [0]:


from pyspark.sql.functions import *

df=df.withColumn('name_lower',lower(df.name))
df.show()

+---+------------+---------+-----------+------+---------+------------+------------+
| id|        name|     city|       date|amount|is_active|  name_upper|  name_lower|
+---+------------+---------+-----------+------+---------+------------+------------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|    JOHN DOE|    john doe|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|  JANE SMITH|  jane smith|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|ROBERT BROWN|robert brown|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes| LINDA WHITE| linda white|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|  MIKE GREEN|  mike green|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.40|       No|  SARAH BLUE|  sarah blue|
+---+------------+---------+-----------+------+---------+------------+------------+



In [0]:
df=df.withColumn('amount',col('amount').cast('float'))
df.show()

+---+------------+---------+-----------+------+---------+------------+------------+
| id|        name|     city|       date|amount|is_active|  name_upper|  name_lower|
+---+------------+---------+-----------+------+---------+------------+------------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|    JOHN DOE|    john doe|
|  2|  Jane Smith|    Delhi| 2023-05-20|  89.5|    False|  JANE SMITH|  jane smith|
|  3|Robert Brown|   Mumbai|InvalidDate| 200.0|     True|ROBERT BROWN|robert brown|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes| LINDA WHITE| linda white|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|  MIKE GREEN|  mike green|
|  6|  Sarah Blue|Hyderabad|InvalidDate| 300.4|       No|  SARAH BLUE|  sarah blue|
+---+------------+---------+-----------+------+---------+------------+------------+



In [0]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: float (nullable = true)
 |-- is_active: string (nullable = true)
 |-- name_upper: string (nullable = true)
 |-- name_lower: string (nullable = true)



In [0]:
df_filled=df.fillna({'amount':0})
df_filled.show()

+---+------------+---------+-----------+------+---------+------------+------------+
| id|        name|     city|       date|amount|is_active|  name_upper|  name_lower|
+---+------------+---------+-----------+------+---------+------------+------------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|    JOHN DOE|    john doe|
|  2|  Jane Smith|    Delhi| 2023-05-20|  89.5|    False|  JANE SMITH|  jane smith|
|  3|Robert Brown|   Mumbai|InvalidDate| 200.0|     True|ROBERT BROWN|robert brown|
|  4| Linda White|  Kolkata| 2023-02-29|   0.0|      yes| LINDA WHITE| linda white|
|  5|  Mike Green|  Chennai| 2023-08-10|   0.0|        1|  MIKE GREEN|  mike green|
|  6|  Sarah Blue|Hyderabad|InvalidDate| 300.4|       No|  SARAH BLUE|  sarah blue|
+---+------------+---------+-----------+------+---------+------------+------------+



In [0]:
# Handle Date Column

csv_data=""" id,date_iso,data_dmy,

"""
