In [2]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that the later cells call as `spark`.
spark = (
    SparkSession.builder
    .appName("Read CSV Example")
    .getOrCreate()
)

In [1]:
# Sample user rows laid out as (user_id, name, age, city, salary).
# The values are intentionally left as-is, including the irregular ones
# noted per row, so the later cells have something to validate.
raw_users = [
    ("U001", "Amit", "29", "Hyderabad", "50000"),
    ("U002", "Neha", "Thirty Two", "Delhi", "62000"),  # age written in words
    ("U003", "Ravi", None, "Bangalore", "45k"),        # no age; salary has a "k" suffix
    ("U004", "Pooja", "28", "Mumbai", 58000),          # salary is an int, not a str
    ("U005", None, "31", "Chennai", ""),               # no name; empty salary
]

In [7]:
from pyspark.sql.types import(
    StructType,
    StructField,
    StringType,IntegerType,LongType
)
# Every raw column is read as a string; only user_id is declared non-nullable.
raw_schema = StructType([
    StructField(column_name, StringType(), nullable=allows_null)
    for column_name, allows_null in (
        ("user_id", False),
        ("name", True),
        ("age", True),
        ("city", True),
        ("salary", True),
    )
])

In [11]:
# Load the sample rows under the all-string schema and preview the frame.
df_raw = spark.createDataFrame(data=raw_users, schema=raw_schema)
df_raw.show()

+-------+-----+----------+---------+------+
|user_id| name|       age|     city|salary|
+-------+-----+----------+---------+------+
|   U001| Amit|        29|Hyderabad| 50000|
|   U002| Neha|Thirty Two|    Delhi| 62000|
|   U003| Ravi|      NULL|Bangalore|   45k|
|   U004|Pooja|        28|   Mumbai| 58000|
|   U005| NULL|        31|  Chennai|      |
+-------+-----+----------+---------+------+



In [14]:

from pyspark.sql.functions import expr, col
# Add integer versions of age and salary next to the original string columns.
# try_cast yields NULL instead of raising when a value cannot be converted.
df_check = df_raw.selectExpr(
    "*",
    "try_cast(age as int) as age_int",
    "try_cast(salary as int) as salary_int",
)

# Keep every row where at least one of the two casted columns is NULL.
# This includes rows whose source value was already NULL.
age_is_null = col("age_int").isNull()
salary_is_null = col("salary_int").isNull()
df_failed = df_check.where(age_is_null | salary_is_null)
df_failed.show()


+-------+----+----------+---------+------+-------+----------+
|user_id|name|       age|     city|salary|age_int|salary_int|
+-------+----+----------+---------+------+-------+----------+
|   U002|Neha|Thirty Two|    Delhi| 62000|   NULL|     62000|
|   U003|Ravi|      NULL|Bangalore|   45k|   NULL|      NULL|
|   U005|NULL|        31|  Chennai|      |     31|      NULL|
+-------+----+----------+---------+------+-------+----------+



In [16]:
from pyspark.sql.functions import coalesce,lit
# Preview only: show df_raw with NULL names replaced by "UNKNOWN".
# The result is displayed, not assigned, so df_raw itself is unchanged.
name_or_unknown = coalesce(col("name"), lit("UNKNOWN"))
df_raw.withColumn("name", name_or_unknown).show()

+-------+-------+----------+---------+------+
|user_id|   name|       age|     city|salary|
+-------+-------+----------+---------+------+
|   U001|   Amit|        29|Hyderabad| 50000|
|   U002|   Neha|Thirty Two|    Delhi| 62000|
|   U003|   Ravi|      NULL|Bangalore|   45k|
|   U004|  Pooja|        28|   Mumbai| 58000|
|   U005|UNKNOWN|        31|  Chennai|      |
+-------+-------+----------+---------+------+



In [20]:
# Drop rows whose age is NULL. Non-numeric ages (e.g. "Thirty Two") are still
# present afterwards, because only NULL is filtered here.
has_age = col("age").isNotNull()
df_clean = df_raw.where(has_age)
df_clean.show()

+-------+-----+----------+---------+------+
|user_id| name|       age|     city|salary|
+-------+-----+----------+---------+------+
|   U001| Amit|        29|Hyderabad| 50000|
|   U002| Neha|Thirty Two|    Delhi| 62000|
|   U004|Pooja|        28|   Mumbai| 58000|
|   U005| NULL|        31|  Chennai|      |
+-------+-----+----------+---------+------+



In [25]:
# Project the five source columns in their original order, then show both the
# rows and the schema of the result.
final_columns = ["user_id", "name", "age", "city", "salary"]
final_df = df_clean.select(*final_columns)
final_df.show()
final_df.printSchema()


+-------+-----+----------+---------+------+
|user_id| name|       age|     city|salary|
+-------+-----+----------+---------+------+
|   U001| Amit|        29|Hyderabad| 50000|
|   U002| Neha|Thirty Two|    Delhi| 62000|
|   U004|Pooja|        28|   Mumbai| 58000|
|   U005| NULL|        31|  Chennai|      |
+-------+-----+----------+---------+------+

root
 |-- user_id: string (nullable = false)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: string (nullable = true)



In [26]:
# Sample order rows. Each tuple has four fields; the third one holds the
# ordered items and is not uniform across rows (see per-row notes).
raw_orders = [
    ("O001", "U001", "Laptop,Mobile,Tablet", 75000),  # comma-separated string
    ("O002", "U002", ["Mobile", "Tablet"], 32000),    # Python list
    ("O003", "U003", "Laptop", 72000),                # single item
    ("O004", "U004", None, 25000),                    # no items
    ("O005", "U005", "Laptop|Mobile", 68000),         # pipe-separated string
]

In [27]:
from pyspark.sql.types import(
    StructType,
    StructField,
    StringType,IntegerType,ArrayType
)

In [None]:
# Schema for the raw_orders rows. The original cell stopped at `schema=StructTy`,
# which raises NameError when the notebook is run top to bottom.
# "items" is declared as a string because the raw rows mix delimited strings,
# a Python list and None; an ArrayType column would not accept the plain-string
# rows as they are.
# NOTE(review): the column names are inferred from the tuple layout of
# raw_orders (order id, user id, items, amount) - confirm before relying on them.
schema=StructType([
    StructField("order_id",StringType(),nullable=False),
    StructField("user_id",StringType(),nullable=True),
    StructField("items",StringType(),nullable=True),
    StructField("amount",IntegerType(),nullable=True)
])

In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import sum
# Obtain the Spark session for the window-function cells below; getOrCreate
# hands back the existing session when one is already running.
spark = SparkSession.builder.getOrCreate()
# Daily amounts per region, one tuple per (region, date, amount).
data = [
    ("North", "2024-01-01", 100),
    ("North", "2024-01-02", 200),
    ("North", "2024-01-03", 300),
    ("South", "2024-01-01", 50),
    ("South", "2024-01-02", 150),
]
# Column names, in the same order as the tuple fields above.
columns = ["region", "date", "amount"]
# Build the frame from the tuples, naming the columns from `columns`
# (column types are inferred from the values), and preview it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+------+----------+------+
|region|      date|amount|
+------+----------+------+
| North|2024-01-01|   100|
| North|2024-01-02|   200|
| North|2024-01-03|   300|
| South|2024-01-01|    50|
| South|2024-01-02|   150|
+------+----------+------+



In [29]:
# Collapse to one row per region holding the summed amount.
rows_by_region = df.groupBy("region")
rows_by_region.sum("amount").show()

+------+-----------+
|region|sum(amount)|
+------+-----------+
| North|        600|
| South|        200|
+------+-----------+



In [30]:
# Window covering all rows that share the same region (no ordering defined).
window_spec = Window.partitionBy("region")

In [32]:
# Attach each region's total amount to every row of that region, keeping all
# original rows (unlike the groupBy cell, which collapses them).
# The aggregate is reached through the `F` namespace so this cell does not rely
# on a bare `sum` name, which is Python's builtin unless it has been replaced
# by pyspark.sql.functions.sum earlier in the session.
from pyspark.sql import functions as F

df.withColumn(
    "region_total",
    F.sum("amount").over(window_spec)
).show()

+------+----------+------+------------+
|region|      date|amount|region_total|
+------+----------+------+------------+
| North|2024-01-01|   100|         600|
| North|2024-01-02|   200|         600|
| North|2024-01-03|   300|         600|
| South|2024-01-01|    50|         200|
| South|2024-01-02|   150|         200|
+------+----------+------+------------+

