In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by every cell below; getOrCreate() hands back
# an already-running session if there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName('Column ops')
    .getOrCreate()
)

In [2]:
# Column names for the sample rows, in the same order as each tuple's fields.
columns = ["user_id", "city", "amount"]

# Four hand-written sample records: (user_id, city, amount).
data = [
    ("U001", "Delhi", 450),
    ("U002", "Mumbai", 620),
    ("U003", "Bangalore", 300),
    ("U004", "Delhi", 700),
]

# Only names are supplied as the schema, so Spark infers each column's type
# from the Python values in `data`.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+---------+------+
|user_id|     city|amount|
+-------+---------+------+
|   U001|    Delhi|   450|
|   U002|   Mumbai|   620|
|   U003|Bangalore|   300|
|   U004|    Delhi|   700|
+-------+---------+------+



In [4]:
from pyspark.sql.functions import col
# Derived column: the base amount scaled by 1.18 (amount plus 18%).
# The product is an unrounded floating-point value, so some rows can display
# binary-float artefacts (e.g. 731.5999999999999 for 620 * 1.18).
amount_with_tax_expr = col("amount") * 1.18

df = df.withColumn("amount_with_tax", amount_with_tax_expr)
df.show()

+-------+---------+------+-----------------+
|user_id|     city|amount|  amount_with_tax|
+-------+---------+------+-----------------+
|   U001|    Delhi|   450|            531.0|
|   U002|   Mumbai|   620|731.5999999999999|
|   U003|Bangalore|   300|            354.0|
|   U004|    Delhi|   700|            826.0|
+-------+---------+------+-----------------+



In [5]:
# Rename the derived column; the values are untouched, only the label changes.
df = df.withColumnRenamed(
    existing="amount_with_tax",
    new="total_amount",
)
df.show()

+-------+---------+------+-----------------+
|user_id|     city|amount|     total_amount|
+-------+---------+------+-----------------+
|   U001|    Delhi|   450|            531.0|
|   U002|   Mumbai|   620|731.5999999999999|
|   U003|Bangalore|   300|            354.0|
|   U004|    Delhi|   700|            826.0|
+-------+---------+------+-----------------+



In [6]:
# Swap the literal value "Delhi" for "New Delhi". `subset` limits the
# replacement to the `city` column so no other column is considered.
df = df.replace(
    to_replace="Delhi",
    value="New Delhi",
    subset=["city"],
)
df.show()

+-------+---------+------+-----------------+
|user_id|     city|amount|     total_amount|
+-------+---------+------+-----------------+
|   U001|New Delhi|   450|            531.0|
|   U002|   Mumbai|   620|731.5999999999999|
|   U003|Bangalore|   300|            354.0|
|   U004|New Delhi|   700|            826.0|
+-------+---------+------+-----------------+



In [7]:
from pyspark.sql.functions import when
# Bucket each row by its base amount: "High" when amount >= 500, "Low" for
# every row that does not satisfy that condition.
is_high_amount = col("amount") >= 500
amount_category_expr = when(is_high_amount, "High").otherwise("Low")

df = df.withColumn("amount_category", amount_category_expr)
df.show()

+-------+---------+------+-----------------+---------------+
|user_id|     city|amount|     total_amount|amount_category|
+-------+---------+------+-----------------+---------------+
|   U001|New Delhi|   450|            531.0|            Low|
|   U002|   Mumbai|   620|731.5999999999999|           High|
|   U003|Bangalore|   300|            354.0|            Low|
|   U004|New Delhi|   700|            826.0|           High|
+-------+---------+------+-----------------+---------------+

