Transform in dataframe

In [1]:
from pyspark.sql import SparkSession
# Obtain a SparkSession for this notebook with the application name "transform";
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("transform")
    .getOrCreate()
)

In [2]:
# Three sample employee rows, each laid out as (name, salary, department).
data1 = [
    ("Alice", 3000, "HR"),
    ("Bob", 4000, "Finance"),
    ("Charlie", 5000, "IT"),
]

# Build the DataFrame, supplying the column names through the schema argument.
df = spark.createDataFrame(data1, schema=["name", "salary", "department"])

df.show()

+-------+------+----------+
|   name|salary|department|
+-------+------+----------+
|  Alice|  3000|        HR|
|    Bob|  4000|   Finance|
|Charlie|  5000|        IT|
+-------+------+----------+



In [5]:
from pyspark.sql.functions import upper

def convert_upper(df):
    """Return `df` with its 'name' column replaced by the upper-cased values.

    Takes a single DataFrame and returns a DataFrame, so it can be passed
    directly to `DataFrame.transform`.
    """
    upper_name = upper(df["name"])
    return df.withColumn("name", upper_name)

def double_salary(df):
    """Return `df` with its 'salary' column replaced by twice its value.

    Takes a single DataFrame and returns a DataFrame, so it can be passed
    directly to `DataFrame.transform`.
    """
    doubled = df["salary"] * 2
    return df.withColumn("salary", doubled)

# Chain the two helpers with DataFrame.transform: upper-case the names first,
# then double the salaries on the result.
df_transform = (
    df
    .transform(convert_upper)
    .transform(double_salary)
)

df_transform.show()



+-------+------+----------+
|   name|salary|department|
+-------+------+----------+
|  ALICE|  6000|        HR|
|    BOB|  8000|   Finance|
|CHARLIE| 10000|        IT|
+-------+------+----------+



TRANSFORMATION IN SQL FUNCTIONS: `pyspark.sql.functions.transform` only applies to array columns

In [6]:
# Each row pairs a name with an array of integers.
columns = ["name", "numbers"]
data = [
    ("John", [1, 2, 3]),
    ("Jane", [4, 5, 6]),
    ("Jake", [7, 8, 9]),
]

# Note: this rebinds `df`; the employee DataFrame built earlier in the notebook
# is no longer reachable through that name after this cell runs.
df = spark.createDataFrame(data, schema=columns)

df.show()

+----+---------+
|name|  numbers|
+----+---------+
|John|[1, 2, 3]|
|Jane|[4, 5, 6]|
|Jake|[7, 8, 9]|
+----+---------+



In [7]:
from pyspark.sql.functions import expr,transform

# Add a new array column via the SQL higher-order function `transform`,
# written as a SQL expression string: every element of `numbers` gets + 1.
df_transformed = df.withColumn(
    "numbers_transformed",
    expr("transform(numbers, x -> x + 1)"),
)

df_transformed.show()


+----+---------+-------------------+
|name|  numbers|numbers_transformed|
+----+---------+-------------------+
|John|[1, 2, 3]|          [2, 3, 4]|
|Jane|[4, 5, 6]|          [5, 6, 7]|
|Jake|[7, 8, 9]|         [8, 9, 10]|
+----+---------+-------------------+



In [8]:
# Same idea using the Python API: `transform` applies the lambda to each
# element of the `numbers` array; the result is exposed as `double_number`.
df_trans_double = df.select(
    "name",
    transform("numbers", lambda element: element * 2).alias("double_number"),
)
df_trans_double.show()

+----+-------------+
|name|double_number|
+----+-------------+
|John|    [2, 4, 6]|
|Jane|  [8, 10, 12]|
|Jake| [14, 16, 18]|
+----+-------------+

