In [0]:
#importing library 
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import *

In [0]:
# Create a Spark session and DataFrame
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples').getOrCreate()
'''
df = spark.createDataFrame(data = dataDF, schema = schema)
df.printSchema()
'''


Out[24]: '\ndf = spark.createDataFrame(data = dataDF, schema = schema)\ndf.printSchema()\n'

In [0]:
# Define the schema with nested structure

schema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True)
    ])),
    StructField('dob', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True)
])

In [0]:
# Example Data
dataDF = [
    (('James', '', 'Smith'), '1991-04-01', 'M', 3000),
    (('Michael', 'Rose', ''), '2000-05-19', 'M', 4000),
    (('Robert', '', 'Williams'), '1978-09-05', 'M', 4000),
    (('Maria', 'Anne', 'Jones'), '1967-12-01', 'F', 4000),
    (('Jen', 'Mary', 'Brown'), '1980-02-17', 'F', -1)
]

In [0]:
 #DataFrame
df = spark.createDataFrame(data = dataDF, schema = schema)
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [0]:
df.show()
'''
The df.show() method in PySpark displays the content of a DataFrame in a tabular format. By default, it shows up to 20 rows of the DataFrame and truncates strings longer than 20 characters
'''

+--------------------+----------+------+------+
|                name|       dob|gender|salary|
+--------------------+----------+------+------+
|    {James, , Smith}|1991-04-01|     M|  3000|
|   {Michael, Rose, }|2000-05-19|     M|  4000|
|{Robert, , Williams}|1978-09-05|     M|  4000|
|{Maria, Anne, Jones}|1967-12-01|     F|  4000|
|  {Jen, Mary, Brown}|1980-02-17|     F|    -1|
+--------------------+----------+------+------+

Out[28]: '\nThe df.show() method in PySpark displays the content of a DataFrame in a tabular format. By default, it shows up to 20 rows of the DataFrame and truncates strings longer than 20 characters\n'

In [0]:
#usecase1. PySpark withColumnRenamed – To rename a DataFrame column name
df.withColumnRenamed("dob", "DateOfBirth").printSchema()
df.show(truncate=True)#byfalut show 20 charts


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+----------+------+------+
|                name|       dob|gender|salary|
+--------------------+----------+------+------+
|    {James, , Smith}|1991-04-01|     M|  3000|
|   {Michael, Rose, }|2000-05-19|     M|  4000|
|{Robert, , Williams}|1978-09-05|     M|  4000|
|{Maria, Anne, Jones}|1967-12-01|     F|  4000|
|  {Jen, Mary, Brown}|1980-02-17|     F|    -1|
+--------------------+----------+------+------+



In [0]:
# UseCase2. PySpark withColumnRenamed – To rename multiple columns
df2 = df.withColumnRenamed("dob", "DateOfBirth") \
    .withColumnRenamed("salary", "salary_amount")
df2.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_amount: integer (nullable = true)



In [0]:
# UseCase3 Using PySpark StructType – To rename a nested column in Dataframe
Changing a column name on nested data is not straight forward and we can do this by creating a new schema with new DataFrame columns using StructType and use it using cast function as shown below.

In [0]:
# UseCase3. Using PySpark StructType – To rename a nested column in DataFrame
#from pyspark.sql.functions import col

# Define the new schema for the nested structure
schema2 = StructType([
    StructField("fname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lname", StringType(), True)
])

# Select columns, casting the nested 'name' field to the new schema
df.select(col("name").cast(schema2), col("dob"), col("gender"), col("salary")).printSchema()

''' df.select(col("name").cast(schema2), \
     col("dob"), col("gender"),col("salary")) \
   .printSchema()  '''


root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

Out[32]: ' df.select(col("name").cast(schema2),      col("dob"), col("gender"),col("salary"))    .printSchema()  '

In [0]:
# UseCase4. Using Select – To rename nested elements
df.select(col("name.firstname").alias("fname"), 
          col("name.middlename").alias("mname"), 
          col("name.lastname").alias("lname"), 
          col("dob"), col("gender"), col("salary")).printSchema()

root
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [0]:
# UseCase5. Using PySpark DataFrame withColumn – To rename nested columns
When you have nested columns on PySpark DatFrame and if you want to rename it, use withColumn on a data frame object to create a new column from an existing and we will need to drop the existing column. Below example creates a “fname” column from “name.firstname” and drops the “name” column

In [0]:
# 5. Using PySpark DataFrame withColumn – To rename nested columns
df4 = df.withColumn("fname", col("name.firstname")) \
        .withColumn("mname", col("name.middlename")) \
        .withColumn("lname", col("name.lastname")) \
        .drop("name")
df4.printSchema()
df4.show()

root
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)

+----------+------+------+-------+-----+--------+
|       dob|gender|salary|  fname|mname|   lname|
+----------+------+------+-------+-----+--------+
|1991-04-01|     M|  3000|  James|     |   Smith|
|2000-05-19|     M|  4000|Michael| Rose|        |
|1978-09-05|     M|  4000| Robert|     |Williams|
|1967-12-01|     F|  4000|  Maria| Anne|   Jones|
|1980-02-17|     F|    -1|    Jen| Mary|   Brown|
+----------+------+------+-------+-----+--------+



In [0]:
# 6. Using col() function – To Dynamically rename all or multiple columns
newColumns = [col("name.firstname").alias("fname"),
              col("name.middlename").alias("mname"),
              col("name.lastname").alias("lname"),
              col("dob").alias("DateOfBirth"),
              col("gender").alias("sex"),
              col("salary").alias("income")]
df6 = df.select(*newColumns)
df6.printSchema()
df4.show()

root
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- income: integer (nullable = true)

+----------+------+------+-------+-----+--------+
|       dob|gender|salary|  fname|mname|   lname|
+----------+------+------+-------+-----+--------+
|1991-04-01|     M|  3000|  James|     |   Smith|
|2000-05-19|     M|  4000|Michael| Rose|        |
|1978-09-05|     M|  4000| Robert|     |Williams|
|1967-12-01|     F|  4000|  Maria| Anne|   Jones|
|1980-02-17|     F|    -1|    Jen| Mary|   Brown|
+----------+------+------+-------+-----+--------+



In [0]:
# Note
# in usecase5To rename nested columns 

we used two method as we see in usecasecase5 reate new df then yse df.withColumn("new_column", col_expression) then drop existing column.

# in usecase 6. Using col() function – To Dynamically rename all or multiple columns

Each element of the newColumns list is a column expression created using col() function to reference specific columns from the DataFrame, with an alias assigned using the alias() method. This allows you to rename the columns while selecting them or creating new ones




In [0]:
#usecase7. Using toDF() – To change all columns in a PySpark DataFrame
'''
When we have data in a flat structure (without nested) , use toDF() with a new schema to change all column names.
'''
#code
newColumns = ["newCol1","newCol2","newCol3","newCol4"]
df.toDF(*newColumns).printSchema()
df.show()

root
 |-- newCol1: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- newCol2: string (nullable = true)
 |-- newCol3: string (nullable = true)
 |-- newCol4: integer (nullable = true)

+--------------------+----------+------+------+
|                name|       dob|gender|salary|
+--------------------+----------+------+------+
|    {James, , Smith}|1991-04-01|     M|  3000|
|   {Michael, Rose, }|2000-05-19|     M|  4000|
|{Robert, , Williams}|1978-09-05|     M|  4000|
|{Maria, Anne, Jones}|1967-12-01|     F|  4000|
|  {Jen, Mary, Brown}|1980-02-17|     F|    -1|
+--------------------+----------+------+------+



In [0]:
# Example Data
dataDF = [
    (('James', '', 'Smith'), '1991-04-01', 'M', 3000),
    (('Michael', 'Rose', ''), '2000-05-19', 'M', 4000),
    (('Robert', '', 'Williams'), '1978-09-05', 'M', 4000),
    (('Maria', 'Anne', 'Jones'), '1967-12-01', 'F', 4000),
    (('Jen', 'Mary', 'Brown'), '1980-02-17', 'F', -1)
]

# Define the schema with nested structure
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
schema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True)
    ])),
    StructField('dob', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True)
])

# Create a Spark session and DataFrame
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
df = spark.createDataFrame(data = dataDF, schema = schema)
df.printSchema()

# 1. PySpark withColumnRenamed – To rename a DataFrame column name
df.withColumnRenamed("dob", "DateOfBirth").printSchema()

# 2. PySpark withColumnRenamed – To rename multiple columns
df2 = df.withColumnRenamed("dob", "DateOfBirth") \
    .withColumnRenamed("salary", "salary_amount")
df2.printSchema()

# 3. Using PySpark StructType – To rename a nested column in DataFrame
from pyspark.sql.functions import col
schema2 = StructType([
    StructField("fname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lname", StringType(), True)
])

df.select(col("name").cast(schema2), col("dob"), col("gender"), col("salary")).printSchema()

# 4. Using Select – To rename nested elements
df.select(col("name.firstname").alias("fname"), 
          col("name.middlename").alias("mname"), 
          col("name.lastname").alias("lname"), 
          col("dob"), col("gender"), col("salary")).printSchema()

# 5. Using PySpark DataFrame withColumn – To rename nested columns
df4 = df.withColumn("fname", col("name.firstname")) \
        .withColumn("mname", col("name.middlename")) \
        .withColumn("lname", col("name.lastname")) \
        .drop("name")
df4.printSchema()

# 6. Using col() function – To Dynamically rename all or multiple columns
newColumns = [col("name.firstname").alias("fname"),
              col("name.middlename").alias("mname"),
              col("name.lastname").alias("lname"),
              col("dob").alias("DateOfBirth"),
              col("gender").alias("sex"),
              col("salary").alias("income")]
df6 = df.select(*newColumns)
df6.printSchema()

# 7. Using toDF() – To change all columns in a PySpark DataFrame
newColumns = ["newCol1","newCol2","newCol3","newCol4"]
df.toDF(*newColumns).printSchema() #(*newColumns) error withoutbracket
df.show()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_amount: integer (nullable = true)

root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- middlena