# PySpark `withColumnRenamed()` 
            by Aishwarya Raut

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Sample rows: ((firstname, middlename, lastname), dob, gender, salary).
dataDF = [
    (("James", "", "Smith"), "1991-04-01", "M", 3000),
    (("Michael", "Rose", ""), "2000-05-19", "M", 4000),
    (("Robert", "", "Williams"), "1978-09-05", "M", 4000),
    (("Maria", "Anne", "Jones"), "1967-12-01", "F", 4000),
    (("Jen", "Mary", "Brown"), "1980-02-17", "F", -1),
]

# Schema with a nested "name" struct followed by three flat columns.
schema = StructType(
    [
        StructField(
            "name",
            StructType(
                [
                    StructField(part, StringType(), True)
                    for part in ("firstname", "middlename", "lastname")
                ]
            ),
        ),
        StructField("dob", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("salary", IntegerType(), True),
    ]
)

# Reuse an existing session if there is one, otherwise start a new one.
spark = SparkSession.builder.appName("SP").getOrCreate()

df = spark.createDataFrame(data=dataDF, schema=schema)
df.printSchema()


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



# 1. PySpark withColumnRenamed – To rename DataFrame column name

**PySpark withColumnRenamed() Syntax:**

`withColumnRenamed(existingName, newName)`

In [3]:
# Rename the top-level "dob" column and print the resulting schema.
# The result is not assigned, so `df` itself keeps its original column names.
(
    df
    .withColumnRenamed("dob", "DateOfBirth")
    .printSchema()
)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



# 2. PySpark withColumnRenamed – To rename multiple columns

In [5]:
# Chain one withColumnRenamed call per column to rename several columns.
df2 = (
    df
    .withColumnRenamed("dob", "DateOfBirth")
    .withColumnRenamed("salary", "salary_amount")
)
df2.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_amount: integer (nullable = true)



In [10]:
# Rename several columns in a single call by passing an {old: new} mapping.
df2 = df.withColumnsRenamed(
    {
        "dob": "DateOfBirth",
        "salary": "salary_amount",
    }
)
df2.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_amount: integer (nullable = true)



# 3. Using PySpark StructType – To rename a nested column in a DataFrame

In [11]:
# Target layout for the nested "name" struct: three string fields whose
# names replace firstname / middlename / lastname.
schema2 = StructType(
    [
        StructField(field_name, StringType())
        for field_name in ("fname", "middlename", "lname")
    ]
)

In [13]:
from pyspark.sql.functions import col

# Cast the "name" struct to schema2 so its nested fields take schema2's
# field names; the remaining columns are selected unchanged.
df.select(
    col("name").cast(schema2),
    *(col(column_name) for column_name in ("dob", "gender", "salary"))
).printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



# 4. Using Select – To rename nested elements.

In [15]:
# Import only what this cell needs. A wildcard import of
# pyspark.sql.functions would shadow Python builtins such as sum, min, max
# and round for every later cell.
from pyspark.sql.functions import col

# Pull the nested fields out of the "name" struct into top-level columns,
# renaming firstname/lastname with alias(); the other columns pass through.
df.select(
    col("name.firstname").alias("fname"),
    col("name.lastname").alias("lname"),
    col("name.middlename"),
    col("dob"),
    col("gender"),
    col("salary"),
).printSchema()

root
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



# 5. Using PySpark DataFrame withColumn – To rename nested columns

When you have nested columns in a PySpark DataFrame and want to rename them, use `withColumn` on the DataFrame to create a new column from each existing nested field, and then drop the original column.

In [25]:
# Copy each nested field of "name" into a new top-level column, then drop
# the original struct column.
df4 = (
    df
    .withColumn("fname", col("name.firstname"))
    .withColumn("lname", col("name.lastname"))
    .withColumn("mname", col("name.middlename"))
    .drop("name")
)

# printSchema() only prints and returns None, so it is called separately;
# chaining it onto the assignment would leave df4 bound to None.
df4.printSchema()

root
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- mname: string (nullable = true)



# 6. Using toDF() – To change all columns in a PySpark DataFrame

In [26]:
# toDF is given one new name per top-level column of df, applied in order.
newColumns = [f"new_col{position}" for position in range(1, 5)]
df.toDF(*newColumns).printSchema()

root
 |-- new_col1: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- new_col2: string (nullable = true)
 |-- new_col3: string (nullable = true)
 |-- new_col4: integer (nullable = true)

