In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Reuse the already-running session if there is one; otherwise start a new
# session registered under the application name 'PySparkLearning'.
spark = (
    SparkSession.builder
    .appName('PySparkLearning')
    .getOrCreate()
)

In [2]:
# Sample rows, one tuple per person, laid out as
#   ((first_name, middle_name, last_name), date_of_birth, gender, salary)
# An empty string stands in for a name part that is not provided.
# NOTE(review): the salary of -1 on the last row looks like a placeholder — confirm.
dataDF = [
    (('James', '', 'Smith'), '1991-04-01', 'M', 3000),
    (('Michael', 'Rose', ''), '2000-05-19', 'M', 4000),
    (('Robert', '', 'Williams'), '1978-09-05', 'M', 4000),
    (('Maria', 'Anne', 'Jones'), '1967-12-01', 'F', 4000),
    (('Jen', 'Mary', 'Brown'), '1980-02-17', 'F', -1),
]

In [3]:
# Schema for the sample rows: `name` is a nested struct of three string
# fields; the remaining columns are flat. Date of birth is kept as a string.
schema = StructType([
    StructField(
        'name',
        StructType([
            StructField(name_part, StringType())
            for name_part in ('first_name', 'middle_name', 'last_name')
        ]),
    ),
    StructField('date_of_birth', StringType()),
    StructField('gender', StringType()),
    StructField('salary', IntegerType()),
])

In [4]:
# Build the base DataFrame from the sample rows using the explicit schema,
# then display its contents followed by its (nested) schema.
df = spark.createDataFrame(dataDF, schema=schema)

df.show()
df.printSchema()

+--------------------+-------------+------+------+
|                name|date_of_birth|gender|salary|
+--------------------+-------------+------+------+
|    [James, , Smith]|   1991-04-01|     M|  3000|
|   [Michael, Rose, ]|   2000-05-19|     M|  4000|
|[Robert, , Williams]|   1978-09-05|     M|  4000|
|[Maria, Anne, Jones]|   1967-12-01|     F|  4000|
|  [Jen, Mary, Brown]|   1980-02-17|     F|    -1|
+--------------------+-------------+------+------+

root
 |-- name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- middle_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- date_of_birth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



### PySpark withColumnRenamed – To rename single and multiple DataFrame columns

`withColumnRenamed(existingName, newName)` - Returns a new DataFrame in which the column `existingName` is renamed to `newName`; the original DataFrame is left unchanged.

In [5]:
# Rename a single column: date_of_birth -> dob.
# `df` itself is not modified; the renamed result is bound to `df1`.
df1 = df.withColumnRenamed(existing='date_of_birth', new='dob')

df1.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- middle_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [6]:
# Rename several columns by chaining one withColumnRenamed call per column:
#   date_of_birth -> birth_date
#   gender        -> sex
df2 = (
    df.withColumnRenamed('date_of_birth', 'birth_date')
      .withColumnRenamed('gender', 'sex')
)

df2.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- middle_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- birth_date: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- salary: integer (nullable = true)



### To rename a nested column in Dataframe

#### Using "Select" -

In [7]:
from pyspark.sql.functions import col

# Pull each field out of the `name` struct under a new top-level alias and
# carry the flat columns over by name. The result has no nested column.
df3 = df.select(
    col("name.first_name").alias("fname"),
    col("name.middle_name").alias("mname"),
    col("name.last_name").alias("lname"),
    "date_of_birth",
    "gender",
    "salary",
)

df3.printSchema()

root
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- date_of_birth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



#### Using PySpark "StructType" – 
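Renaming a field inside a nested column is not straightforward. One way to do it is to define a new `StructType` that has the desired field names and then `cast` the struct column to that type, as shown below. The cast matches fields by position, so the new schema must list the fields in the same order as the original struct.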

In [8]:
# Replacement struct type for the `name` column: three string fields,
# carrying the shorter names fname / mname / lname.
new_schema = StructType([
    StructField(short_name, StringType())
    for short_name in ('fname', 'mname', 'lname')
])

In [9]:
# Cast the `name` struct to `new_schema` so its fields take the new names;
# the other columns are selected unchanged.
df4 = df.select(
    col("name").cast(new_schema),
    "date_of_birth",
    "gender",
    "salary",
)

df4.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- mname: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- date_of_birth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



#### Using PySpark DataFrame "withColumn" -

To rename nested fields with `withColumn`, create a new top-level column from each nested field and then drop the original struct column. Note that this flattens the struct, and the new columns are appended after the existing ones.

In [10]:
# Copy each nested name field into a new top-level column, then drop the
# original `name` struct. The new columns end up after the existing ones.
df5 = (
    df.withColumn("fname", col("name.first_name"))
      .withColumn("mname", col("name.middle_name"))
      .withColumn("lname", col("name.last_name"))
      .drop("name")
)

df5.printSchema()

root
 |-- date_of_birth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)



#### Using toDF() – To change all columns in a PySpark DataFrame

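When the data has a flat structure (no nested columns), use `toDF()` with a list of new column names, one per existing column and in the same order, to rename all columns at once. On a DataFrame with nested columns, as in the example below, only the top-level columns are renamed; the fields inside a struct keep their original names.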

In [12]:
# One replacement name per top-level column of `df`, in column order.
newColumns = [
    "newCol-1",
    "newCol-2",
    "newCol-3",
    "newCol-4",
]

# toDF returns a new DataFrame carrying the new top-level names; print its schema.
df.toDF(*newColumns).printSchema()

root
 |-- newCol-1: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- middle_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- newCol-2: string (nullable = true)
 |-- newCol-3: string (nullable = true)
 |-- newCol-4: integer (nullable = true)

