<a href="https://colab.research.google.com/github/ArulrajGopal/Spark-Guide/blob/main/GogCol/13_Structs_withCols.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

from pyspark.sql import SparkSession

#create spark session
spark= SparkSession.builder.appName('mysparksession').getOrCreate()

#create spark context
sc = spark.sparkContext

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=ead0b6e5b18667b214867265800ffb5a890d4ab0a344ca2048791af512159f8c
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.functions import *

In [3]:
# Sample rows, one tuple per person:
#   ((firstname, middlename, lastname), dob, gender, salary)
# The leading nested tuple is what the schema maps onto the `name` struct.
# Missing name parts are stored as empty strings.
dataDF = [
    (("James", "", "Smith"), "1991-04-01", "M", 3000),
    (("Michael", "Rose", ""), "2000-05-19", "M", 4000),
    (("Robert", "", "Williams"), "1978-09-05", "M", 4000),
    (("Maria", "Anne", "Jones"), "1967-12-01", "F", 4000),
    (("Jen", "Mary", "Brown"), "1980-02-17", "F", -1),
]

In [4]:
# Nested struct describing the three parts of a person's name.
name_schema = (
    StructType()
    .add('firstname', StringType(), True)
    .add('middlename', StringType(), True)
    .add('lastname', StringType(), True)
)

# Top-level schema: the `name` struct followed by three flat columns.
# Every field is nullable.
schema = (
    StructType()
    .add('name', name_schema)
    .add('dob', StringType(), True)
    .add('gender', StringType(), True)
    .add('salary', IntegerType(), True)
)

# Build the DataFrame from the in-memory rows and print its column tree.
df = spark.createDataFrame(dataDF, schema)
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [5]:
# Preview the rows; the arguments below are show()'s defaults, spelled out.
df.show(n=20, truncate=True)

+--------------------+----------+------+------+
|                name|       dob|gender|salary|
+--------------------+----------+------+------+
|    {James, , Smith}|1991-04-01|     M|  3000|
|   {Michael, Rose, }|2000-05-19|     M|  4000|
|{Robert, , Williams}|1978-09-05|     M|  4000|
|{Maria, Anne, Jones}|1967-12-01|     F|  4000|
|  {Jen, Mary, Brown}|1980-02-17|     F|    -1|
+--------------------+----------+------+------+



In [6]:
# Example 1: rename the top-level `dob` column to `DateOfBirth`.
# The result is a new DataFrame; `df` is not reassigned here.
renamed_df = df.withColumnRenamed("dob", "DateOfBirth")
renamed_df.show()

+--------------------+-----------+------+------+
|                name|DateOfBirth|gender|salary|
+--------------------+-----------+------+------+
|    {James, , Smith}| 1991-04-01|     M|  3000|
|   {Michael, Rose, }| 2000-05-19|     M|  4000|
|{Robert, , Williams}| 1978-09-05|     M|  4000|
|{Maria, Anne, Jones}| 1967-12-01|     F|  4000|
|  {Jen, Mary, Brown}| 1980-02-17|     F|    -1|
+--------------------+-----------+------+------+



In [7]:
# Flatten the nested `name` struct into top-level columns, then drop the
# struct itself. All three columns are added with one `withColumns` call
# (a single projection) instead of three chained `withColumn` calls; the new
# columns are appended in the order given: fname, mname, lname.
(
    df.withColumns(
        {
            "fname": col("name.firstname"),
            "mname": col("name.middlename"),
            "lname": col("name.lastname"),
        }
    )
    .drop("name")
    .show()
)

+----------+------+------+-------+-----+--------+
|       dob|gender|salary|  fname|mname|   lname|
+----------+------+------+-------+-----+--------+
|1991-04-01|     M|  3000|  James|     |   Smith|
|2000-05-19|     M|  4000|Michael| Rose|        |
|1978-09-05|     M|  4000| Robert|     |Williams|
|1967-12-01|     F|  4000|  Maria| Anne|   Jones|
|1980-02-17|     F|    -1|    Jen| Mary|   Brown|
+----------+------+------+-------+-----+--------+



In [9]:
# Append the current timestamp and rescale salary. Because "salary" already
# exists, the second withColumn replaces that column in place rather than
# appending a new one.
(
    df.withColumn("timestamp_column", current_timestamp())
    # Refer to the column by its declared lowercase name so the lookup does
    # not depend on Spark's default case-insensitive column resolution.
    # Dividing the integer column by 10 yields fractional values
    # (e.g. 3000 -> 300.0).
    .withColumn("salary", col("salary") / 10)
    .show(truncate=False)
)

+--------------------+----------+------+------+--------------------------+
|name                |dob       |gender|salary|timestamp_column          |
+--------------------+----------+------+------+--------------------------+
|{James, , Smith}    |1991-04-01|M     |300.0 |2024-02-01 06:05:39.411185|
|{Michael, Rose, }   |2000-05-19|M     |400.0 |2024-02-01 06:05:39.411185|
|{Robert, , Williams}|1978-09-05|M     |400.0 |2024-02-01 06:05:39.411185|
|{Maria, Anne, Jones}|1967-12-01|F     |400.0 |2024-02-01 06:05:39.411185|
|{Jen, Mary, Brown}  |1980-02-17|F     |-0.1  |2024-02-01 06:05:39.411185|
+--------------------+----------+------+------+--------------------------+

