#### How to handle the `|` delimiter inside a column

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType
from pyspark.sql.functions import *

# Start (or reuse) the Spark session backing this notebook
spark = (
    SparkSession.builder
    .appName("Create DataFrame and Insert Data")
    .getOrCreate()
)

# Column layout, declared as (name, type, nullable) and expanded into StructFields.
# Marks is a plain string that carries several values separated by "|".
schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in [
        ("ID", IntegerType(), False),
        ("Name", StringType(), True),
        ("Age", IntegerType(), True),
        ("Marks", StringType(), True),
    ]
])

# Sample rows: (ID, Name, Age, pipe-delimited Marks)
data = [
    (1, "Arabinda", 23, "32|49|39"),
    (2, "Shyam", 34, "32|90|31"),
    (3, "Raghu", 42, "30|98|43"),
    (4, "John", 27, "43|87|56"),
    (5, "Su", 29, "65|76|29"),
    (6, "Manderic", 36, "89|45|90"),
]

# Build the DataFrame from the rows and the explicit schema.
# Marks is intentionally left as the raw delimited string here; the cells
# below demonstrate different ways of splitting it into separate columns.
df = spark.createDataFrame(data=data, schema=schema)

# Display the raw input
df.show()


+---+--------+---+--------+
| ID|    Name|Age|   Marks|
+---+--------+---+--------+
|  1|Arabinda| 23|32|49|39|
|  2|   Shyam| 34|32|90|31|
|  3|   Raghu| 42|30|98|43|
|  4|    John| 27|43|87|56|
|  5|      Su| 29|65|76|29|
|  6|Manderic| 36|89|45|90|
+---+--------+---+--------+



#### Approach-1


In [0]:
# Split the pipe-delimited Marks string once and reuse the resulting array column.
# split() treats its pattern as a regular expression, so the pipe has to be escaped;
# the raw string keeps the backslash literal and avoids Python's
# invalid-escape-sequence warning that a plain "\|" literal triggers.
marks_parts = split(df["Marks"], r"\|")

# Pick each subject by its position in the array, then drop the original column.
# No cast is applied, so the three new columns hold string values.
df_req1 = (
    df.withColumn("Phy", marks_parts[0])
    .withColumn("Chem", marks_parts[1])
    .withColumn("Math", marks_parts[2])
    .drop(col("Marks"))
)
df_req1.show()

+---+--------+---+---+----+----+
| ID|    Name|Age|Phy|Chem|Math|
+---+--------+---+---+----+----+
|  1|Arabinda| 23| 32|  49|  39|
|  2|   Shyam| 34| 32|  90|  31|
|  3|   Raghu| 42| 30|  98|  43|
|  4|    John| 27| 43|  87|  56|
|  5|      Su| 29| 65|  76|  29|
|  6|Manderic| 36| 89|  45|  90|
+---+--------+---+---+----+----+



#### Dynamically create the columns

In [0]:
# Split the Marks column into an array of strings.
# split() takes a regular expression, so the pipe is escaped; the raw string keeps
# the backslash literal and avoids Python's invalid-escape-sequence warning for "\|".
df_split = df.withColumn("Marks", split(df["Marks"], r"\|"))

# Determine the number of subjects from the array in the first row
num_subjects = len(df_split.select("Marks").take(1)[0][0])

# Target column names, one per position in the split array
column_list = ["Phy", "Chem", "Math"]

# Sanity check: the name list must line up with the number of split values,
# otherwise marks would be dropped or columns would be filled from missing positions.
assert len(column_list) == num_subjects, (
    f"Expected {len(column_list)} marks per row but the first row has {num_subjects}"
)

# Build one integer column expression per subject and add them all in a single
# select(); this avoids stacking one projection per withColumn() call in a loop.
subject_columns = [
    df_split["Marks"].getItem(i).cast(IntegerType()).alias(col_name)
    for i, col_name in enumerate(column_list)
]
df_split = df_split.select("*", *subject_columns)

# Drop the original Marks column
df_final = df_split.drop("Marks")

# Show the resulting DataFrame
df_final.show()


+---+--------+---+---+----+----+
| ID|    Name|Age|Phy|Chem|Math|
+---+--------+---+---+----+----+
|  1|Arabinda| 23| 32|  49|  39|
|  2|   Shyam| 34| 32|  90|  31|
|  3|   Raghu| 42| 30|  98|  43|
|  4|    John| 27| 43|  87|  56|
|  5|      Su| 29| 65|  76|  29|
|  6|Manderic| 36| 89|  45|  90|
+---+--------+---+---+----+----+



Let's break down the expression `df_split.select("Marks").take(1)[0][0]`:

df_split.select("Marks"): This part of the line selects the column named "Marks" from the DataFrame df_split. It returns a new DataFrame containing only the selected column.
.take(1): This part of the line retrieves the first row from the DataFrame. The take() function is used to return a list of rows, and take(1) is used here to retrieve only the first row.

[0]: This part of the line accesses the first element of the list returned by take(1). Since take(1) returns a list containing only one element (the first row), accessing [0] retrieves that first element.

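[0] (second occurrence): This part of the line accesses the first (and only) column value from the first row of the DataFrame. Since the previous step [0] returned the first row, accessing [0] again retrieves the value of the first column in that row. Here that value is the list produced by the split (e.g. ['32', '49', '39']), so wrapping the whole expression in len(...) gives the number of subjects.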