<a href="https://colab.research.google.com/github/Amt15/Pyspark/blob/main/How_to_Variable_Number_of_Columns_in_CSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark -q
!pip install findspark -q

[K     |████████████████████████████████| 281.3 MB 38 kB/s 
[K     |████████████████████████████████| 199 kB 45.8 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
# Locate the Spark installation before anything from pyspark is imported.
import findspark
# NOTE(review): init() is expected to expose the located PySpark to this kernel
# (sys.path / SPARK_HOME) — confirm against the findspark documentation.
findspark.init()
# Last expression of the cell, so the resolved installation path is displayed as the cell output.
findspark.find()

'/usr/local/lib/python3.7/dist-packages/pyspark'

In [3]:
from pyspark.sql import SparkSession

# Create the session for this notebook, or reuse one that already exists in the kernel.
# "local[*]" runs Spark inside this process with one worker thread per logical core.
# Hive support is switched on here, although none of the cells shown below query Hive.
spark = (
    SparkSession.builder
    .appName("Variable Number of Columns in CSV")
    .master("local[*]")
    .enableHiveSupport()
    .getOrCreate()
)

In [5]:
# Baseline case: the file starts with a header row, so Spark takes the column names from it.
df = (
    spark.read
    .option("header", True)
    .csv("/content/dynamic_columns_with_schema.csv")
)
df.show()

+---+------+---------+----------------+----------+
| id|  name|      loc|         emailid|     phone|
+---+------+---------+----------------+----------+
|  1|  ravi|     null|            null|      null|
|  2|   ram|bangalore|            null|      null|
|  3|prasad|  chennai|sample@gmail.com|9283923456|
|  4|   sam|     pune|            null|      null|
|  5| shyam|    delhi| sample@yahoo.in|      null|
+---+------+---------+----------------+----------+



In [7]:
# No header and a varying number of fields per line, so read each line
# as one plain string (single column named "value") instead of parsing it as CSV.
df1 = spark.read.format("text").load("/content/dynamic_columns_without_schema.csv")
df1.show(truncate=False)

+--------------------------------------------+
|value                                       |
+--------------------------------------------+
|1,ravi                                      |
|2,ram,bangalore                             |
|3,prasad,chennai,sample@gmail.com,9283923456|
|4,sam,pune                                  |
|5,shyam,delhi,sample@yahoo.in               |
+--------------------------------------------+



In [9]:
from pyspark.sql.functions import split

# Turn every raw line into an array of its comma-separated fields,
# then discard the original string column.
# NOTE: this overwrites df1 and removes "value", so the cell cannot be re-run
# without first re-running the cell that reads the text file.
df1 = (
    df1
    .withColumn("splittable_col", split(df1["value"], ","))
    .drop("value")
)
df1.show(truncate=False)

+--------------------------------------------------+
|splittable_col                                    |
+--------------------------------------------------+
|[1, ravi]                                         |
|[2, ram, bangalore]                               |
|[3, prasad, chennai, sample@gmail.com, 9283923456]|
|[4, sam, pune]                                    |
|[5, shyam, delhi, sample@yahoo.in]                |
+--------------------------------------------------+



In [12]:
# To create the columns dynamically we first need to know how many fields each row holds.

# NOTE: importing `max` from pyspark shadows Python's built-in max() for the rest of the notebook.
from pyspark.sql.functions import size,max

# size() returns the number of elements in the array of each row (not a per-column size).
(
    df1
    .select("splittable_col", size("splittable_col"))
    .show(truncate=False)
)

+--------------------------------------------------+--------------------+
|splittable_col                                    |size(splittable_col)|
+--------------------------------------------------+--------------------+
|[1, ravi]                                         |2                   |
|[2, ram, bangalore]                               |3                   |
|[3, prasad, chennai, sample@gmail.com, 9283923456]|5                   |
|[4, sam, pune]                                    |3                   |
|[5, shyam, delhi, sample@yahoo.in]                |4                   |
+--------------------------------------------------+--------------------+



In [16]:
# The widest row decides how many columns are needed:
# aggregate the per-row array sizes down to their maximum and pull the single value out of the result.

(
    df1
    .select(max(size(df1["splittable_col"])))
    .collect()[0][0]
)


5

In [17]:
# Spread the array into one column per position: col0 .. col<N-1>,
# where N is the largest number of fields found in any row (5 here, so indexes 0 to 4).
# Positions that a shorter row does not have come out as null.

# On an empty DataFrame the aggregate yields None; treat that as "no columns to add"
# instead of letting range(None) raise a TypeError.
max_fields = df1.select(max(size("splittable_col"))).collect()[0][0] or 0

new_names = ["col" + str(i) for i in range(max_fields)]
# Keep every existing column except the ones about to be (re)generated,
# so re-running this cell replaces col0..colN-1 rather than duplicating them.
kept_cols = [c for c in df1.columns if c not in new_names]

# One select instead of withColumn inside a loop: every withColumn call adds
# another projection to the query plan, which grows with the number of columns.
df1 = df1.select(
    *kept_cols,
    *[df1["splittable_col"][i].alias(name) for i, name in enumerate(new_names)]
)

df1.show()

+--------------------+----+------+---------+----------------+----------+
|      splittable_col|col0|  col1|     col2|            col3|      col4|
+--------------------+----+------+---------+----------------+----------+
|           [1, ravi]|   1|  ravi|     null|            null|      null|
| [2, ram, bangalore]|   2|   ram|bangalore|            null|      null|
|[3, prasad, chenn...|   3|prasad|  chennai|sample@gmail.com|9283923456|
|      [4, sam, pune]|   4|   sam|     pune|            null|      null|
|[5, shyam, delhi,...|   5| shyam|    delhi| sample@yahoo.in|      null|
+--------------------+----+------+---------+----------------+----------+



In [18]:
# Final result: keep everything except the intermediate array column.
final_df = df1.select([name for name in df1.columns if name != "splittable_col"])

final_df.show(truncate=False)

+----+------+---------+----------------+----------+
|col0|col1  |col2     |col3            |col4      |
+----+------+---------+----------------+----------+
|1   |ravi  |null     |null            |null      |
|2   |ram   |bangalore|null            |null      |
|3   |prasad|chennai  |sample@gmail.com|9283923456|
|4   |sam   |pune     |null            |null      |
|5   |shyam |delhi    |sample@yahoo.in |null      |
+----+------+---------+----------------+----------+

