# Como trabalhar com arquivos CSV com número variável de colunas

In [23]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [4]:
# Build (or reuse) the notebook's SparkSession on a local master.
# The broadcast-join threshold is set to -1 and executor memory to "500mb".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Ex4")
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config("spark.executor.memory", "500mb")
    .getOrCreate()
)

22/07/03 21:26:54 WARN Utils: Your hostname, computador resolves to a loopback address: 127.0.1.1; using 10.0.0.135 instead (on interface wlp2s0)
22/07/03 21:26:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/07/03 21:26:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [72]:
# Reference load: read the CSV using its first line as the header row.
df = spark.read.option('header', True).csv('./data/data.csv')

In [73]:
# Print the DataFrame as a text table (defaults spelled out explicitly).
df.show(n=20, truncate=True)

+---+-----+---------+-----+----+
| Id| Nome|    Local|Idade|Sexo|
+---+-----+---------+-----+----+
|  1|Pedro|     null| null|null|
|  2| Joao|Sao Paulo| null|null|
|  3|  Ana|     null| null|null|
|  4|Maria|   Recife|   40|   f|
|  5|Lucas| Campinas|   22|null|
+---+-----+---------+-----+----+



In [77]:
# Load the header-less file as plain text, one row per line in a single
# 'value' column. Per the original author's note, reading it as CSV would
# only pick up the columns present in the first line.
df_without_columns = spark.read.format('text').load('./data/data_without_columns.csv')

In [78]:
# Print the raw text lines (defaults spelled out explicitly).
df_without_columns.show(n=20, truncate=True)

+-------------------+
|              value|
+-------------------+
|           1,Pedro,|
|  2,Joao,Sao Paulo,|
|             3,Ana,|
|4,Maria,Recife,40,f|
|5,Lucas,Campinas,22|
+-------------------+



In [79]:
# Split each raw text line on commas into an array column named
# 'splittable_col'. Only that column is kept; the original 'value'
# column is discarded.
df_without_columns = df_without_columns.select(
    split('value', ',').alias('splittable_col')
)

In [80]:
# Show the DataFrame's repr (column names and types) without running a job.
display(df_without_columns)

DataFrame[splittable_col: array<string>]

In [81]:
# Print the array column; with truncate=True long cells are cut short
# and end in "...".
df_without_columns.show(n=20, truncate=True)

+--------------------+
|      splittable_col|
+--------------------+
|        [1, Pedro, ]|
|[2, Joao, Sao Pau...|
|          [3, Ana, ]|
|[4, Maria, Recife...|
|[5, Lucas, Campin...|
+--------------------+



In [82]:
# Expand 'splittable_col' into one column per array position: col0, col1, ...
#
# `max` and `size` here are the pyspark.sql.functions versions brought in by
# the star import at the top of the notebook, not the Python builtins.
# The widest row decides how many columns are created. The aggregate is null
# for an empty DataFrame, so fall back to 0 instead of failing in range().
max_fields = df_without_columns.select(max(size('splittable_col'))).first()[0] or 0
new_names = ['col' + str(i) for i in range(max_fields)]

# Keep every existing column except ones about to be regenerated, so that
# re-running this cell replaces col0..colN instead of duplicating them.
kept_names = [name for name in df_without_columns.columns if name not in new_names]

# Build all columns in a single select() rather than calling withColumn()
# in a loop, which adds one projection per call and bloats the query plan.
# Positions past the end of a shorter array come out as null. A trailing
# delimiter yields an empty string, not null.
df_without_columns = df_without_columns.select(
    *kept_names,
    *[
        df_without_columns['splittable_col'][i].alias(name)
        for i, name in enumerate(new_names)
    ]
)

In [83]:
# Print the array column next to the columns extracted from it
# (defaults spelled out explicitly).
df_without_columns.show(n=20, truncate=True)

+--------------------+----+-----+---------+----+----+
|      splittable_col|col0| col1|     col2|col3|col4|
+--------------------+----+-----+---------+----+----+
|        [1, Pedro, ]|   1|Pedro|         |null|null|
|[2, Joao, Sao Pau...|   2| Joao|Sao Paulo|    |null|
|          [3, Ana, ]|   3|  Ana|         |null|null|
|[4, Maria, Recife...|   4|Maria|   Recife|  40|   f|
|[5, Lucas, Campin...|   5|Lucas| Campinas|  22|null|
+--------------------+----+-----+---------+----+----+



In [84]:
# Keep every column except the intermediate 'splittable_col' array.
df_final = df_without_columns.select(
    [name for name in df_without_columns.columns if name != 'splittable_col']
)

In [85]:
# Print the final table (defaults spelled out explicitly).
df_final.show(n=20, truncate=True)

+----+-----+---------+----+----+
|col0| col1|     col2|col3|col4|
+----+-----+---------+----+----+
|   1|Pedro|         |null|null|
|   2| Joao|Sao Paulo|    |null|
|   3|  Ana|         |null|null|
|   4|Maria|   Recife|  40|   f|
|   5|Lucas| Campinas|  22|null|
+----+-----+---------+----+----+

