# Como trabalhar com múltiplos delimitadores

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
spark = SparkSession.builder \
    .master("local") \
    .config("spark.sql.autoBroadcastJoinThreshold", -1) \
    .config("spark.executor.memory", "500mb") \
    .appName("Ex10") \
    .getOrCreate()

22/07/05 18:26:44 WARN Utils: Your hostname, computador resolves to a loopback address: 127.0.1.1; using 10.0.0.135 instead (on interface wlp2s0)
22/07/05 18:26:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/07/05 18:26:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Trabalhando com delimitadores duplos

In [4]:
df = spark.read.csv('./data/data.csv', header=True, inferSchema=True)

In [5]:
df.show()

+----------------+
|Id||Nome||Estado|
+----------------+
|    1||Pedro||SP|
|    2||Maria||RJ|
|      3||Ana||SP|
|     4||Joao||MG|
|    5||Lucas||SC|
+----------------+



### Problema: não há a separação dos elementos
### Solução: usar sep='||'

In [6]:
df = spark.read.csv('./data/data.csv', header=True, inferSchema=True, sep='||')

In [7]:
df.show()

+---+-----+------+
| Id| Nome|Estado|
+---+-----+------+
|  1|Pedro|    SP|
|  2|Maria|    RJ|
|  3|  Ana|    SP|
|  4| Joao|    MG|
|  5|Lucas|    SC|
+---+-----+------+



## Trabalhando com múltiplos delimitadores

In [8]:
df2 = spark.read.csv('./data/data_mult_del.csv', header=True, inferSchema=True)

In [9]:
df2.show()

+---+-----+------+-----------+
| Id| Nome|Estado|      Notas|
+---+-----+------+-----------+
|  1|Pedro|    SP|35|45|55|65|
|  2|Maria|    RJ|35|45|55|65|
|  3|  Ana|    SP|35|45|55|65|
|  4| Joao|    MG|35|45|55|65|
|  5|Lucas|    SC|35|45|55|65|
+---+-----+------+-----------+



### Problema: há a separação entre as colunas, mas não entre os elementos da coluna notas

In [12]:
df2 = df2.withColumn('Notas_separadas', split(col('Notas'), '[|]'))

In [13]:
df2.show()

+---+-----+------+-----------+----------------+
| Id| Nome|Estado|      Notas| Notas_separadas|
+---+-----+------+-----------+----------------+
|  1|Pedro|    SP|35|45|55|65|[35, 45, 55, 65]|
|  2|Maria|    RJ|35|45|55|65|[35, 45, 55, 65]|
|  3|  Ana|    SP|35|45|55|65|[35, 45, 55, 65]|
|  4| Joao|    MG|35|45|55|65|[35, 45, 55, 65]|
|  5|Lucas|    SC|35|45|55|65|[35, 45, 55, 65]|
+---+-----+------+-----------+----------------+



### Separando a lista em outras quatro colunas

In [15]:
df2 = df2.withColumn('Nota_!', col('Notas_separadas')[0]
        ).withColumn('Nota_2', col('Notas_separadas')[1]
        ).withColumn('Nota_3', col('Notas_separadas')[2]
        ).withColumn('Nota_4', col('Notas_separadas')[3])

In [17]:
df2.show()

+---+-----+------+-----------+----------------+------+------+------+------+
| Id| Nome|Estado|      Notas| Notas_separadas|Nota_!|Nota_2|Nota_3|Nota_4|
+---+-----+------+-----------+----------------+------+------+------+------+
|  1|Pedro|    SP|35|45|55|65|[35, 45, 55, 65]|    35|    45|    55|    65|
|  2|Maria|    RJ|35|45|55|65|[35, 45, 55, 65]|    35|    45|    55|    65|
|  3|  Ana|    SP|35|45|55|65|[35, 45, 55, 65]|    35|    45|    55|    65|
|  4| Joao|    MG|35|45|55|65|[35, 45, 55, 65]|    35|    45|    55|    65|
|  5|Lucas|    SC|35|45|55|65|[35, 45, 55, 65]|    35|    45|    55|    65|
+---+-----+------+-----------+----------------+------+------+------+------+



### Deleção das colunas Notas e Notas_separadas

In [20]:
df2 = df2.drop('Notas', 'Notas_separadas')

In [21]:
df2.show()

+---+-----+------+------+------+------+------+
| Id| Nome|Estado|Nota_!|Nota_2|Nota_3|Nota_4|
+---+-----+------+------+------+------+------+
|  1|Pedro|    SP|    35|    45|    55|    65|
|  2|Maria|    RJ|    35|    45|    55|    65|
|  3|  Ana|    SP|    35|    45|    55|    65|
|  4| Joao|    MG|    35|    45|    55|    65|
|  5|Lucas|    SC|    35|    45|    55|    65|
+---+-----+------+------+------+------+------+

