##### Fernando Amaral
##### Importação 

In [None]:
# Basic examples of importing data into Spark DataFrames (CSV, Parquet, JSON, ORC)

In [1]:
# findspark.init() must run BEFORE pyspark is imported: it locates the Spark
# installation and adds its Python libraries to sys.path, which is what makes
# the pyspark import below work in a plain Python/Jupyter environment.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName("importacao").getOrCreate()

In [2]:
# Read the CSV with an explicit schema, written as a DDL string
# ("column TYPE, ..."). The file has no header row, so the column
# names come from the schema itself.
arqschema = "id INT, nome STRING, status STRING, cidade STRING, vendas INT, data DATE"
despachantes = (
    spark.read
    .schema(arqschema)
    .option("header", False)
    .csv("despachantes.csv")
)
despachantes.show(5)

+---+-------------------+------+-------------+------+----------+
| id|               nome|status|       cidade|vendas|      data|
+---+-------------------+------+-------------+------+----------+
|  1|   Carminda Pestana| Ativo|  Santa Maria|    23|2020-08-11|
|  2|    Deolinda Vilela| Ativo|Novo Hamburgo|    34|2020-03-05|
|  3|   Emídio Dornelles| Ativo| Porto Alegre|    34|2020-02-05|
|  4|Felisbela Dornelles| Ativo| Porto Alegre|    36|2020-02-05|
|  5|     Graça Ornellas| Ativo| Porto Alegre|    12|2020-02-05|
+---+-------------------+------+-------------+------+----------+
only showing top 5 rows



In [3]:
# Display the schema of the DataFrame that was read with the explicit DDL schema.
despachantes.schema

StructType(List(StructField(id,IntegerType,true),StructField(nome,StringType,true),StructField(status,StringType,true),StructField(cidade,StringType,true),StructField(vendas,IntegerType,true),StructField(data,DateType,true)))

In [4]:
# Same CSV file, but this time Spark infers the column types (inferSchema=True).
# With no header row and no schema, the columns get the default names _c0, _c1, ...
desp_autoschema = (
    spark.read
    .format("csv")
    .options(header=False, inferSchema=True, sep=",")
    .load("despachantes.csv")
)
desp_autoschema.show(5)

+---+-------------------+-----+-------------+---+----------+
|_c0|                _c1|  _c2|          _c3|_c4|       _c5|
+---+-------------------+-----+-------------+---+----------+
|  1|   Carminda Pestana|Ativo|  Santa Maria| 23|2020-08-11|
|  2|    Deolinda Vilela|Ativo|Novo Hamburgo| 34|2020-03-05|
|  3|   Emídio Dornelles|Ativo| Porto Alegre| 34|2020-02-05|
|  4|Felisbela Dornelles|Ativo| Porto Alegre| 36|2020-02-05|
|  5|     Graça Ornellas|Ativo| Porto Alegre| 12|2020-02-05|
+---+-------------------+-----+-------------+---+----------+
only showing top 5 rows



In [5]:
# Display the inferred schema. In the recorded run the last column (_c5) was
# inferred as a string rather than a date, unlike the explicit-schema read.
desp_autoschema.schema

StructType(List(StructField(_c0,IntegerType,true),StructField(_c1,StringType,true),StructField(_c2,StringType,true),StructField(_c3,StringType,true),StructField(_c4,IntegerType,true),StructField(_c5,StringType,true)))

In [6]:
# Other formats: Parquet, loaded through the generic reader with an explicit format.
par = spark.read.load("despachantes.parquet", format="parquet")
par.show()

+---+-------------------+-----+-------------+---+----------+
|_c0|                _c1|  _c2|          _c3|_c4|       _c5|
+---+-------------------+-----+-------------+---+----------+
|  1|   Carminda Pestana|Ativo|  Santa Maria| 23|2020-08-11|
|  2|    Deolinda Vilela|Ativo|Novo Hamburgo| 34|2020-03-05|
|  3|   Emídio Dornelles|Ativo| Porto Alegre| 34|2020-02-05|
|  4|Felisbela Dornelles|Ativo| Porto Alegre| 36|2020-02-05|
|  5|     Graça Ornellas|Ativo| Porto Alegre| 12|2020-02-05|
|  6|   Matilde Rebouças|Ativo| Porto Alegre| 22|2019-01-05|
|  7|    Noêmia   Orriça|Ativo|  Santa Maria| 45|2019-10-05|
|  8|      Roque Vásquez|Ativo| Porto Alegre| 65|2020-03-05|
|  9|      Uriel Queiroz|Ativo| Porto Alegre| 54|2018-05-05|
| 10|   Viviana Sequeira|Ativo| Porto Alegre|  0|2020-09-05|
+---+-------------------+-----+-------------+---+----------+



In [7]:
# JSON, loaded through the generic reader with an explicit format.
js = spark.read.load("despachantes.json", format="json")
js.show()

+-------------+-----------+---+-------------------+------+------+
|       cidade|       data| id|               nome|status|vendas|
+-------------+-----------+---+-------------------+------+------+
|  Santa Maria| 2020-08-11|  1|   Carminda Pestana| Ativo|    23|
|Novo Hamburgo| 2020-03-05|  2|    Deolinda Vilela| Ativo|    34|
| Porto Alegre| 2020-02-05|  3|   Emídio Dornelles| Ativo|    34|
| Porto Alegre| 2020-02-05|  4|Felisbela Dornelles| Ativo|    36|
| Porto Alegre| 2020-02-05|  5|     Graça Ornellas| Ativo|    12|
| Porto Alegre| 2019-01-05|  6|   Matilde Rebouças| Ativo|    22|
|  Santa Maria| 2019-10-05|  7|    Noêmia   Orriça| Ativo|    45|
| Porto Alegre| 2020-03-05|  8|      Roque Vásquez| Ativo|    65|
| Porto Alegre| 2018-05-05|  9|      Uriel Queiroz| Ativo|    54|
| Porto Alegre| 2020-09-05| 10|   Viviana Sequeira| Ativo|     0|
+-------------+-----------+---+-------------------+------+------+



In [8]:
# ORC, loaded through the generic reader with an explicit format.
orc = spark.read.load("despachantes.orc", format="orc")
orc.show()

+---+-------------------+-----+-------------+---+----------+
|_c0|                _c1|  _c2|          _c3|_c4|       _c5|
+---+-------------------+-----+-------------+---+----------+
|  1|   Carminda Pestana|Ativo|  Santa Maria| 23|2020-08-11|
|  2|    Deolinda Vilela|Ativo|Novo Hamburgo| 34|2020-03-05|
|  3|   Emídio Dornelles|Ativo| Porto Alegre| 34|2020-02-05|
|  4|Felisbela Dornelles|Ativo| Porto Alegre| 36|2020-02-05|
|  5|     Graça Ornellas|Ativo| Porto Alegre| 12|2020-02-05|
|  6|   Matilde Rebouças|Ativo| Porto Alegre| 22|2019-01-05|
|  7|    Noêmia   Orriça|Ativo|  Santa Maria| 45|2019-10-05|
|  8|      Roque Vásquez|Ativo| Porto Alegre| 65|2020-03-05|
|  9|      Uriel Queiroz|Ativo| Porto Alegre| 54|2018-05-05|
| 10|   Viviana Sequeira|Ativo| Porto Alegre|  0|2020-09-05|
+---+-------------------+-----+-------------+---+----------+

