In [1]:
from pyspark.sql import SparkSession

spark = (SparkSession
 .builder
 .appName("Avro") 
 .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.1.2")
 .getOrCreate())

In [45]:
parquet_file = "../Datasets/parquet/2010-summary.parquet"
json_file = "../Datasets/json/*"
csv_file = "../Datasets/csv/*"
orc_file = "../Datasets/orc/*"
avro_file = "../Datasets/avro/*"
image_dir = "../Datasets/cctvVideos/train_images/"
schema = "DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count INT"

# CSV

In [12]:
df = spark\
        .read\
        .option("header", "true")\
        .option("inferSchema", "true")\
        .csv(csv_file) # pon aquí la ruta en tu bucket

In [22]:
df = spark\
        .read\
        .option("inferSchema", "true")\
        .csv(csv_file)

In [21]:
df.toPandas()

Unnamed: 0,_c0,_c1,_c2
0,DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
1,United States,Romania,1
2,United States,Ireland,264
3,United States,India,69
4,Egypt,United States,24
...,...,...,...
1503,United States,French Guiana,4
1504,United States,Saint Kitts and Nevis,123
1505,United States,Haiti,193
1506,"Bonaire, Sint Eustatius, and Saba",United States,62


# Parquet

In [18]:
df = spark\
        .read\
        .format("parquet")\
        .option("path", parquet_file)\
        .load()

In [23]:
df = spark\
        .read\
        .option("inferSchema", "true")\
        .parquet(parquet_file)

In [24]:
df.toPandas()

Unnamed: 0,DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
0,United States,Romania,1
1,United States,Ireland,264
2,United States,India,69
3,Egypt,United States,24
4,Equatorial Guinea,United States,1
...,...,...,...
250,United States,French Guiana,1
251,United States,Haiti,226
252,United States,Uganda,1
253,"Bonaire, Sint Eustatius, and Saba",United States,16


# JSON 

In [26]:
df = spark\
        .read\
        .format("json")\
        .option("path", json_file)\
        .load()

In [29]:
df = spark\
        .read\
        .option("inferSchema", "true")\
        .json(json_file)

In [31]:
df.toPandas()

Unnamed: 0,DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
0,United States,Romania,15
1,United States,Croatia,1
2,United States,Ireland,344
3,Egypt,United States,15
4,United States,India,62
...,...,...,...
1497,United States,French Guiana,4
1498,United States,Saint Kitts and Nevis,123
1499,United States,Haiti,193
1500,"Bonaire, Sint Eustatius, and Saba",United States,62


# ORC

In [58]:
df = spark\
        .read\
        .format("orc")\
        .option("inferSchema", "true")\
        .option("path", orc_file)\
        .load()

In [62]:
df = spark.read\
        .format("orc")\
        .option("inferSchema", "true")\
        .orc(orc_file)

In [63]:
df.toPandas()

Unnamed: 0,DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
0,United States,Romania,1
1,United States,Ireland,264
2,United States,India,69
3,Egypt,United States,24
4,Equatorial Guinea,United States,1
...,...,...,...
250,United States,French Guiana,1
251,United States,Haiti,226
252,United States,Uganda,1
253,"Bonaire, Sint Eustatius, and Saba",United States,16


# Avro

In [41]:
df = spark\
        .read\
        .format("avro")\
        .option("inferSchema", "true")\
        .option("path", avro_file)\
        .load()

In [42]:
df.toPandas()

Unnamed: 0,DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
0,United States,Romania,1
1,United States,Ireland,264
2,United States,India,69
3,Egypt,United States,24
4,Equatorial Guinea,United States,1
...,...,...,...
250,United States,French Guiana,1
251,United States,Haiti,226
252,United States,Uganda,1
253,"Bonaire, Sint Eustatius, and Saba",United States,16


# Image

In [47]:
from pyspark.ml import image


df = spark\
        .read\
        .format("image")\
        .load(image_dir)

df.printSchema()

df.select("image.height", "image.width", "image.nChannels", "image.mode", "label").show(5, truncate=False)

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: integer (nullable = true)

+------+-----+---------+----+-----+
|height|width|nChannels|mode|label|
+------+-----+---------+----+-----+
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |1    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
+------+-----+---------+----+-----+
only showing top 5 rows



In [48]:
df.toPandas()

Unnamed: 0,image,label
0,(file:///home/jose/Escritorio/Templates/Datase...,0
1,(file:///home/jose/Escritorio/Templates/Datase...,1
2,(file:///home/jose/Escritorio/Templates/Datase...,0
3,(file:///home/jose/Escritorio/Templates/Datase...,0
4,(file:///home/jose/Escritorio/Templates/Datase...,0
...,...,...
446,(file:///home/jose/Escritorio/Templates/Datase...,0
447,(file:///home/jose/Escritorio/Templates/Datase...,1
448,(file:///home/jose/Escritorio/Templates/Datase...,0
449,(file:///home/jose/Escritorio/Templates/Datase...,0


# Binary

In [53]:
path = "../Datasets/cctvVideos/train_images/"
df = spark\
        .read\
        .format("binaryFile")\
        .option("pathGlobFilter", "*.jpg")\
        .load(path)

df.show(5)

+--------------------+-------------------+------+--------------------+-----+
|                path|   modificationTime|length|             content|label|
+--------------------+-------------------+------+--------------------+-----+
|file:/home/jose/E...|2021-04-15 02:34:17| 55037|[FF D8 FF E0 00 1...|    0|
|file:/home/jose/E...|2021-04-15 02:34:17| 54634|[FF D8 FF E0 00 1...|    1|
|file:/home/jose/E...|2021-04-15 02:34:17| 54624|[FF D8 FF E0 00 1...|    0|
|file:/home/jose/E...|2021-04-15 02:34:17| 54505|[FF D8 FF E0 00 1...|    0|
|file:/home/jose/E...|2021-04-15 02:34:17| 54475|[FF D8 FF E0 00 1...|    0|
+--------------------+-------------------+------+--------------------+-----+
only showing top 5 rows



In [52]:
df.toPandas()

Unnamed: 0,path,modificationTime,length,content,label
0,file:/home/jose/Escritorio/Templates/Datasets/...,2021-04-15 02:34:17,55037,"[255, 216, 255, 224, 0, 16, 74, 70, 73, 70, 0,...",0
1,file:/home/jose/Escritorio/Templates/Datasets/...,2021-04-15 02:34:17,54634,"[255, 216, 255, 224, 0, 16, 74, 70, 73, 70, 0,...",1
2,file:/home/jose/Escritorio/Templates/Datasets/...,2021-04-15 02:34:17,54624,"[255, 216, 255, 224, 0, 16, 74, 70, 73, 70, 0,...",0
3,file:/home/jose/Escritorio/Templates/Datasets/...,2021-04-15 02:34:17,54505,"[255, 216, 255, 224, 0, 16, 74, 70, 73, 70, 0,...",0
4,file:/home/jose/Escritorio/Templates/Datasets/...,2021-04-15 02:34:17,54475,"[255, 216, 255, 224, 0, 16, 74, 70, 73, 70, 0,...",0
...,...,...,...,...,...
446,file:/home/jose/Escritorio/Templates/Datasets/...,2021-04-15 02:34:17,44534,"[255, 216, 255, 224, 0, 16, 74, 70, 73, 70, 0,...",0
447,file:/home/jose/Escritorio/Templates/Datasets/...,2021-04-15 02:34:17,44329,"[255, 216, 255, 224, 0, 16, 74, 70, 73, 70, 0,...",1
448,file:/home/jose/Escritorio/Templates/Datasets/...,2021-04-15 02:34:17,44277,"[255, 216, 255, 224, 0, 16, 74, 70, 73, 70, 0,...",0
449,file:/home/jose/Escritorio/Templates/Datasets/...,2021-04-15 02:34:17,44124,"[255, 216, 255, 224, 0, 16, 74, 70, 73, 70, 0,...",0


# SQL

In [25]:
!sql
CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl
    USING parquet
    OPTIONS (
      path "/databricks-datasets/definitive-guide/data/flight-data/parquet/2010-summary.parquet"
    )

SyntaxError: invalid syntax (<ipython-input-25-4a36e3e13be8>, line 2)