In [1]:
%%init_spark
# Ask the Spark launcher to fetch the external spark-avro module; a later cell reads
# data with format("avro"). NOTE(review): the artifact's Scala (2.12) and Spark (3.1.2)
# versions presumably need to match the running Spark build — confirm.
launcher.packages = ["org.apache.spark:spark-avro_2.12:3.1.2"]

In [23]:
// Input locations used by the cells below, one per data source format.
// A trailing "*" is a glob that selects every file in the directory.
val parquet_file: String = "../Datasets/parquet/2010-summary.parquet"
val json_file: String = "../Datasets/json/*"
val csv_file: String = "../Datasets/csv/*"
val orc_file: String = "../Datasets/orc/*"
val avro_file: String = "../Datasets/avro/*"

// Directory of images read by the image cell.
val image_dir: String = "../Datasets/cctvVideos/train_images/"

// DDL-formatted column list for the flight-summary data (name and type per column).
val schema: String = "DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count INT"

parquet_file: String = ../Datasets/parquet/2010-summary.parquet
json_file: String = ../Datasets/json/*
csv_file: String = ../Datasets/csv/*
orc_file: String = ../Datasets/orc/*
avro_file: String = ../Datasets/avro/*
image_dir: String = ../Datasets/cctvVideos/train_images/
schema: String = DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count INT


# CSV

In [6]:
// Read the CSV files, using the first line as column names and letting Spark
// infer the column types from the data.
val df = spark.read
  .format("csv")
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .load(csv_file) // put the path inside your bucket here

df: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


In [8]:
// Same CSV read but without the "header" option: the header line is then read
// as an ordinary data row and the columns get generated names (_c0, _c1, ...),
// as the recorded output of this cell shows.
// NOTE(review): this looks like a deliberate contrast with the previous cell — confirm.
val df = spark.read
  .format("csv")
  .option("inferSchema", "true")
  .load(csv_file)

df: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 1 more field]


In [10]:
// Preview the first two rows of the CSV DataFrame.
df.show(numRows = 2)

+-----------------+-------------------+-----+
|              _c0|                _c1|  _c2|
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
|    United States|            Romania|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



# Parquet

In [11]:
// Generic reader API: name the source with format() and pass the path to load().
val df = spark.read
  .format("parquet")
  .load(parquet_file)

df: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


In [12]:
// Shorthand reader for Parquet. No "inferSchema" option is passed: Parquet files
// store their own schema, and that option is not recognised by the Parquet
// source, so setting it had no effect.
val df = spark.read.parquet(parquet_file)

df: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


In [13]:
// Preview the first two rows of the Parquet DataFrame.
df.show(numRows = 2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
+-----------------+-------------------+-----+
only showing top 2 rows



# JSON 

In [14]:
// Generic reader API: name the source with format() and pass the path to load().
val df = spark.read
  .format("json")
  .load(json_file)

df: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


In [15]:
// Shorthand reader for JSON. The JSON source infers the schema from the data
// whenever no explicit schema is supplied; "inferSchema" is not one of its
// options, so the previous setting was a no-op and has been dropped.
val df = spark.read.json(json_file)

df: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


In [16]:
// Preview the first two rows of the JSON DataFrame.
df.show(numRows = 2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



# ORC

In [17]:
// Generic reader API for ORC. ORC files embed their schema, and "inferSchema"
// is not an ORC option, so the previously-set option was ignored and is removed.
val df = spark.read
  .format("orc")
  .load(orc_file)

df: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


In [18]:
// Shorthand reader for ORC. `.orc(...)` selects the ORC source itself, so a
// preceding `.format("orc")` is redundant; "inferSchema" is not an ORC option
// and was ignored, so both have been removed.
val df = spark.read.orc(orc_file)

df: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


In [19]:
// Preview the first two rows of the ORC DataFrame.
df.show(numRows = 2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
+-----------------+-------------------+-----+
only showing top 2 rows



# Avro

In [20]:
// Read Avro files through the external spark-avro source (there is no
// `.avro(...)` shorthand on the reader). Avro files carry their schema, and
// "inferSchema" is not an Avro option, so the ignored option has been removed.
val df = spark.read
  .format("avro")
  .load(avro_file)

df: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


In [21]:
// Preview the first two rows of the Avro DataFrame.
df.show(numRows = 2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
+-----------------+-------------------+-----+
only showing top 2 rows



# Image

In [25]:
// Load every image under image_dir with the built-in "image" source.
val df = spark.read.format("image").load(image_dir)

// Print the schema: an `image` struct plus a `label` column.
df.printSchema()

// Show the image metadata fields and the label for five rows, without truncating values.
df.select(
    "image.height",
    "image.width",
    "image.nChannels",
    "image.mode",
    "label"
  )
  .show(numRows = 5, truncate = false)

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: integer (nullable = true)

+------+-----+---------+----+-----+
|height|width|nChannels|mode|label|
+------+-----+---------+----+-----+
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |1    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
+------+-----+---------+----+-----+
only showing top 5 rows



df: org.apache.spark.sql.DataFrame = [image: struct<origin: string, height: int ... 4 more fields>, label: int]


# Binary

In [27]:
// Reuse the shared image directory constant instead of repeating the literal,
// so the two cannot drift apart. `path` keeps its name and value for later use.
val path = image_dir

// Read the raw bytes of every *.jpg under `path` with the "binaryFile" source.
val df = spark.read
  .format("binaryFile")
  .option("pathGlobFilter", "*.jpg")
  .load(path)

// Preview five rows (long values are truncated in the display).
df.show(5)

+--------------------+-------------------+------+--------------------+-----+
|                path|   modificationTime|length|             content|label|
+--------------------+-------------------+------+--------------------+-----+
|file:/home/jose/E...|2021-04-15 02:34:17| 55037|[FF D8 FF E0 00 1...|    0|
|file:/home/jose/E...|2021-04-15 02:34:17| 54634|[FF D8 FF E0 00 1...|    1|
|file:/home/jose/E...|2021-04-15 02:34:17| 54624|[FF D8 FF E0 00 1...|    0|
|file:/home/jose/E...|2021-04-15 02:34:17| 54505|[FF D8 FF E0 00 1...|    0|
|file:/home/jose/E...|2021-04-15 02:34:17| 54475|[FF D8 FF E0 00 1...|    0|
+--------------------+-------------------+------+--------------------+-----+
only showing top 5 rows



path: String = ../Datasets/cctvVideos/train_images/
df: org.apache.spark.sql.DataFrame = [path: string, modificationTime: timestamp ... 3 more fields]
