In [1]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Obtain the notebook's Spark session: reuse a running one if present,
# otherwise start a new session registered under this application name.
spark = SparkSession.builder.appName("Image data source example").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/07 12:29:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/07 12:29:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
from pyspark.ml import image

In [4]:
# Root folder of the training images, relative to the notebook's working
# directory. The origin paths printed further down show `label=0/` and
# `label=1/` subfolders beneath it.
images_dir = "./train_images/"

In [5]:
# Read the images under images_dir through Spark's "image" data source.
# Each row carries an `image` struct; the printed schema also shows a `label`
# column (presumably inferred from the label=<n> folder names - confirm).
images_df = (spark.read
             .format("image")
             .load(images_dir))

In [6]:
# Print the column tree of the image DataFrame to see which fields are available.
images_df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: integer (nullable = true)



In [7]:
# Preview the per-image metadata fields together with the label,
# printing the first 5 rows without truncating cell values.
images_df.select(
    "image.height",
    "image.width",
    "image.nChannels",
    "image.mode",
    "label",
).show(5, truncate=False)

+------+-----+---------+----+-----+
|height|width|nChannels|mode|label|
+------+-----+---------+----+-----+
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |1    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
+------+-----+---------+----+-----+
only showing top 5 rows



In [8]:
# Keep only images taller than 200 pixels, then show where each file came
# from. Filtering happens before the projection so the predicate refers to a
# column that exists in the frame it is applied to.
tall_images = images_df.filter(col("image.height") > 200)
tall_images.select("image.origin", "image.height").show(5, truncate=False)

+-------------------------------------------------------------------------------------------------------------+------+
|origin                                                                                                       |height|
+-------------------------------------------------------------------------------------------------------------+------+
|file:///home/amit/Documents/CS535-resources/examples/spark/datasets/train_images/label=0/LeftBagframe0004.jpg|288   |
|file:///home/amit/Documents/CS535-resources/examples/spark/datasets/train_images/label=1/LeftBagframe0040.jpg|288   |
|file:///home/amit/Documents/CS535-resources/examples/spark/datasets/train_images/label=0/LeftBagframe0005.jpg|288   |
|file:///home/amit/Documents/CS535-resources/examples/spark/datasets/train_images/label=0/LeftBagframe0015.jpg|288   |
|file:///home/amit/Documents/CS535-resources/examples/spark/datasets/train_images/label=0/LeftBagframe0017.jpg|288   |
+-------------------------------------------------------------------------------------------------------------+------+

In [9]:
# Load the same image folder as raw file bytes using the "binaryFile" source.
# The path comes from images_dir (defined earlier in the notebook) instead of
# a second hard-coded literal, so both readers always target one location.
binary_files_df = (spark.read.format("binaryFile")
                   .option("pathGlobFilter", "*.jpg")       # only pick up JPEG files
                   .option("recursiveFileLookup", "true")   # descend into subfolders
                   .load(images_dir))

In [10]:
# Print the column tree of the binary-file DataFrame for comparison with the
# image DataFrame's schema shown earlier.
binary_files_df.printSchema()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)



In [11]:
# Show the first 5 rows; long cell values are abbreviated in the printed table.
binary_files_df.show(5)

+--------------------+--------------------+------+--------------------+
|                path|    modificationTime|length|             content|
+--------------------+--------------------+------+--------------------+
|file:/home/amit/D...|2023-11-13 18:10:...| 55037|[FF D8 FF E0 00 1...|
|file:/home/amit/D...|2023-11-13 18:10:...| 54634|[FF D8 FF E0 00 1...|
|file:/home/amit/D...|2023-11-13 18:10:...| 54624|[FF D8 FF E0 00 1...|
|file:/home/amit/D...|2023-11-13 18:10:...| 54505|[FF D8 FF E0 00 1...|
|file:/home/amit/D...|2023-11-13 18:10:...| 54475|[FF D8 FF E0 00 1...|
+--------------------+--------------------+------+--------------------+
only showing top 5 rows



In [12]:
# Stop the Spark session now that the examples are finished.
spark.stop()