In [None]:
# import os
# os.environ["HADOOP_HOME"] = "C:/hadoop"
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"  # Use your real JDK path!
# os.environ["SPARK_HOME"] = "~/spark"

import os, sys
# Environment for the local Spark / TensorFlow run, applied in one update.
os.environ.update({
    # Spark's Python workers and the driver use this kernel's interpreter
    # (full path to python.exe).
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
    # Empty list of visible CUDA devices.
    "CUDA_VISIBLE_DEVICES": "",
    # Machine-specific install locations.
    "JAVA_HOME": "C:\\Zulu\\zulu-11",
    "SPARK_HOME": "C:\\spark",
    "HADOOP_HOME": "C:\\hadoop",
})


In [None]:
# Intended inference batch size; lower it if a batch does not fit in memory.
# NOTE(review): no other visible cell reads this name — confirm it is still needed.
batch_size = 16  # Reduce it to fit in memory


In [None]:
# import tensorflow as tf
# gpus = tf.config.list_physical_devices('GPU')
# if gpus:
#     try:
#         for gpu in gpus:
#             tf.config.experimental.set_memory_growth(gpu, True)
#     except RuntimeError as e:
#         print(e)


In [None]:
# tf.config.experimental.set_virtual_device_configuration(
#     gpus[0],
#     [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])  # Limit to 4GB



In [None]:
# !pip install Pandas pillow tensorflow pyspark pyarrow

In [None]:
import pandas as pd
from PIL import Image
import numpy as np
import io
import os

import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras import Model
from pyspark.sql.functions import col, pandas_udf, PandasUDFType, element_at, split
from pyspark.sql import SparkSession

In [None]:
# PATH = os.getcwd()
# PATH_Data = PATH+'/data/Test1'
# PATH_Result = PATH+'/data/Results'
# print('PATH:        '+\
#       PATH+'\nPATH_Data:   '+\
#       PATH_Data+'\nPATH_Result: '+PATH_Result)

# Input images and output features, both relative to the working directory.
PATH_Data = 'data/Test1'
PATH_Result = 'data/Results'

# The "PATH:" label used to be printed with nothing after it because the PATH
# variable was commented out; show the working directory the relative paths
# resolve against instead.
print('PATH:        ' + os.getcwd()
      + '\nPATH_Data:   ' + PATH_Data
      + '\nPATH_Result: ' + PATH_Result)


In [None]:
import pyspark
# Record which PySpark version this notebook runs against.
print(pyspark.__version__)


In [None]:
from pyspark.sql import SparkSession

# Local Spark session ("local[*]" master) named "P8"; getOrCreate() returns an
# already-running session if there is one.
spark = (
    SparkSession.builder
        .appName("P8")
        .master("local[*]")
        # Same Python interpreter for the driver and the executors.
        .config("spark.executorEnv.PYSPARK_PYTHON",  sys.executable)
        # "spark.driver.python" is not a Spark property; the documented key
        # for the driver-side interpreter is "spark.pyspark.driver.python".
        .config("spark.pyspark.driver.python",       sys.executable)
        # Optional but useful:
        .config("spark.python.worker.reuse",         "true")     # avoids respawning Python workers
        .config("spark.python.worker.memory",        "2g")       # per-worker memory budget
        .getOrCreate()
)



In [None]:
# SparkContext behind the session; used further down to broadcast the model weights.
sc = spark.sparkContext

In [None]:
# Show the session object as the cell output.
spark

In [None]:
# Arrow settings on the live session:
# - at most 4 records per Arrow batch (keeps each batch handed to the UDF small);
# - Arrow-based PySpark conversion switched off.
# NOTE(review): per the Spark docs the second key governs toPandas()/createDataFrame(),
# not pandas UDFs — confirm this is the intended switch.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "4")
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")

In [None]:
# Read every *.jpg found under PATH_Data (sub-directories included) with the
# "binaryFile" source.
images = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.jpg")
    .option("recursiveFileLookup", "true")
    .load(PATH_Data)
)

In [None]:
# Label = second-to-last '/'-separated component of the path, i.e. the name of
# the directory that directly contains the image.
images = images.withColumn('label', element_at(split(images['path'], '/'), -2))
# printSchema() and show() print by themselves and return None; wrapping them
# in print() only added a stray "None" line after each output.
images.printSchema()
images.select('path', 'label').show(5, False)

In [None]:
# Driver-side MobileNetV2 with ImageNet weights, classification head included,
# for 224x224 three-channel inputs.
model = MobileNetV2(weights='imagenet',
                    include_top=True,
                    input_shape=(224, 224, 3))

In [None]:
# Feature extractor: same input as `model`, but stops at the output of its
# second-to-last layer (i.e. the final layer is dropped).
new_model = Model(inputs=model.input,
                  outputs=model.layers[-2].output)

In [None]:
# new_model.summary()

In [None]:
# Broadcast the truncated model's weights; model_fn() reads them back through
# `.value` when it rebuilds the model.
brodcast_weights = sc.broadcast(new_model.get_weights())

In [None]:
def model_fn():
    """
    Rebuild the feature extractor: MobileNetV2 cut at its second-to-last
    layer, loaded with the weights broadcast in ``brodcast_weights``.

    The architecture is created with ``weights=None``: every weight of the
    truncated model is overwritten by ``set_weights`` below, so requesting
    ``weights='imagenet'`` here only made each caller load (and, when not
    cached, download) the pretrained file for nothing.

    :return: a ``Model`` mapping the MobileNetV2 input to the output of its
             second-to-last layer, with all layers marked non-trainable
    """
    model = MobileNetV2(weights=None,
                        include_top=True,
                        input_shape=(224, 224, 3))
    for layer in model.layers:
        layer.trainable = False
    new_model = Model(inputs=model.input,
                      outputs=model.layers[-2].output)
    new_model.set_weights(brodcast_weights.value)
    return new_model

In [None]:
def preprocess(content):
    """
    Decode raw image bytes into an array ready for MobileNetV2.

    :param content: encoded image file content (bytes)
    :return: the 224x224 RGB image as an array passed through
             ``preprocess_input``; if anything fails while decoding, the
             error is printed and an all-zero float32 array of shape
             (224, 224, 3) is returned so one bad file does not abort the batch
    """
    try:
        # Force three channels: a grayscale, RGBA or CMYK file would otherwise
        # yield a different channel count, which np.stack (mixed shapes) and
        # the (224, 224, 3) model input cannot accept.
        img = Image.open(io.BytesIO(content)).convert('RGB').resize((224, 224))
        arr = img_to_array(img)
        return preprocess_input(arr)
    except Exception as e:
        print("Erreur image :", e)
        return np.zeros((224, 224, 3), dtype=np.float32)


def featurize_series(model, content_series):
    """
    Run the model over one batch of encoded images.

    :param model: object exposing ``predict``
    :param content_series: pd.Series whose items are raw image bytes
    :return: pd.Series holding one flattened feature vector per image
    """
    batch = np.stack(content_series.map(preprocess))
    # Predictions may be multi-dimensional; flatten each one so it can be
    # stored as a flat array in a Spark DataFrame column.
    features = [pred.flatten() for pred in model.predict(batch)]
    return pd.Series(features)

@pandas_udf('array<float>', PandasUDFType.SCALAR_ITER)
def featurize_udf(content_series_iter):
    '''
    Scalar Iterator pandas UDF around featurize_series; declared to return an
    array<float> column.

    :param content_series_iter: iterator over batches, each batch being a
                                pandas Series of image data.
    '''
    # The model is built once per call of this generator and shared by every
    # batch it receives, which spreads the cost of loading it.
    extractor = model_fn()
    yield from (featurize_series(extractor, batch) for batch in content_series_iter)

In [None]:
# spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")

In [None]:
# Redistribute the images over 2 partitions, then keep path and label and add
# the "features" column computed by featurize_udf from the raw "content" bytes.
features_df = images.repartition(2).select(
    col("path"), col("label"),
    featurize_udf("content").alias("features")
)


In [None]:
# Reminder of where the features are going to be written.
print(PATH_Result)

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType
import numpy as np, os, sys, io
from PIL import Image

def just_size(content):
    """
    Report the dimensions of an encoded image.

    :param content: encoded image file content (bytes)
    :return: ``[width, height]`` as floats, or ``[0.0, 0.0]`` when anything
             goes wrong while opening the image
    """
    try:
        dims = Image.open(io.BytesIO(content)).size
        return list(map(float, (dims[0], dims[1])))
    except Exception:
        return [0.0, 0.0]

# Plain Python UDF (not a pandas UDF) around just_size, returning array<float>.
dummy_udf = udf(just_size, ArrayType(FloatType()))

# Smoke test on at most 50 images: apply the UDF to the raw bytes and display
# 3 of the resulting [width, height] rows.
test_df = images.limit(50).select(dummy_udf("content").alias("wh"))
test_df.show(3)


In [None]:
# Driver-side alternative to the pandas UDF: featurize every image in the
# notebook process with the already-built `new_model`, then write the result.
from urllib.parse import urlparse
from urllib.request import url2pathname

paths = [row.path for row in images.select("path").collect()]


def _to_local_path(path):
    """Turn a ``file:`` URI into a plain filesystem path; other strings pass through."""
    # NOTE(review): assumes Spark reports local files as "file:/..." URIs — confirm
    # on the target platform.
    if path.startswith("file:"):
        return url2pathname(urlparse(path).path)
    return path


def extract(path):
    """
    Compute the feature vector of one image file.

    :param path: image location as stored in the ``path`` column
    :return: ``(path, features)`` where ``features`` is the flattened model
             output as a list of Python floats
    """
    with Image.open(_to_local_path(path)) as img:
        # Three channels are required by the (224, 224, 3) model input.
        arr = preprocess_input(img_to_array(img.convert("RGB").resize((224, 224))))
    # The original referenced an undefined name `backbone`; the truncated
    # MobileNetV2 built earlier in the notebook is `new_model`.
    # verbose=0: one predict call per image would otherwise print one progress bar each.
    feat = new_model.predict(arr[None], verbose=0)[0].flatten()
    return (path, feat.tolist())


# Run sequentially in this process: a multiprocessing.Pool cannot run functions
# defined in a notebook cell when workers are spawned (the Windows default), and
# the worker processes would not have the model anyway.
rows = [extract(path) for path in paths]

# Overwrite so that re-running the cell does not fail on an existing output directory.
spark.createDataFrame(rows, ["path", "features"]).write.mode("overwrite").parquet(PATH_Result)


In [None]:
# features_df.show(3)


In [None]:
# features_df.write.mode("overwrite").parquet(PATH_Result)

In [None]:
# Read the Parquet output back into pandas using the pyarrow engine.
df = pd.read_parquet(PATH_Result, engine='pyarrow')

In [None]:
# Preview the first rows of the reloaded features.
df.head()

In [None]:
# Shape of the feature vector stored in the row labelled 0.
df.loc[0,'features'].shape

In [None]:
# Running this cell starts the Spark application

In [None]:
%%info

In [None]:
import pandas as pd
import numpy as np
import io
import os
import tensorflow as tf
from PIL import Image
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras import Model
from pyspark.sql.functions import col, pandas_udf, PandasUDFType, element_at, split

In [None]:
# Bucket root; the input images and the output features live beneath it.
PATH = 's3://p8-data'
PATH_Data = '/'.join([PATH, 'Test'])
PATH_Result = '/'.join([PATH, 'Results'])
print('\n'.join([
    'PATH:        ' + PATH,
    'PATH_Data:   ' + PATH_Data,
    'PATH_Result: ' + PATH_Result,
]))

In [None]:
# Read every *.jpg found under PATH_Data (sub-directories included) with the
# "binaryFile" source.
images = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.jpg")
    .option("recursiveFileLookup", "true")
    .load(PATH_Data)
)

In [None]:
# Display 5 of the loaded records.
images.show(5)

In [None]:
# Label = second-to-last '/'-separated component of the path, i.e. the name of
# the directory that directly contains the image.
images = images.withColumn('label', element_at(split(images['path'], '/'), -2))
# printSchema() and show() print by themselves and return None; wrapping them
# in print() only added a stray "None" line after each output.
images.printSchema()
images.select('path', 'label').show(5, False)

In [None]:
# Driver-side MobileNetV2 with ImageNet weights, classification head included,
# for 224x224 three-channel inputs.
model = MobileNetV2(weights='imagenet',
                    include_top=True,
                    input_shape=(224, 224, 3))

In [None]:
# Feature extractor: same input as `model`, but stops at the output of its
# second-to-last layer (i.e. the final layer is dropped).
new_model = Model(inputs=model.input,
                  outputs=model.layers[-2].output)

In [None]:
# Broadcast the truncated model's weights; model_fn() reads them back through
# `.value` when it rebuilds the model.
# NOTE(review): `sc` is not assigned in this part of the notebook — presumably
# provided by the kernel; confirm.
brodcast_weights = sc.broadcast(new_model.get_weights())

In [None]:
# Print the layer-by-layer summary of the truncated model.
new_model.summary()

In [None]:
def model_fn():
    """
    Rebuild the feature extractor: MobileNetV2 cut at its second-to-last
    layer, loaded with the weights broadcast in ``brodcast_weights``.

    The architecture is created with ``weights=None``: every weight of the
    truncated model is overwritten by ``set_weights`` below, so requesting
    ``weights='imagenet'`` here only made each caller load (and, when not
    cached, download) the pretrained file for nothing.

    :return: a ``Model`` mapping the MobileNetV2 input to the output of its
             second-to-last layer, with all layers marked non-trainable
    """
    model = MobileNetV2(weights=None,
                        include_top=True,
                        input_shape=(224, 224, 3))
    for layer in model.layers:
        layer.trainable = False
    new_model = Model(inputs=model.input,
                      outputs=model.layers[-2].output)
    new_model.set_weights(brodcast_weights.value)
    return new_model

In [None]:
def preprocess(content):
    """
    Preprocesses raw image bytes for prediction.

    :param content: encoded image file content (bytes)
    :return: the 224x224 RGB image as an array passed through ``preprocess_input``
    """
    # Force three channels: a grayscale, RGBA or CMYK file would otherwise
    # yield a different channel count, which np.stack (mixed shapes) and the
    # (224, 224, 3) model input cannot accept.
    img = Image.open(io.BytesIO(content)).convert('RGB').resize((224, 224))
    arr = img_to_array(img)
    return preprocess_input(arr)

def featurize_series(model, content_series):
    """
    Run the model over one batch of encoded images.

    :param model: object exposing ``predict``
    :param content_series: pd.Series whose items are raw image bytes
    :return: pd.Series holding one flattened feature vector per image
    """
    batch = np.stack(content_series.map(preprocess))
    # Predictions may be multi-dimensional; flatten each one so it can be
    # stored as a flat array in a Spark DataFrame column.
    features = [pred.flatten() for pred in model.predict(batch)]
    return pd.Series(features)

@pandas_udf('array<float>', PandasUDFType.SCALAR_ITER)
def featurize_udf(content_series_iter):
    '''
    Scalar Iterator pandas UDF around featurize_series; declared to return an
    array<float> column.

    :param content_series_iter: iterator over batches, each batch being a
                                pandas Series of image data.
    '''
    # The model is built once per call of this generator and shared by every
    # batch it receives, which spreads the cost of loading it.
    extractor = model_fn()
    yield from (featurize_series(extractor, batch) for batch in content_series_iter)

In [None]:
# spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")

In [None]:
# Redistribute the images over 24 partitions, then keep path and label and add
# the "features" column computed by featurize_udf from the raw "content" bytes.
features_df = images.repartition(24).select(col("path"),
                                            col("label"),
                                            featurize_udf("content").alias("features")
                                           )

In [None]:
# Reminder of where the features are going to be written.
print(PATH_Result)

In [None]:
# Display 3 rows of path / label / features.
features_df.show(3)


In [None]:
# Write the features to PATH_Result as Parquet, replacing any existing output.
features_df.write.mode("overwrite").parquet(PATH_Result)

In [None]:
# Read the Parquet output back into pandas using the pyarrow engine.
# NOTE(review): PATH_Result is an s3:// URL here; pandas presumably needs an
# S3 filesystem backend (e.g. s3fs) installed for this — confirm.
df = pd.read_parquet(PATH_Result, engine='pyarrow')

In [None]:
# Preview the first rows of the reloaded features.
df.head()

In [None]:
# Shape of the feature vector stored in the row labelled 0.
df.loc[0,'features'].shape

In [None]:
# (rows, columns) of the reloaded feature table.
df.shape