# **1. Modules, dépendances et bibliothèques**

In [None]:
#!pip install pyspark

import io
import os
import zipfile
from getpass import getpass
from typing import Iterator

import numpy as np
import pandas as pd
from PIL import Image
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import VectorUDT, Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import element_at, split, pandas_udf, PandasUDFType, col
from pyspark.sql.functions import udf
from tensorflow.keras import Model
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array

# **2. Importation des données et définition du chemin d'accès**

In [2]:
os.environ['KAGGLE_USERNAME'] = "*******" 
os.environ['KAGGLE_KEY'] = "*****************************" 

!kaggle datasets download -d moltean/fruits

with zipfile.ZipFile("fruits.zip", 'r') as zip_ref:
    zip_ref.extractall("images")

Downloading fruits.zip to /content
100% 1.28G/1.28G [00:45<00:00, 33.0MB/s]
100% 1.28G/1.28G [00:45<00:00, 30.1MB/s]


In [3]:
# Kernel working directory; the other locations are derived from it.
PATH = os.getcwd()
# Input images: the `Test` folder of the extracted dataset.
PATH_Data = PATH + '/images/fruits-360-original-size/fruits-360-original-size/Test'
# Output location for the extracted features (written as parquet further down).
PATH_Result = PATH + '/resultMobileNet'

# One labelled line per path.
print('PATH:        ' + PATH,
      'PATH_Data:   ' + PATH_Data,
      'PATH_Result: ' + PATH_Result,
      sep='\n')

PATH:        /content
PATH_Data:   /content/images/fruits-360-original-size/fruits-360-original-size/Test
PATH_Result: /content/resultMobileNet


# **3. Création d'une session Spark**

In [4]:
# Create (or reuse) the Spark session named 'P8' with a 'local' master.
spark = (
    SparkSession.builder
    .master('local')
    .appName('P8')
    .config('spark.sql.parquet.writeLegacyFormat', 'true')
    .getOrCreate()
)

# SparkContext handle, used later to broadcast the model weights.
sc = spark.sparkContext

# Bare expression so the notebook renders the session summary.
spark

# **4. Chargement des images au format binaire**

In [5]:
# Load every *.jpg found recursively under PATH_Data with Spark's
# "binaryFile" source (one row per file, raw bytes in the `content` column).
images = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.jpg")
    .option("recursiveFileLookup", "true")
    .load(PATH_Data)
)

# Label = second-to-last component of the path, i.e. the parent directory name.
images = images.withColumn('label', element_at(split(images['path'], '/'), -2))

# printSchema() and show() write to stdout themselves and return None, so they
# must not be wrapped in print() (that only adds a stray "None" line).
images.printSchema()
images.select('path', 'label').show(5, False)

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)

None
+--------------------------------------------------------------------------------------------------+-----------+
|path                                                                                              |label      |
+--------------------------------------------------------------------------------------------------+-----------+
|file:/content/images/fruits-360-original-size/fruits-360-original-size/Test/apple_hit_1/r0_115.jpg|apple_hit_1|
|file:/content/images/fruits-360-original-size/fruits-360-original-size/Test/apple_hit_1/r0_119.jpg|apple_hit_1|
|file:/content/images/fruits-360-original-size/fruits-360-original-size/Test/apple_hit_1/r0_107.jpg|apple_hit_1|
|file:/content/images/fruits-360-original-size/fruits-360-original-size/Test/apple_hit_1/r0_143.jpg|apple_hit_1|

# **5. Préparation du modèle MobileNetV2**

In [None]:
# Full MobileNetV2 classifier with ImageNet weights, built on the driver.
model = MobileNetV2(
    weights='imagenet',
    include_top=True,
    input_shape=(224, 224, 3),
)

# Feature extractor: same input, output taken from the second-to-last layer.
new_model = Model(inputs=model.input, outputs=model.layers[-2].output)

# Broadcast the extractor weights through the SparkContext; model_fn() reads
# them back through `.value`.
brodcast_weights = sc.broadcast(new_model.get_weights())
# new_model.summary()

In [None]:
def model_fn():
    """Build the MobileNetV2 feature extractor from the broadcast weights.

    Returns:
        A Keras ``Model`` mapping the MobileNetV2 input to the output of its
        second-to-last layer, with weights taken from ``brodcast_weights``.
    """
    # weights=None: only the architecture is needed here. Every weight of the
    # truncated model is overwritten by set_weights() below, so reloading the
    # ImageNet weights on each call would be redundant work.
    model = MobileNetV2(weights=None,
                        include_top=True,
                        input_shape=(224, 224, 3))
    # The model is only used for prediction; mark all layers as frozen.
    for layer in model.layers:
        layer.trainable = False
    new_model = Model(inputs=model.input,
                      outputs=model.layers[-2].output)
    new_model.set_weights(brodcast_weights.value)
    return new_model

def preprocess(content):
    """Decode raw image bytes into an array ready for MobileNetV2.

    Args:
        content: Encoded image file contents (bytes).

    Returns:
        The image resized to 224x224, converted with ``img_to_array`` and
        passed through MobileNetV2's ``preprocess_input``.
    """
    # Force three RGB channels: grayscale, palette, RGBA or CMYK files would
    # otherwise produce a channel count that does not match the model's
    # (224, 224, 3) input and break the batch stacking / prediction.
    img = Image.open(io.BytesIO(content)).convert('RGB').resize((224, 224))
    arr = img_to_array(img)
    return preprocess_input(arr)

def featurize_series(model, content_series):
    """Run the model over a pandas Series of raw images.

    Args:
        model: Object exposing ``predict(batch)``.
        content_series: pandas Series whose items are passed to ``preprocess``.

    Returns:
        A pandas Series holding one flattened prediction per input item.
    """
    # Preprocess each item, then stack the results into a single batch.
    batch = np.stack(content_series.map(preprocess))
    predictions = model.predict(batch)
    # Flatten every prediction to 1-D so it can be stored as an array column.
    return pd.Series([row.flatten() for row in predictions])

# The iterator-of-Series type hints select the "scalar iterator" pandas UDF
# flavour; this replaces the deprecated PandasUDFType.SCALAR_ITER argument.
@pandas_udf('array<float>')
def featurize_udf(content_series_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Pandas UDF turning batches of raw image bytes into feature arrays.

    Args:
        content_series_iter: Iterator of pandas Series of image bytes.

    Yields:
        One pandas Series of flattened feature arrays per input batch.
    """
    # Build the model once per call, then reuse it for every batch.
    model = model_fn()
    for content_series in content_series_iter:
        yield featurize_series(model, content_series)

# **6. Extraction de features**

In [8]:
# Spread the images over 20 partitions and compute one feature array per image,
# keeping the path and label columns alongside it.
features_df = (
    images
    .repartition(20)
    .select(
        col("path"),
        col("label"),
        featurize_udf("content").alias("features"),
    )
)

# Persist the features as parquet, replacing any previous run's output.
features_df.write.mode("overwrite").parquet(PATH_Result)

# **7. Réduction dimensionnelle**

In [9]:
# Spark ML's PCA needs a vector column, so wrap the array<float> features
# into a dense vector with a small Python UDF.
to_dense_vector = udf(lambda arr: Vectors.dense(arr), VectorUDT())

# Reload the stored features and add the dense-vector column.
data = (
    spark.read.parquet(PATH_Result)
    .withColumn("features_dense", to_dense_vector("features"))
)

# Fit a 2-component PCA and project every row.
# NOTE: this rebinds the global name `model` (previously the Keras model).
pca = PCA(k=2, inputCol="features_dense", outputCol="pca_features")
model = pca.fit(data)
transformed_data = model.transform(data)

# Keep only the projected coordinates and store them as parquet.
result = transformed_data.select("pca_features")
result.write.mode("overwrite").parquet("resultPCA")

def export_pca_features_to_csv(parquet_file_path, csv_file_path):
    """Convert stored PCA vectors from parquet into a flat CSV file.

    Args:
        parquet_file_path: Parquet file or directory containing a
            ``pca_features`` column whose items expose a ``'values'`` entry.
        csv_file_path: Destination CSV path; one ``pca_feature_<i>`` column
            is written per component, without the index.
    """
    # Read from the path given by the caller (it used to be ignored in favour
    # of a hard-coded '/content/resultPCA').
    result_pca = pd.read_parquet(parquet_file_path, engine='pyarrow')

    pca_features_numeric = [
        [float(x) for x in row['values']]
        for row in result_pca['pca_features']
    ]

    # Name the columns after the actual number of components; fall back to the
    # historical two-column header when there are no rows to inspect.
    n_components = len(pca_features_numeric[0]) if pca_features_numeric else 2
    columns = [f'pca_feature_{i}' for i in range(1, n_components + 1)]

    pca_features_df = pd.DataFrame(pca_features_numeric, columns=columns)
    pca_features_df.to_csv(csv_file_path, index=False)

# Flatten the PCA parquet output into a CSV stored next to it.
export_pca_features_to_csv(
    parquet_file_path='/content/resultPCA',
    csv_file_path='/content/resultPCA/resultPCA.csv',
)

# **8. Visualisations des résultats**

In [10]:
# Extracted features (path, label, features) as written by the Spark job.
df1 = pd.read_parquet(PATH_Result, engine='pyarrow')
# PCA coordinates exported to CSV by the previous step.
df2 = pd.read_csv('/content/resultPCA/resultPCA.csv')

# Place both tables side by side.
# NOTE(review): the concat aligns rows on the index only, which presumably
# means by position here. This assumes both files list the images in the same
# order - TODO confirm.
df = pd.concat([df1, df2], axis='columns')
df.to_csv('matrice.csv', index=False)
df.head()

Unnamed: 0,path,label,features,pca_feature_1,pca_feature_2
0,file:/content/images/fruits-360-original-size/...,apple_hit_1,"[0.24300367, 0.40524203, 1.7881588, 0.0, 0.0, ...",5.922424,-1.034688
1,file:/content/images/fruits-360-original-size/...,apple_hit_1,"[0.38683483, 0.23985375, 1.6046269, 0.01696724...",6.375025,-2.99111
2,file:/content/images/fruits-360-original-size/...,apple_hit_1,"[1.0393977, 0.20693327, 1.0663801, 0.0, 1.5272...",4.4653,3.755692
3,file:/content/images/fruits-360-original-size/...,apple_hit_1,"[0.23689479, 1.2665803, 0.3642044, 0.0, 0.4414...",5.20326,4.509398
4,file:/content/images/fruits-360-original-size/...,apple_hit_1,"[0.40759927, 0.0019354918, 0.0, 0.0, 0.0042103...",4.358709,2.403972
