In [35]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, Input
from tensorflow.keras.models import Model



In [36]:
# Chemins vers les images et les données CSV
images_path = "C:/Users/user/OneDrive/DATASIENCETEST/PROJET/Data/Update/images/image_train"
X_csv_path = "C:/Users/user/OneDrive/DATASIENCETEST/PROJET/Data/Update/X_train_update.csv"
y_csv_path = "C:/Users/user/OneDrive/DATASIENCETEST/PROJET/Data/Update/Y_train_CVw08PX.csv"

# Chargement des données
X_df = pd.read_csv(X_csv_path)
y_df = pd.read_csv(y_csv_path)

# Nettoyage de y_df pour enlever toute colonne superflue (comme 'Unnamed: 0')
y_df.drop(columns=['Unnamed: 0'], inplace=True)
X_df.drop(columns=['Unnamed: 0'], inplace=True)

# Assurez-vous que les colonnes 'imageid' et 'productid' sont correctes pour correspondre aux noms de fichiers
X_df['image_name'] = X_df.apply(lambda row: f"image_{row['imageid']}_product_{row['productid']}.jpg", axis=1)
X_df['image_path'] = images_path + '/' + X_df['image_name']

# Concaténation de X_df avec y_df sur l'index
dataset = pd.concat([X_df, y_df], axis=1)




In [37]:
print(y_df.head())
print(y_df.columns)
print(X_df.head())
print(X_df.columns)
display(dataset.head())

   prdtypecode
0           10
1         2280
2           50
3         1280
4         2705
Index(['prdtypecode'], dtype='object')
                                         designation  \
0  Olivia: Personalisiertes Notizbuch / 150 Seite...   
1  Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...   
2  Grand Stylet Ergonomique Bleu Gamepad Nintendo...   
3  Peluche Donald - Europe - Disneyland 2000 (Mar...   
4                               La Guerre Des Tuques   

                                         description   productid     imageid  \
0                                                NaN  3804725264  1263597046   
1                                                NaN   436067568  1008141237   
2  PILOT STYLE Touch Pen de marque Speedlink est ...   201115110   938777978   
3                                                NaN    50418756   457047496   
4  Luc a des id&eacute;es de grandeur. Il veut or...   278535884  1077757786   

                                image_name  \
0  imag

Unnamed: 0,designation,description,productid,imageid,image_name,image_path,prdtypecode
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,image_1263597046_product_3804725264.jpg,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...,10
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,image_1008141237_product_436067568.jpg,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...,2280
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,image_938777978_product_201115110.jpg,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...,50
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,image_457047496_product_50418756.jpg,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...,1280
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,image_1077757786_product_278535884.jpg,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...,2705


In [38]:
# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(dataset['image_path'], dataset['prdtypecode'], test_size=0.2, random_state=42)



In [39]:
# Configuration du générateur de données - Training set
train_datagen = ImageDataGenerator(preprocessing_function=tf.keras.applications.efficientnet.preprocess_input,
                                   shear_range=0.2,
                                   zoom_range=0.1,
                                   rotation_range=10,
                                   width_shift_range=0.1,
                                   height_shift_range=0.1,
                                   horizontal_flip=True)

# Configuration du générateur de données - Validation set
test_datagen = ImageDataGenerator(preprocessing_function=tf.keras.applications.efficientnet.preprocess_input)

# Convertir les étiquettes en chaînes pour répondre aux exigences du générateur
y_train_str = y_train.astype(str)
y_test_str = y_test.astype(str)

# Préparation des générateurs
train_generator = train_datagen.flow_from_dataframe(
    dataframe=X_train.to_frame(name='image_path').join(y_train_str.to_frame(name='prdtypecode')),
    x_col='image_path',
    y_col='prdtypecode',
    class_mode='sparse',
    target_size=(224, 224),
    batch_size=32
)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=X_test.to_frame(name='image_path').join(y_test_str.to_frame(name='prdtypecode')),
    x_col='image_path',
    y_col='prdtypecode',
    class_mode='sparse',
    target_size=(224, 224),
    batch_size=32
)


Found 67932 validated image filenames belonging to 27 classes.
Found 16984 validated image filenames belonging to 27 classes.


In [40]:
# Construction du modèle EfficientNetB1
base_model = tf.keras.applications.EfficientNetB1(weights='imagenet', include_top=False, input_tensor=Input(shape=(224, 224, 3)))

# Gel des couches
for layer in base_model.layers:
    layer.trainable = False

# Ajout des nouvelles couches
inputs = Input(shape=(224, 224, 3))
x = base_model(inputs, training=False)
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.2)(x)
outputs = Dense(len(y_df['prdtypecode'].unique()), activation='softmax')(x)
model = Model(inputs, outputs)

# Compilation du modèle
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [41]:
#Entraînement initial du modèle (avec couches gelées)
history = model.fit(train_generator,
                    validation_data=test_generator,
                    epochs=10,
                    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)])


Epoch 1/10


  self._warn_if_super_not_called()


[1m2123/2123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7503s[0m 4s/step - accuracy: 0.4422 - loss: 1.9200 - val_accuracy: 0.5583 - val_loss: 1.4628
Epoch 2/10
[1m2123/2123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48188s[0m 23s/step - accuracy: 0.5406 - loss: 1.5373 - val_accuracy: 0.5742 - val_loss: 1.4251
Epoch 3/10
[1m2123/2123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10891s[0m 5s/step - accuracy: 0.5620 - loss: 1.4595 - val_accuracy: 0.5805 - val_loss: 1.3846
Epoch 4/10
[1m2123/2123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9656s[0m 5s/step - accuracy: 0.5745 - loss: 1.4106 - val_accuracy: 0.5904 - val_loss: 1.3531
Epoch 5/10
[1m2123/2123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10590s[0m 5s/step - accuracy: 0.5842 - loss: 1.3819 - val_accuracy: 0.5981 - val_loss: 1.3547
Epoch 6/10
[1m2123/2123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10070s[0m 5s/step - accuracy: 0.5902 - loss: 1.3468 - val_accuracy: 0.5995 - val_loss: 1.3373
Epoch 7/10

In [42]:
import numpy as np

# Calcul du nombre de steps nécessaire, en convertissant le résultat en int pour éviter des erreurs de type
steps_per_epoch = int(np.ceil(len(X_train) / 32))
# Évaluation du modèle sur l'ensemble de train
loss, accuracy = model.evaluate(train_generator, steps=steps_per_epoch)
print(f"Train Loss: {loss}")
print(f"Train Accuracy: {accuracy}")

[1m2123/2123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7809s[0m 4s/step - accuracy: 0.6946 - loss: 1.0470
Test Loss: 1.0434811115264893
Test Accuracy: 0.6958429217338562


In [44]:
import numpy as np
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score



# Calcul du nombre de steps nécessaire pour parcourir les données sans répétition
train_steps = int(np.ceil(len(X_train) / 32))
test_steps = int(np.ceil(len(X_test) / 32))

# Évaluation du modèle sur l'ensemble d'entraînement
train_loss, train_accuracy = model.evaluate(train_generator, steps=train_steps)
print(f"Training Loss: {train_loss}")
print(f"Training Accuracy: {train_accuracy}")

[1m2123/2123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7617s[0m 4s/step - accuracy: 0.6959 - loss: 1.0424
Training Loss: 1.0459259748458862
Training Accuracy: 0.6951068639755249


In [45]:
# Évaluation du modèle sur l'ensemble de test
test_loss, test_accuracy = model.evaluate(test_generator, steps=test_steps)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m531/531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1449s[0m 3s/step - accuracy: 0.6063 - loss: 1.3318
Test Loss: 1.3356013298034668
Test Accuracy: 0.6003885865211487


In [46]:
# Prédiction des étiquettes sur l'ensemble de test pour calcul des métriques
test_predictions = model.predict(test_generator, steps=test_steps)
predicted_classes = np.argmax(test_predictions, axis=1)

# Obtention des vraies étiquettes de test à partir du générateur

true_classes = test_generator.classes

# Calcul de la précision, du rappel et du score F1
precision = precision_score(true_classes, predicted_classes, average='weighted')
recall = recall_score(true_classes, predicted_classes, average='weighted')
f1 = f1_score(true_classes, predicted_classes, average='weighted')

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")



[1m531/531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1685s[0m 3s/step
Precision: 0.05
Recall: 0.05
F1 Score: 0.05


In [48]:
# Évaluation du modèle sur le set de test
val_loss, val_accuracy = model.evaluate(test_generator)
print(f"Validation Accuracy: {val_accuracy*100:.2f}%")

[1m531/531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1436s[0m 3s/step - accuracy: 0.5932 - loss: 1.3325
Validation Accuracy: 60.04%
