In [1]:
# Mount Google Drive at /content/drive so the dataset archive stored there is readable.
from google.colab import drive
drive.mount('/content/drive')

# IPython shell escape (not Python syntax): extract the archive from Drive into /content.
!unzip "/content/drive/My Drive/data.zip" -d "/content"

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
  inflating: /content/data/data/train/image_train/image_941622065_product_207151527.jpg  
  inflating: /content/data/data/train/image_train/image_941622067_product_207151563.jpg  
  inflating: /content/data/data/train/image_train/image_941622071_product_207151331.jpg  
  inflating: /content/data/data/train/image_train/image_941622073_product_207151492.jpg  
  inflating: /content/data/data/train/image_train/image_941622084_product_207151508.jpg  
  inflating: /content/data/data/train/image_train/image_941622086_product_207151543.jpg  
  inflating: /content/data/data/train/image_train/image_941622094_product_207151287.jpg  
  inflating: /content/data/data/train/image_train/image_941622096_product_207151567.jpg  
  inflating: /content/data/data/train/image_train/image_941622100_product_207151555.jpg  
  inflating: /content/data/data/train/image_train/image_941622102_product_207151329.jpg  
  inflat

In [2]:
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle


class DataImporter:
    """Load the product CSV files and split them into class-balanced subsets."""

    def __init__(self, filepath='/content/data/data'):
        # Directory containing X_train_update.csv and Y_train_CVw08PX.csv.
        self.filepath = filepath

    def load_data(self, mapper_path='/content/mapper.pkl'):
        """Read features and targets and return them as a single DataFrame.

        The ``designation`` and ``description`` columns are merged row by row
        into ``description``. ``prdtypecode`` values are re-encoded as
        consecutive integers in order of first appearance, and the
        ``{original code: integer}`` mapping is pickled to ``mapper_path``.

        Args:
            mapper_path: Destination file for the pickled label mapping.

        Returns:
            DataFrame holding the feature columns plus the encoded
            ``prdtypecode`` column.
        """
        data = pd.read_csv(f'{self.filepath}/X_train_update.csv')

        # Concatenate element-wise. Calling str() on the Series would append
        # the same truncated repr of the whole column to every row. Missing
        # values become empty strings so NaN does not propagate.
        designation = data['designation'].fillna('').astype(str)
        description = data['description'].fillna('').astype(str)
        data['description'] = (designation + ' ' + description).str.strip()
        data = data.drop(['Unnamed: 0', 'designation'], axis=1)

        target = pd.read_csv(f'{self.filepath}/Y_train_CVw08PX.csv')
        target = target.drop(['Unnamed: 0'], axis=1)
        modalite_mapping = {modalite: i for i, modalite in enumerate(target['prdtypecode'].unique())}
        # map() is a plain per-value lookup. replace() with a dict whose keys
        # and values overlap is easy to get wrong and is much slower.
        target['prdtypecode'] = target['prdtypecode'].map(modalite_mapping)

        with open(mapper_path, "wb") as fichier:
            pickle.dump(modalite_mapping, fichier)

        return pd.concat([data, target], axis=1)

    def split_train_test(self, df, samples_per_class=600, val_samples_per_class=50):
        """Build class-balanced train and validation sets plus a remainder test set.

        Args:
            df: DataFrame with a ``prdtypecode`` column.
            samples_per_class: Rows drawn per class for the training set.
            val_samples_per_class: Rows drawn per class for the validation set.

        Returns:
            Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``. Every
            frame is shuffled and re-indexed from 0. The validation rows are
            sampled from the test rows and are NOT removed from them, so
            ``X_val`` is a subset of ``X_test``.

        Raises:
            ValueError: Raised by pandas when a class has fewer rows than
                requested (sampling is done without replacement).
        """
        label_col = 'prdtypecode'

        train_parts = []
        test_parts = []
        for _, group in df.groupby(label_col):
            picked = group.sample(n=samples_per_class, random_state=42)
            train_parts.append(picked)
            # Everything not picked for training goes to the test pool.
            test_parts.append(group.drop(picked.index))

        train_df = pd.concat(train_parts).sample(frac=1, random_state=42).reset_index(drop=True)
        test_df = pd.concat(test_parts).sample(frac=1, random_state=42).reset_index(drop=True)

        val_parts = [
            group.sample(n=val_samples_per_class, random_state=42)
            for _, group in test_df.groupby(label_col)
        ]
        # Shuffle features and labels together so they cannot fall out of
        # alignment (previously two separate shuffles relied on sharing a seed).
        val_df = pd.concat(val_parts).sample(frac=1, random_state=42).reset_index(drop=True)

        y_train = train_df[label_col]
        X_train = train_df.drop(columns=[label_col])

        y_test = test_df[label_col]
        X_test = test_df.drop(columns=[label_col])

        y_val = val_df[label_col]
        X_val = val_df.drop(columns=[label_col])

        return X_train, X_val, X_test, y_train, y_val, y_test

In [3]:
class ImagePreprocessor:
    """Attach the on-disk image path of each product to a DataFrame."""

    def __init__(self, filepath='/content/data/data'):
        # Root directory; images are looked up under <filepath>/image_train.
        self.filepath = filepath

    def preprocess_images_in_df(self, df):
        """Add an ``image_path`` column to ``df`` in place.

        The path is built from the ``imageid`` and ``productid`` columns.
        Nothing is returned; the caller's DataFrame is modified.
        """
        path_prefix = f"{self.filepath}/image_train/image_"
        image_ids = df['imageid'].astype(str)
        product_ids = df['productid'].astype(str)
        df['image_path'] = path_prefix + image_ids + "_product_" + product_ids + '.jpg'


class TextPreprocessor:
    """Clean raw product text into a short, lower-cased token string."""

    def __init__(self):
        # Download the NLTK resources used by the tokenizer, the stop-word
        # list and the lemmatizer.
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('wordnet')
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('french'))  # Another language can be chosen if needed

    def preprocess_text(self, text):
        """Return at most the first 10 cleaned tokens of ``text``, space-joined.

        Steps: strip HTML, replace every non-letter character with a space,
        lower-case and tokenize, drop stop words, lemmatize.

        Args:
            text: Raw text. Non-string values (e.g. NaN for a missing cell)
                yield an empty string instead of raising.

        Returns:
            The cleaned text as a single string.
        """
        if not isinstance(text, str):
            return ''

        # Remove HTML tags.
        text = BeautifulSoup(text, 'html.parser').get_text()

        # Replace non-letter characters. The class [\W\d_] is Unicode-aware,
        # so accented letters (é, è, ç, ...) are kept. The former ASCII-only
        # [^a-zA-Z] split French words apart at every accent, and accented
        # stop words could therefore never match.
        text = re.sub(r'[\W\d_]', ' ', text)

        # Tokenization.
        words = word_tokenize(text.lower())

        # Stop-word removal and lemmatization.
        filtered_words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]

        return ' '.join(filtered_words[:10])

    def preprocess_text_in_df(self, df, columns):
        """Apply ``preprocess_text`` to each listed column of ``df`` in place."""
        for column in columns:
            df[column] = df[column].apply(self.preprocess_text)



In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard

class TextLSTMModel:
    """Embedding + LSTM classifier over product descriptions (27 output classes)."""

    def __init__(self, max_words=10000, max_sequence_length=10):
        self.max_words = max_words
        self.max_sequence_length = max_sequence_length
        self.tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
        self.model = None

    def _to_padded(self, texts):
        """Convert texts to integer sequences padded/truncated at the end."""
        return pad_sequences(
            self.tokenizer.texts_to_sequences(texts),
            maxlen=self.max_sequence_length,
            padding='post',
            truncating='post',
        )

    def preprocess_and_fit(self, X_train, y_train, X_val, y_val):
        """Fit the tokenizer, build the network and train it.

        The tokenizer configuration is written to
        ``/content/tokenizer_config.json``. The best checkpoint goes to
        ``/content/best_lstm_model.h5`` and TensorBoard logs to
        ``/content/logs``. The trained model is stored in ``self.model``.

        Args:
            X_train, X_val: DataFrames with a ``description`` column.
            y_train, y_val: Integer class labels, one-hot encoded here into
                27 classes.
        """
        self.tokenizer.fit_on_texts(X_train['description'])

        # Persist the fitted vocabulary so it can be reloaded later.
        serialized_tokenizer = self.tokenizer.to_json()
        with open('/content/tokenizer_config.json', 'w', encoding='utf-8') as json_file:
            json_file.write(serialized_tokenizer)

        fit_sequences = self._to_padded(X_train['description'])
        eval_sequences = self._to_padded(X_val['description'])

        # Network: token ids -> embedding -> LSTM -> softmax over 27 classes.
        token_ids = Input(shape=(self.max_sequence_length,))
        embedded = Embedding(input_dim=self.max_words, output_dim=128)(token_ids)
        encoded = LSTM(128)(embedded)
        class_proba = Dense(27, activation='softmax')(encoded)

        self.model = Model(inputs=[token_ids], outputs=class_proba)
        self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

        lstm_callbacks = [
            # Keep only the best model seen so far.
            ModelCheckpoint(filepath='/content/best_lstm_model.h5', save_best_only=True),
            # Stop once the monitored metric has not improved for 3 epochs.
            EarlyStopping(patience=3, restore_best_weights=True),
            # Write logs for TensorBoard.
            TensorBoard(log_dir='/content/logs'),
        ]

        fit_targets = tf.keras.utils.to_categorical(y_train, num_classes=27)
        eval_targets = tf.keras.utils.to_categorical(y_val, num_classes=27)

        self.model.fit(
            [fit_sequences],
            fit_targets,
            epochs=100,
            batch_size=32,
            validation_data=([eval_sequences], eval_targets),
            callbacks=lstm_callbacks,
        )


In [5]:
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
import pandas as pd

class ImageVGG16Model:
    """Image classifier: frozen ImageNet VGG16 base with a small dense head."""

    def __init__(self):
        # Set by preprocess_and_fit().
        self.model = None

    def _build_generator(self, frame, class_names, batch_size, shuffle):
        """Create a directory-less image generator over ``frame``.

        ``frame`` needs an ``image_path`` column and a string ``prdtypecode``
        column.
        """
        # Local import keeps this class usable on its own.
        from tensorflow.keras.applications.vgg16 import preprocess_input

        # Apply the VGG16 preprocessing during training so the inputs match
        # the pretrained weights and any later inference path that calls
        # preprocess_input. Previously the generator fed raw pixel values.
        datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
        return datagen.flow_from_dataframe(
            dataframe=frame,
            x_col='image_path',
            y_col='prdtypecode',
            target_size=(224, 224),  # VGG16 input size
            batch_size=batch_size,
            class_mode='categorical',  # one-hot targets
            # Pin the label -> output-column order to the integer order.
            # Without this, classes are inferred from the string labels in
            # alphanumeric order ('0', '1', '10', '11', ..., '2', ...), so
            # column i would not correspond to class i.
            classes=class_names,
            shuffle=shuffle,
        )

    def preprocess_and_fit(self, X_train, y_train, X_val, y_val):
        """Build the generators and the model, then train it.

        The best checkpoint is written to ``/content/best_vgg16_model.h5``
        and TensorBoard logs to ``/content/logs``. The trained model is
        stored in ``self.model``.

        Args:
            X_train, X_val: DataFrames with an ``image_path`` column.
            y_train, y_val: Series named ``prdtypecode``. Assumed to hold
                integer labels 0..26 (rows with other labels are ignored by
                the generator) — confirm against the caller.
        """
        # Parameters
        batch_size = 32
        num_classes = 27
        class_names = [str(i) for i in range(num_classes)]

        # flow_from_dataframe needs string labels for class_mode='categorical'.
        df_train = pd.concat([X_train, y_train.astype(str)], axis=1)
        df_val = pd.concat([X_val, y_val.astype(str)], axis=1)

        train_generator = self._build_generator(df_train, class_names, batch_size, shuffle=True)
        # No shuffling for the validation set.
        val_generator = self._build_generator(df_val, class_names, batch_size, shuffle=False)

        image_input = Input(shape=(224, 224, 3))

        vgg16_base = VGG16(include_top=False, weights='imagenet', input_tensor=image_input)

        x = vgg16_base.output
        x = Flatten()(x)
        x = Dense(256, activation='relu')(x)
        output = Dense(num_classes, activation='softmax')(x)

        self.model = Model(inputs=vgg16_base.input, outputs=output)

        # Freeze the convolutional base; only the new dense head is trained.
        for layer in vgg16_base.layers:
            layer.trainable = False

        self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

        vgg_callbacks = [
            # Keep only the best model seen so far.
            ModelCheckpoint(filepath='/content/best_vgg16_model.h5', save_best_only=True),
            # Stop once the monitored metric has not improved for 3 epochs.
            EarlyStopping(patience=3, restore_best_weights=True),
            # Write logs for TensorBoard.
            TensorBoard(log_dir='/content/logs'),
        ]

        self.model.fit(
            train_generator,
            epochs=100,
            validation_data=val_generator,
            callbacks=vgg_callbacks
        )


In [6]:



import pandas as pd
from sklearn.utils import resample
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from sklearn.metrics import accuracy_score
import numpy as np

class concatenate:
    """Weighted-average ensemble of a text (LSTM) and an image (VGG16) model."""

    def __init__(self, tokenizer, lstm, vgg16):
        self.tokenizer = tokenizer  # fitted tokenizer exposing texts_to_sequences()
        self.lstm = lstm            # model exposing predict() on padded sequences
        self.vgg16 = vgg16          # model exposing predict() on image batches

    def preprocess_image(self, image_path, target_size):
        """Load one image, resize it and apply the VGG16 input preprocessing.

        Args:
            image_path: Path of the image file.
            target_size: ``(height, width)``. A longer tuple such as
                ``(224, 224, 3)`` is accepted; only the first two entries
                are used.

        Returns:
            The preprocessed image as an array.
        """
        if target_size is not None:
            target_size = tuple(target_size[:2])
        img = load_img(image_path, target_size=target_size)
        img_array = img_to_array(img)
        img_array = preprocess_input(img_array)
        return img_array

    def predict(self, X_train, y_train, new_samples_per_class=50, max_sequence_length=10, num_classes=27):
        """Predict with both models on a class-balanced subsample.

        Args:
            X_train: DataFrame with ``description`` and ``image_path`` columns.
            y_train: Integer labels aligned row by row with ``X_train``.
            new_samples_per_class: Rows drawn (without replacement) per class.
            max_sequence_length: Padded length of the token sequences.
            num_classes: Number of classes; labels ``0..num_classes-1`` are
                sampled.

        Returns:
            ``(lstm_proba, vgg16_proba, labels)`` where ``labels`` is a 1-D
            integer array matching the rows of both probability arrays.

        Raises:
            ValueError: Raised by ``resample`` when a class has fewer rows
                than ``new_samples_per_class``.
        """
        labels = np.asarray(y_train).ravel()

        # Work with positions so the result does not depend on the DataFrame
        # index, and gather everything in a single selection instead of
        # growing DataFrames inside the loop.
        picked = []
        for class_label in range(num_classes):
            positions = np.flatnonzero(labels == class_label)
            picked.append(
                resample(positions, n_samples=new_samples_per_class, replace=False, random_state=42)
            )
        picked = np.concatenate(picked)

        new_X_train = X_train.iloc[picked].reset_index(drop=True)
        # Length follows the actual sample count (was hard-coded to 1350).
        new_y_train = labels[picked].astype('int')

        train_sequences = self.tokenizer.texts_to_sequences(new_X_train['description'])
        train_padded_sequences = pad_sequences(
            train_sequences, maxlen=max_sequence_length, padding='post', truncating='post'
        )

        # Image preprocessing parameters: (height, width) expected by VGG16.
        target_size = (224, 224)
        images_train = np.stack(
            [self.preprocess_image(path, target_size) for path in new_X_train['image_path']]
        )

        lstm_proba = self.lstm.predict([train_padded_sequences])
        vgg16_proba = self.vgg16.predict([images_train])

        return lstm_proba, vgg16_proba, new_y_train

    def optimize(self, lstm_proba, vgg16_proba, y_train):
        """Grid-search the LSTM/VGG16 mixing weight that maximizes accuracy.

        Weights ``w`` in ``{0.00, 0.01, ..., 1.00}`` are tried for the LSTM,
        with ``1 - w`` for VGG16. Ties keep the smallest LSTM weight.

        NOTE(review): column ``i`` of both probability arrays must refer to
        the same class; verify how each model orders its outputs.

        Args:
            lstm_proba, vgg16_proba: Arrays of shape (n_samples, n_classes).
            y_train: True integer labels, length n_samples.

        Returns:
            ``(lstm_weight, vgg16_weight)``; never ``None``.

        Raises:
            ValueError: If no samples are given.
        """
        y_true = np.asarray(y_train).ravel()
        if y_true.size == 0:
            raise ValueError("optimize() needs at least one sample")
        lstm_proba = np.asarray(lstm_proba)
        vgg16_proba = np.asarray(vgg16_proba)

        best_weights = None
        # Start below any reachable accuracy so a result is returned even
        # when every candidate scores 0.0 (previously this returned None).
        best_accuracy = -1.0

        for lstm_weight in np.linspace(0, 1, 101):
            vgg16_weight = 1.0 - lstm_weight  # weights sum to 1

            combined_predictions = (lstm_weight * lstm_proba) + (vgg16_weight * vgg16_proba)
            final_predictions = np.argmax(combined_predictions, axis=1)
            accuracy = float(np.mean(final_predictions == y_true))

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_weights = (lstm_weight, vgg16_weight)

        return best_weights

In [8]:
import tensorflow as tf
from tensorflow import keras
import json


# End-to-end driver: load data, preprocess, train both models, then fit the
# ensemble weights. The names defined here (notably best_weights and keras)
# are used by the following cell.
data_importer = DataImporter()
df = data_importer.load_data()
# The test split is discarded; only the train and validation parts are used.
X_train, X_val, _, y_train, y_val, _ = data_importer.split_train_test(df)

# Preprocess text and images
# Both preprocessors modify the DataFrames in place.
text_preprocessor = TextPreprocessor()
image_preprocessor = ImagePreprocessor()
text_preprocessor.preprocess_text_in_df(X_train, columns=['description'])
text_preprocessor.preprocess_text_in_df(X_val, columns=['description'])
image_preprocessor.preprocess_images_in_df(X_train)
image_preprocessor.preprocess_images_in_df(X_val)

# Train LSTM model
text_lstm_model = TextLSTMModel()
text_lstm_model.preprocess_and_fit(X_train, y_train, X_val, y_val)

# Train VGG16 model
image_vgg16_model = ImageVGG16Model()
image_vgg16_model.preprocess_and_fit(X_train, y_train, X_val, y_val)

# Reload the artifacts written to disk during training (tokenizer config and
# the best checkpoint of each model) rather than using the in-memory objects.
with open('/content/tokenizer_config.json', 'r', encoding='utf-8') as json_file:
    tokenizer_config = json_file.read()
tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
    tokenizer_config
)
lstm = keras.models.load_model('/content/best_lstm_model.h5')
vgg16 = keras.models.load_model('/content/best_vgg16_model.h5')

# NOTE(review): the ensemble weights are fitted on rows sampled from X_train,
# i.e. data both models were trained on — consider using held-out data instead.
model_concatenate = concatenate(tokenizer, lstm, vgg16)
lstm_proba, vgg16_proba, new_y_train = model_concatenate.predict(X_train, y_train)
best_weights = model_concatenate.optimize(lstm_proba, vgg16_proba, new_y_train)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Epoch 1/100
Epoch 2/100
  1/507 [..............................] - ETA: 7s - loss: 1.1437 - accuracy: 0.6562

  saving_api.save_model(


Epoch 3/100
Epoch 4/100
Epoch 5/100
Found 16200 validated image filenames belonging to 27 classes.
Found 1350 validated image filenames belonging to 27 classes.
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/100

  saving_api.save_model(


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


In [9]:
# Persist the ensemble weights found by the grid search.
with open("/content/best_weights.pkl", "wb") as fichier:
    pickle.dump(best_weights, fichier)

num_classes = 27

# The ensemble model takes the two per-class probability vectors as inputs.
proba_lstm = keras.layers.Input(shape=(num_classes,))
proba_vgg16 = keras.layers.Input(shape=(num_classes,))

# Weighted sum built from standard layers with the weights stored in the layer
# configuration. A Lambda closing over the notebook global `best_weights` would
# be saved as Python bytecode that still expects that global at load time, so
# the saved file would not be self-contained.
weighted_proba = keras.layers.Add()([
    keras.layers.Rescaling(float(best_weights[0]))(proba_lstm),
    keras.layers.Rescaling(float(best_weights[1]))(proba_vgg16),
])

concatenate_model = keras.models.Model(inputs=[proba_lstm, proba_vgg16], outputs=weighted_proba)

# Save the model in h5 format
concatenate_model.save('/content/concatenate.h5')

  saving_api.save_model(
