In [1]:
import os

import mlflow
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import RandomOverSampler
from joblib import load
from skimage import io, color, feature, transform
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import load_model

In [2]:
# MLflow configuration: where runs are stored (a relative path) and which
# experiment they are grouped under.
TRACKING_URI = "../../mlruns"
EXPERIMENT_NAME = "features RF"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Log scikit-learn fits automatically, with dataset logging switched off.
mlflow.sklearn.autolog(log_datasets=False)

2024/05/05 16:59:09 INFO mlflow.tracking.fluent: Experiment with name 'features RF' does not exist. Creating a new experiment.


In [3]:
# Input locations: the training images and the processed feature / label CSVs.
images_path = "/Users/jeremyrava/Documents/01 - Projets/fev24_bds_rakuten/data/raw/images/image_train"
X_csv_path = "/Users/jeremyrava/Documents/01 - Projets/fev24_bds_rakuten/data/processed/X_train_update (komla).csv"
y_csv_path = "/Users/jeremyrava/Documents/01 - Projets/fev24_bds_rakuten/data/processed/Y_train_CVw08PX (komla).csv"

# Read features and labels.
X_df, y_df = pd.read_csv(X_csv_path), pd.read_csv(y_csv_path)

# No subsampling is applied: every row is kept.  `sample_X` is the same object
# as `X_df` (not a copy), and `sample_y` is the full "prdtypecode" column.
sample_X = X_df
sample_y = y_df["prdtypecode"]


def _image_file_path(row):
    """Return the full path of the image file that belongs to one product row."""
    file_name = f"image_{row.imageid}_product_{row.productid}.jpg"
    return os.path.join(images_path, file_name)


# Store each row's image path in a new column.  Because `sample_X` aliases
# `X_df`, the column is added to `X_df` as well.
sample_X["image_path"] = sample_X.apply(_image_file_path, axis=1)

In [4]:
# Text vectorization settings: vocabulary size cap and fixed output length.
max_tokens = 10000
output_sequence_length = 250

# Missing descriptions are replaced by empty strings before vectorization.
descriptions = sample_X["description"].fillna("")

vectorize_layer = TextVectorization(
    output_mode="int",
    max_tokens=max_tokens,
    output_sequence_length=output_sequence_length,
)

# Learn the vocabulary from the descriptions, then turn every description
# into its integer token sequence.
vectorize_layer.adapt(descriptions)
X_text_vectors = vectorize_layer(descriptions)

2024-05-05 16:59:16.298448: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2024-05-05 16:59:16.298469: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-05-05 16:59:16.298475: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-05-05 16:59:16.298704: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-05 16:59:16.298717: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
# Extract the HOG (histogram of oriented gradients) features of one image
def extract_hog_features(
    image_path,
    image_size=(128, 64),
    pixels_per_cell=(16, 16),
    cells_per_block=(1, 1),
):
    """Read an image file and return its HOG descriptor.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.
    image_size : tuple of int, optional
        Shape the grayscale image is resized to before computing HOG.
    pixels_per_cell : tuple of int, optional
        Forwarded to ``skimage.feature.hog``.
    cells_per_block : tuple of int, optional
        Forwarded to ``skimage.feature.hog``.

    Returns
    -------
    The value returned by ``skimage.feature.hog`` for the resized image.
    """
    image = io.imread(image_path)

    # rgb2gray only accepts a trailing axis of size 3, so 4-channel input is
    # first reduced to RGB and 2-D (already grayscale) input skips the
    # conversion instead of raising.
    if image.ndim == 3 and image.shape[-1] == 4:
        image = color.rgba2rgb(image)
    image_gray = color.rgb2gray(image) if image.ndim == 3 else image

    image_resized = transform.resize(image_gray, image_size, anti_aliasing=True)
    hog_features = feature.hog(
        image_resized,
        pixels_per_cell=pixels_per_cell,
        cells_per_block=cells_per_block,
        visualize=False,
    )
    return hog_features

In [6]:
# Build the image feature matrix: one HOG descriptor per row of sample_X,
# in the same order as the "image_path" column.
hog_rows = list(map(extract_hog_features, sample_X["image_path"]))
features_images = np.array(hog_rows)

In [7]:
# Rééquilibrage des classes avec RandomOverSampler pour les deux types de données
ros = RandomOverSampler(random_state=42)
X_resampled_images, y_resampled_images = ros.fit_resample(features_images, sample_y)
X_resampled_text, y_resampled_text = ros.fit_resample(X_text_vectors, sample_y)

In [8]:
# Division des données rééquilibrées en ensembles d'entraînement et de test pour les images et le texte
X_train_images, X_test_images, y_train_images, y_test_images = train_test_split(
    X_resampled_images, y_resampled_images, test_size=0.2, random_state=42
)
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X_resampled_text, y_resampled_text, test_size=0.2, random_state=42
)

In [9]:
# Put the image and text features side by side, for the training set and the
# test set alike (the inputs are already NumPy arrays, so no densifying step).
X_train_combined, X_test_combined = (
    np.hstack(feature_pair)
    for feature_pair in (
        (X_train_images, X_train_text),
        (X_test_images, X_test_text),
    )
)

In [10]:
# Train a single classifier on the combined image + text features.
# RandomForestClassifier is imported with the other dependencies in the first
# cell of the notebook.

# The combined matrix places image and text columns of the same row side by
# side, so both modalities must carry the same labels.  Equal labels are a
# necessary (not sufficient) condition for the rows being aligned; check it
# instead of only assuming it.
assert np.array_equal(y_train_images, y_train_text), (
    "image and text training labels differ: feature rows are misaligned"
)

combined_model = RandomForestClassifier(random_state=42)

# Fit inside an explicit MLflow run and print its id so the run can be found
# again (scikit-learn autologging is enabled earlier in the notebook).
with mlflow.start_run() as run:
    print("Run id:", run.info.run_id)
    combined_model.fit(X_train_combined, y_train_images)

Run id: 3ed6f6fd6971442db10cff63789ff786




In [11]:
# Predict on the held-out features and save the per-class metrics as a CSV
# table (one row per class / average).
y_pred_test = combined_model.predict(X_test_combined)
report = classification_report(y_test_images, y_pred_test, output_dict=True)
df_report = pd.DataFrame(report).T
df_report.to_csv("report_Features_RF.csv")

In [11]:
# Predict on the test features, then print the accuracy and the per-class
# classification report.
y_pred_combined = combined_model.predict(X_test_combined)

combined_accuracy = accuracy_score(y_test_images, y_pred_combined)
combined_report = classification_report(y_test_images, y_pred_combined)

print("Combined Features Model Accuracy:", combined_accuracy)
print("Classification Report for Combined Features Model:")
print(combined_report)

Combined Features Model Accuracy: 0.9251743817374762
Classification Report for Combined Features Model:
              precision    recall  f1-score   support

          10       0.93      0.95      0.94       120
          40       0.95      0.98      0.97       113
          50       0.96      1.00      0.98       129
          60       0.95      1.00      0.97       112
        1140       0.95      0.93      0.94       117
        1160       0.97      0.93      0.95       122
        1180       1.00      1.00      1.00       125
        1280       0.89      0.84      0.86       128
        1281       0.98      1.00      0.99       127
        1300       0.88      0.83      0.85       109
        1301       1.00      1.00      1.00       123
        1302       0.92      0.94      0.93       122
        1320       0.97      0.98      0.97       113
        1560       0.79      0.73      0.76       119
        1920       0.90      0.95      0.93       118
        1940       0.98      1.