In [1]:
import pandas as pd
import numpy as np
import os, re, string
from skimage import io, color, feature, transform
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from sklearn.metrics import accuracy_score, classification_report
import mlflow
from tqdm.notebook import tqdm

In [2]:
mlflow.set_tracking_uri("../../mlruns")

In [3]:
# Définition des chemins
images_path = "/Users/jeremyrava/Documents/01 - Projets/fev24_bds_rakuten/data/raw/images/image_train"
X_csv_path = (
    "/Users/jeremyrava/Documents/01 - Projets/fev24_bds_rakuten/data/raw/x_train.csv"
)
y_csv_path = (
    "/Users/jeremyrava/Documents/01 - Projets/fev24_bds_rakuten/data/raw/y_train.csv"
)

# Chargement des données
X_df = pd.read_csv(X_csv_path, index_col=0)
y_df = pd.read_csv(y_csv_path, index_col=0)

X_df["text"] = np.where(
    X_df["description"].isna(),
    X_df["designation"].astype(str),
    X_df["designation"].astype(str) + " " + X_df["description"].astype(str),
)

# Réduire les données aux 5000 premières lignes
sample_X = X_df
target = y_df["prdtypecode"]

# Ajout du chemin complet des images dans sample_X
sample_X["image_path"] = sample_X.apply(
    lambda row: os.path.join(
        images_path, f"image_{row.imageid}_product_{row.productid}.jpg"
    ),
    axis=1,
)
data = sample_X[["text", "image_path"]]

In [4]:
def custom_standardization(input_data):
    """
    Custom standardization function for text data.

    Args:
        input_data: The input text data.

    Returns:
        The standardized text data.
    """
    # Decode the input HTML using UTF-8 encoding.
    decoded_html = tf.strings.unicode_decode(input_data, "UTF-8")

    # Encode the decoded HTML back into HTML for further processing.
    encoded_html = tf.strings.unicode_encode(decoded_html, "UTF-8")

    # Strip all HTML tags from the input data using a regular expression replace operation.
    stripped_html = tf.strings.regex_replace(encoded_html, "<[^>]*>", " ")

    # Convert the input text to lowercase for consistency.
    lowercase = tf.strings.lower(stripped_html)

    # Remove extra whitespace by replacing one or more spaces with a single space.
    cleaned_input_data = tf.strings.regex_replace(lowercase, r"\s+", " ")

    # Replace punctuation characters with empty strings (i.e., remove them).
    return tf.strings.regex_replace(
        cleaned_input_data, "[%s]" % re.escape(string.punctuation), ""
    )


vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=100000,
    output_mode="int",
    output_sequence_length=250,
)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
vectorize_layer.adapt(sample_X["text"])

2024-05-10 09:54:34.804363: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2024-05-10 09:54:34.804386: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-05-10 09:54:34.804393: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-05-10 09:54:34.804616: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-10 09:54:34.804636: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [6]:
# Fonction pour extraire les caractéristiques HOG d'une image
def extract_hog_features(image_path):
    image = io.imread(image_path)
    image_gray = color.rgb2gray(image)
    image_resized = transform.resize(image_gray, (128, 64), anti_aliasing=True)
    hog_features = feature.hog(
        image_resized, pixels_per_cell=(16, 16), cells_per_block=(1, 1), visualize=False
    )
    return hog_features

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=42
)

In [8]:
# Appliquer la vectorisation au texte pour obtenir les vecteurs
X_train_text_vectors = vectorize_layer(X_train["text"])
print("X_train_text vectorised")
X_test_text_vectors = vectorize_layer(X_test["text"])
print("X_test_text vectorised")

# Préparation des caractéristiques HOG pour les images
X_train_features_images = np.array(
    [extract_hog_features(path) for path in tqdm(X_train["image_path"])]
)
X_test_features_images = np.array(
    [extract_hog_features(path) for path in tqdm(X_test["image_path"])]
)

X_train_text vectorised
X_test_text vectorised


  0%|          | 0/67932 [00:00<?, ?it/s]

  0%|          | 0/16984 [00:00<?, ?it/s]

In [9]:
mlflow.sklearn.autolog(log_datasets=False, disable=True)  # pour ne pas log

reequilibrage = False

if reequilibrage:
    # Rééquilibrage des classes avec RandomOverSampler pour les deux types de données
    # ros = RandomOverSampler(random_state=42)
    ros = SMOTE(random_state=42)
    X_train_resampled_text, y_resampled_text = ros.fit_resample(
        X_train_text_vectors, y_train
    )
    X_train_resampled_images, y_resampled_images = ros.fit_resample(
        X_train_features_images, y_train
    )
else:
    X_train_resampled_text = X_train_text_vectors
    X_train_resampled_images = X_train_features_images
    y_resampled_text = y_train
    y_resampled_images = y_train

In [10]:
# Concaténation des caractéristiques image et texte (sans conversion à dense si déjà en numpy array)
X_train_combined = np.hstack((X_train_resampled_images, X_train_resampled_text))
X_test_combined = np.hstack((X_test_features_images, X_test_text_vectors))

In [11]:
# Utiliser un modèle unique pour la classification sur les caractéristiques combinées
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

combined_model = RandomForestClassifier(random_state=42)
param_grid = {
    "n_estimators": [10, 50, 100],
    "max_depth": [None],
    "min_samples_split": [2, 5, 10],
}

grid_search = GridSearchCV(
    combined_model, param_grid, cv=5, scoring="f1_weighted", verbose=3, n_jobs=-1
)


mlflow.set_experiment("features RF")
mlflow.sklearn.autolog(log_datasets=False, disable=False)
description = "SMOTE(random_state=42) sur la totlité de X_train (sans diminuer les ecarts des targets)"
with mlflow.start_run(description=description) as run:
    print("Run id:", run.info.run_id)
    grid_search.fit(
        X_train_combined, y_resampled_text
    )  # y_train_images doit être identique à y_train_text


best_params = grid_search.best_params_

print(best_params)
print(grid_search.best_score_)

Run id: 7436e7b427ac4caaae6d76fa5df756b3
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.380 total time=  11.6s
[CV 4/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.384 total time=  11.6s
[CV 3/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.381 total time=  11.6s
[CV 2/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.377 total time=  11.8s
[CV 5/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.383 total time=  12.0s
[CV 1/5] END max_depth=None, min_samples_split=5, n_estimators=10;, score=0.377 total time=  10.2s
[CV 2/5] END max_depth=None, min_samples_split=5, n_estimators=10;, score=0.379 total time=  10.2s
[CV 3/5] END max_depth=None, min_samples_split=5, n_estimators=10;, score=0.383 total time=   9.8s
[CV 4/5] END max_depth=None, min_samples_split=5, n_estimators=10;, score=0.377 total time=   9.9s
[CV 5/5]



[CV 3/5] END max_depth=None, min_samples_split=2, n_estimators=100;, score=0.448 total time= 1.7min
[CV 4/5] END max_depth=None, min_samples_split=2, n_estimators=100;, score=0.438 total time= 1.7min
[CV 5/5] END max_depth=None, min_samples_split=10, n_estimators=10;, score=0.378 total time=   9.4s
[CV 1/5] END max_depth=None, min_samples_split=5, n_estimators=100;, score=0.434 total time= 1.6min
[CV 2/5] END max_depth=None, min_samples_split=5, n_estimators=100;, score=0.431 total time= 1.6min
[CV 1/5] END max_depth=None, min_samples_split=10, n_estimators=50;, score=0.417 total time=  46.4s
[CV 2/5] END max_depth=None, min_samples_split=10, n_estimators=50;, score=0.416 total time=  47.4s
[CV 3/5] END max_depth=None, min_samples_split=10, n_estimators=50;, score=0.424 total time=  47.4s
[CV 5/5] END max_depth=None, min_samples_split=10, n_estimators=50;, score=0.425 total time=  46.1s
[CV 4/5] END max_depth=None, min_samples_split=10, n_estimators=50;, score=0.417 total time=  47.4s


2024/05/10 10:11:24 INFO mlflow.sklearn.utils: Logging the 5 best runs, 4 runs will be omitted.


{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
0.440271164327161


In [12]:
modele_selected = "7436e7b427ac4caaae6d76fa5df756b3"  # ID du run du modèle sélectionné
logged_model = f"runs:/{modele_selected}/best_estimator"  # choisir "/model" pour model standard ou "/best_estimator" pour un GridSearchCV

# Load model as a PyFuncModel.
loaded_model = mlflow.sklearn.load_model(logged_model)

# Prédiction et évaluation
y_pred_combined = loaded_model.predict(X_test_combined)
print("Combined Features Model Accuracy:", accuracy_score(y_test, y_pred_combined))
print("Classification Report for Combined Features Model:")
print(classification_report(y_test, y_pred_combined))
report = classification_report(y_test, y_pred_combined, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report.to_csv(f"report_Features_RF_{modele_selected}.csv")

Combined Features Model Accuracy: 0.4774493641073952
Classification Report for Combined Features Model:
              precision    recall  f1-score   support

          10       0.41      0.48      0.44       612
          40       0.50      0.37      0.42       521
          50       0.85      0.14      0.24       357
          60       0.60      0.33      0.42       161
        1140       0.44      0.37      0.40       539
        1160       0.72      0.77      0.74       786
        1180       0.76      0.09      0.16       146
        1280       0.26      0.17      0.21       961
        1281       0.38      0.03      0.06       424
        1300       0.39      0.44      0.41       974
        1301       0.92      0.29      0.44       169
        1302       0.59      0.15      0.23       507
        1320       0.53      0.17      0.26       672
        1560       0.35      0.43      0.38      1013
        1920       0.69      0.68      0.68       841
        1940       0.81      0.