In [1]:
import pandas as pd
import numpy as np
import os, re, string
from skimage import io, color, feature, transform
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from sklearn.metrics import accuracy_score, classification_report
import mlflow
from tqdm.notebook import tqdm

In [2]:
mlflow.set_tracking_uri("../../mlruns")

In [3]:
# Définition des chemins
images_path = "/Users/jeremyrava/Documents/01 - Projets/fev24_bds_rakuten/data/raw/images/image_train"
X_csv_path = (
    "/Users/jeremyrava/Documents/01 - Projets/fev24_bds_rakuten/data/raw/x_train.csv"
)
y_csv_path = (
    "/Users/jeremyrava/Documents/01 - Projets/fev24_bds_rakuten/data/raw/y_train.csv"
)

# Chargement des données
X_df = pd.read_csv(X_csv_path, index_col=0)
y_df = pd.read_csv(y_csv_path, index_col=0)

X_df["text"] = np.where(
    X_df["description"].isna(),
    X_df["designation"].astype(str),
    X_df["designation"].astype(str) + " " + X_df["description"].astype(str),
)

# Réduire les données aux 5000 premières lignes
sample_X = X_df
target = y_df["prdtypecode"]

# Ajout du chemin complet des images dans sample_X
sample_X["image_path"] = sample_X.apply(
    lambda row: os.path.join(
        images_path, f"image_{row.imageid}_product_{row.productid}.jpg"
    ),
    axis=1,
)
data = sample_X[["text", "image_path"]]

In [4]:
def custom_standardization(input_data):
    """
    Custom standardization function for text data.

    Args:
        input_data: The input text data.

    Returns:
        The standardized text data.
    """
    # Decode the input HTML using UTF-8 encoding.
    decoded_html = tf.strings.unicode_decode(input_data, "UTF-8")

    # Encode the decoded HTML back into HTML for further processing.
    encoded_html = tf.strings.unicode_encode(decoded_html, "UTF-8")

    # Strip all HTML tags from the input data using a regular expression replace operation.
    stripped_html = tf.strings.regex_replace(encoded_html, "<[^>]*>", " ")

    # Convert the input text to lowercase for consistency.
    lowercase = tf.strings.lower(stripped_html)

    # Remove extra whitespace by replacing one or more spaces with a single space.
    cleaned_input_data = tf.strings.regex_replace(lowercase, r"\s+", " ")

    # Replace punctuation characters with empty strings (i.e., remove them).
    return tf.strings.regex_replace(
        cleaned_input_data, "[%s]" % re.escape(string.punctuation), ""
    )


vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=100000,
    output_mode="int",
    output_sequence_length=250,
)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
vectorize_layer.adapt(sample_X["text"])

2024-05-09 11:31:58.008092: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2024-05-09 11:31:58.008111: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-05-09 11:31:58.008129: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-05-09 11:31:58.008157: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-09 11:31:58.008172: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [4]:
# Configuration pour la vectorisation du texte
max_tokens = 10000
output_sequence_length = 250
vectorize_layer = TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=output_sequence_length,
)
vectorize_layer.adapt(sample_X["description"].fillna(""))

# Appliquer la vectorisation au texte pour obtenir les vecteurs
X_text_vectors = vectorize_layer(sample_X["description"].fillna(""))

2024-05-09 10:01:09.659414: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2024-05-09 10:01:09.659445: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-05-09 10:01:09.659453: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-05-09 10:01:09.659714: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-09 10:01:09.659735: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
# Fonction pour extraire les caractéristiques HOG d'une image
def extract_hog_features(image_path):
    image = io.imread(image_path)
    image_gray = color.rgb2gray(image)
    image_resized = transform.resize(image_gray, (128, 64), anti_aliasing=True)
    hog_features = feature.hog(
        image_resized, pixels_per_cell=(16, 16), cells_per_block=(1, 1), visualize=False
    )
    return hog_features

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=42
)

In [7]:
# Appliquer la vectorisation au texte pour obtenir les vecteurs
X_train_text_vectors = vectorize_layer(X_train["text"])
print("X_train_text vectorised")
X_test_text_vectors = vectorize_layer(X_test["text"])
print("X_test_text vectorised")

# Préparation des caractéristiques HOG pour les images
X_train_features_images = np.array(
    [extract_hog_features(path) for path in tqdm(X_train["image_path"])]
)
X_test_features_images = np.array(
    [extract_hog_features(path) for path in tqdm(X_test["image_path"])]
)

X_train_text vectorised
X_test_text vectorised


  0%|          | 0/67932 [00:00<?, ?it/s]

  0%|          | 0/16984 [00:00<?, ?it/s]

In [8]:
mlflow.sklearn.autolog(log_datasets=False, disable=True)  # pour ne pas log
# Rééquilibrage des classes avec RandomOverSampler pour les deux types de données
# ros = RandomOverSampler(random_state=42)
ros = SMOTE(random_state=42)
X_train_resampled_text, y_resampled_text = ros.fit_resample(
    X_train_text_vectors, y_train
)
X_train_resampled_images, y_resampled_images = ros.fit_resample(
    X_train_features_images, y_train
)

In [9]:
# Concaténation des caractéristiques image et texte (sans conversion à dense si déjà en numpy array)
X_train_combined = np.hstack((X_train_resampled_images, X_train_resampled_text))
X_test_combined = np.hstack((X_test_features_images, X_test_text_vectors))

In [11]:
# Utiliser un modèle unique pour la classification sur les caractéristiques combinées
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

combined_model = RandomForestClassifier(random_state=42)
param_grid = {
    "n_estimators": [10, 50, 100],
    "max_depth": [None],
    "min_samples_split": [2, 5, 10],
}

grid_search = GridSearchCV(
    combined_model, param_grid, cv=5, scoring="f1_weighted", verbose=3, n_jobs=-1
)


mlflow.set_experiment("features RF")
mlflow.sklearn.autolog(log_datasets=False, disable=False)
description = "SMOTE(random_state=42) sur la totlité de X_train (sans diminuer les ecarts des targets)"
with mlflow.start_run(description=description) as run:
    print("Run id:", run.info.run_id)
    grid_search.fit(
        X_train_combined, y_resampled_text
    )  # y_train_images doit être identique à y_train_text


best_params = grid_search.best_params_

print(best_params)
print(grid_search.best_score_)

Run id: 471d777b219546fc95a77aa6c782dca8
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.709 total time= 1.2min
[CV 2/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.700 total time= 1.2min
[CV 5/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.739 total time= 1.3min
[CV 4/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.743 total time= 1.3min
[CV 3/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.730 total time= 1.3min
[CV 2/5] END max_depth=None, min_samples_split=5, n_estimators=10;, score=0.695 total time= 1.0min
[CV 1/5] END max_depth=None, min_samples_split=5, n_estimators=10;, score=0.698 total time= 1.0min
[CV 3/5] END max_depth=None, min_samples_split=5, n_estimators=10;, score=0.717 total time=  53.8s
[CV 4/5] END max_depth=None, min_samples_split=5, n_estimators=10;, score=0.733 total time=  54.3s
[CV 5/5]

2024/05/09 12:12:19 INFO mlflow.sklearn.utils: Logging the 5 best runs, 4 runs will be omitted.


{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
0.8484076293421952


In [12]:
modele_selected = "471d777b219546fc95a77aa6c782dca8"  # ID du run du modèle sélectionné
logged_model = f"runs:/{modele_selected}/best_estimator"  # choisir "/model" pour model standard ou "/best_estimator" pour un GridSearchCV

# Load model as a PyFuncModel.
loaded_model = mlflow.sklearn.load_model(logged_model)

# Prédiction et évaluation
y_pred_combined = loaded_model.predict(X_test_combined)
print("Combined Features Model Accuracy:", accuracy_score(y_test, y_pred_combined))
print("Classification Report for Combined Features Model:")
print(classification_report(y_test, y_pred_combined))
report = classification_report(y_test, y_pred_combined, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report.to_csv(f"report_Features_RF_{modele_selected}.csv")

Combined Features Model Accuracy: 0.4822774375883184
Classification Report for Combined Features Model:
              precision    recall  f1-score   support

          10       0.37      0.59      0.45       612
          40       0.47      0.45      0.46       521
          50       0.35      0.24      0.28       357
          60       0.22      0.61      0.32       161
        1140       0.33      0.47      0.39       539
        1160       0.79      0.76      0.78       786
        1180       0.33      0.23      0.27       146
        1280       0.24      0.12      0.16       961
        1281       0.22      0.08      0.11       424
        1300       0.40      0.40      0.40       974
        1301       0.57      0.53      0.55       169
        1302       0.34      0.28      0.31       507
        1320       0.42      0.24      0.31       672
        1560       0.37      0.42      0.39      1013
        1920       0.69      0.70      0.69       841
        1940       0.26      0.