In [13]:
import pandas as pd
import numpy as np
import os
from skimage import io, color, feature, transform
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from sklearn.metrics import accuracy_score, classification_report
import mlflow

In [14]:
mlflow.set_tracking_uri("../../mlruns")
mlflow.set_experiment("features RF")
mlflow.sklearn.autolog(log_datasets=False)

In [15]:
# Définition des chemins
images_path = "/Users/jeremyrava/Documents/01 - Projets/fev24_bds_rakuten/data/raw/images/image_train"
X_csv_path = "/Users/jeremyrava/Documents/01 - Projets/fev24_bds_rakuten/data/processed/X_train_update (komla).csv"
y_csv_path = "/Users/jeremyrava/Documents/01 - Projets/fev24_bds_rakuten/data/processed/Y_train_CVw08PX (komla).csv"

# Chargement des données
X_df = pd.read_csv(X_csv_path)
y_df = pd.read_csv(y_csv_path)

# Réduire les données aux 5000 premières lignes
sample_X = X_df
sample_y = y_df["prdtypecode"]

# Ajout du chemin complet des images dans sample_X
sample_X["image_path"] = sample_X.apply(
    lambda row: os.path.join(
        images_path, f"image_{row.imageid}_product_{row.productid}.jpg"
    ),
    axis=1,
)

In [16]:
# Configuration pour la vectorisation du texte
max_tokens = 10000
output_sequence_length = 250
vectorize_layer = TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=output_sequence_length,
)
vectorize_layer.adapt(sample_X["description"].fillna(""))

# Appliquer la vectorisation au texte pour obtenir les vecteurs
X_text_vectors = vectorize_layer(sample_X["description"].fillna(""))

In [17]:
# Fonction pour extraire les caractéristiques HOG d'une image
def extract_hog_features(image_path):
    image = io.imread(image_path)
    image_gray = color.rgb2gray(image)
    image_resized = transform.resize(image_gray, (128, 64), anti_aliasing=True)
    hog_features = feature.hog(
        image_resized, pixels_per_cell=(16, 16), cells_per_block=(1, 1), visualize=False
    )
    return hog_features

In [18]:
# Préparation des caractéristiques HOG pour les images
features_images = np.array(
    [extract_hog_features(path) for path in sample_X["image_path"]]
)

In [20]:
# Rééquilibrage des classes avec RandomOverSampler pour les deux types de données
# ros = RandomOverSampler(random_state=42)
ros = SMOTE(random_state=42)
X_resampled_images, y_resampled_images = ros.fit_resample(features_images, sample_y)
X_resampled_text, y_resampled_text = ros.fit_resample(X_text_vectors, sample_y)

2024/05/08 17:57:27 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2b5bb69c0fc64be6bee0d478c1c8f946', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/05/08 17:57:30 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '79c643c6ab344632b6923fbb2ff91323', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/05/08 17:57:31 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '62075c38a52d4c81b0fd7b4f16ccae29', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/05/08 17:57:32 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd4e5efeb93fd485c805441de34fb7a14', which will track hyperparameters, performance metrics, model artifacts, and lineage i

In [21]:
# Division des données rééquilibrées en ensembles d'entraînement et de test pour les images et le texte
X_train_images, X_test_images, y_train_images, y_test_images = train_test_split(
    X_resampled_images, y_resampled_images, test_size=0.2, random_state=42
)
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X_resampled_text, y_resampled_text, test_size=0.2, random_state=42
)

In [22]:
# Concaténation des caractéristiques image et texte (sans conversion à dense si déjà en numpy array)
X_train_combined = np.hstack((X_train_images, X_train_text))
X_test_combined = np.hstack((X_test_images, X_test_text))

In [23]:
# Utiliser un modèle unique pour la classification sur les caractéristiques combinées
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

combined_model = RandomForestClassifier(random_state=42)
param_grid = {
    "n_estimators": [10, 50, 100],
    "max_depth": [None, 5, 8],
    "min_samples_split": [2, 5, 10],
}

grid_search = GridSearchCV(
    combined_model, param_grid, cv=5, scoring="f1_weighted", verbose=3, n_jobs=-1
)

with mlflow.start_run(description="SMOTE(42)") as run:
    print("Run id:", run.info.run_id)
    grid_search.fit(
        X_train_combined, y_train_images
    )  # y_train_images doit être identique à y_train_text


best_params = grid_search.best_params_

print(best_params)
print(grid_search.best_score_)

Run id: 2f235b11abd34080bb5b48073a6d7033
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 2/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.697 total time= 1.5min
[CV 3/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.695 total time= 1.7min
[CV 4/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.696 total time= 1.7min
[CV 5/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.698 total time= 1.7min
[CV 1/5] END max_depth=None, min_samples_split=2, n_estimators=10;, score=0.697 total time= 1.7min
[CV 2/5] END max_depth=None, min_samples_split=5, n_estimators=10;, score=0.689 total time= 1.0min
[CV 1/5] END max_depth=None, min_samples_split=5, n_estimators=10;, score=0.685 total time= 1.1min
[CV 4/5] END max_depth=None, min_samples_split=5, n_estimators=10;, score=0.682 total time= 1.3min
[CV 3/5] END max_depth=None, min_samples_split=5, n_estimators=10;, score=0.689 total time= 1.3min
[CV 5/

2024/05/08 19:30:54 INFO mlflow.sklearn.utils: Logging the 5 best runs, 22 runs will be omitted.


{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
0.8189696475041511


In [21]:
y_pred_test = combined_model.predict(X_test_combined)
report = classification_report(y_test_images, y_pred_test, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report.to_csv("report_Features_RF.csv")

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# Prédiction et évaluation
y_pred_combined = combined_model.predict(X_test_combined)
print(
    "Combined Features Model Accuracy:", accuracy_score(y_test_images, y_pred_combined)
)
print("Classification Report for Combined Features Model:")
print(classification_report(y_test_images, y_pred_combined))

Combined Features Model Accuracy: 0.9251743817374762
Classification Report for Combined Features Model:
              precision    recall  f1-score   support

          10       0.93      0.95      0.94       120
          40       0.95      0.98      0.97       113
          50       0.96      1.00      0.98       129
          60       0.95      1.00      0.97       112
        1140       0.95      0.93      0.94       117
        1160       0.97      0.93      0.95       122
        1180       1.00      1.00      1.00       125
        1280       0.89      0.84      0.86       128
        1281       0.98      1.00      0.99       127
        1300       0.88      0.83      0.85       109
        1301       1.00      1.00      1.00       123
        1302       0.92      0.94      0.93       122
        1320       0.97      0.98      0.97       113
        1560       0.79      0.73      0.76       119
        1920       0.90      0.95      0.93       118
        1940       0.98      1.

In [24]:
logged_model = "runs:/2f235b11abd34080bb5b48073a6d7033/best_estimator"

# Load model as a PyFuncModel.
loaded_model = mlflow.sklearn.load_model(logged_model)

# Prédiction et évaluation
y_pred_combined = loaded_model.predict(X_test_combined)
print(
    "Combined Features Model Accuracy:", accuracy_score(y_test_images, y_pred_combined)
)
print("Classification Report for Combined Features Model:")
print(classification_report(y_test_images, y_pred_combined))

Combined Features Model Accuracy: 0.849643563278855
Classification Report for Combined Features Model:
              precision    recall  f1-score   support

          10       0.79      0.90      0.84      2010
          40       0.89      0.85      0.87      2026
          50       0.92      0.89      0.91      2029
          60       0.90      0.99      0.94      2059
        1140       0.81      0.90      0.85      2003
        1160       0.93      0.92      0.93      2059
        1180       0.96      0.98      0.97      2000
        1280       0.74      0.55      0.63      1970
        1281       0.85      0.83      0.84      2100
        1300       0.78      0.71      0.74      2099
        1301       0.97      0.97      0.97      1997
        1302       0.83      0.83      0.83      2104
        1320       0.89      0.74      0.81      2022
        1560       0.71      0.71      0.71      2031
        1920       0.90      0.85      0.87      2045
        1940       0.85      0.9