In [10]:
#transformation du texte sous forme de liste en string
def jonction(liste):
  """Concatenate the tokens of ``liste`` into a single string.

  Every token is followed by one space, so a non-empty input produces a
  string that ends with a trailing space; an empty input produces ``""``.
  """
  return "".join(token + " " for token in liste)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from joblib import load,dump
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
import time
import os
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def optimisation_model(df_post_traduction,colonne_a_vectoriser,max_features,k_best,joblib_path_suivi_metrique,grid_search_cv):
    """Tune a classifier on bag-of-words features and log its held-out scores.

    Steps: load the dataset, join the token lists of ``colonne_a_vectoriser``
    into strings, make a stratified 80/20 split, vectorise with
    ``CountVectorizer``, standardise, keep ``k_best`` features with
    ``SelectKBest``, fit ``grid_search_cv`` on the training part and score the
    best model on the test part.

    Parameters
    ----------
    df_post_traduction : str
        Path of a joblib file containing a DataFrame with the column
        ``colonne_a_vectoriser`` and a ``"prdtypecode"`` target column.
    colonne_a_vectoriser : str
        Name of the column whose cells are iterables of string tokens.
    max_features : int
        Vocabulary size passed to ``CountVectorizer``.
    k_best : int
        Number of features kept by ``SelectKBest``.
    joblib_path_suivi_metrique : str
        Path of the joblib file holding the metrics-tracking DataFrame. It is
        read if it exists; this function never writes to it.
    grid_search_cv : scikit-learn search object (e.g. ``GridSearchCV``)
        Unfitted search; it is fitted in place on the training data.

    Returns
    -------
    pandas.DataFrame
        The previous tracking rows (empty frame if the file did not exist)
        followed by one new row describing this run.
    """
    # Local import: only needed by the refit=False fallback further down.
    from sklearn.base import clone

    # NOTE(review): joblib.load unpickles arbitrary objects -- only use trusted files.
    df = load(df_post_traduction)
    # Turn each token list of the feature column into one string.
    df[colonne_a_vectoriser] = df[colonne_a_vectoriser].apply(jonction)

    # Features / target, then a stratified train/test split.
    data = df[colonne_a_vectoriser]
    target = df["prdtypecode"]
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=42, stratify=target
    )

    # Bag of words limited to the `max_features` most frequent terms.
    # .toarray() gives a plain ndarray; .todense() gives np.matrix, which is
    # why every later call used to need an np.asarray() wrapper.
    vectorizer = CountVectorizer(max_features=max_features)
    X_train = vectorizer.fit_transform(X_train).toarray()
    X_test = vectorizer.transform(X_test).toarray()

    # Standardise, fitting the scaler on the training part only.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Univariate feature selection, also fitted on the training part only.
    sel = SelectKBest(k=k_best)
    sel.fit(X_train, y_train)
    X_train = sel.transform(X_train)
    X_test = sel.transform(X_test)

    # Tracking table: continue the existing one or start an empty one.
    if os.path.exists(joblib_path_suivi_metrique):
        df_import = load(joblib_path_suivi_metrique)
        print("récupération du df existant")
    else:
        df_import = pd.DataFrame(columns=["Max_features","K_best","Grid","Hyperparamètres","Best_param", "Accuracy", "F1_weighted", "F1_macro", "Duree en sec"])
        print("création d'un dataframe")

    print("debut du grid:",grid_search_cv)
    debut = time.time()
    grid_search_cv.fit(X_train, y_train)
    print("fin du grid:",grid_search_cv)
    print("debut meilleur param:",grid_search_cv.best_params_)
    best_params = grid_search_cv.best_params_

    # Use the model the search itself refitted. Rebuilding a
    # RandomForestClassifier from best_params alone tied this function to one
    # estimator class and dropped every base-estimator setting that was not
    # part of the grid.
    best_model = getattr(grid_search_cv, "best_estimator_", None)
    if best_model is None:
        # Search created with refit=False: refit a copy of its own estimator.
        best_model = clone(grid_search_cv.estimator).set_params(**best_params)
        best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    print("fin meilleur param")

    accuracy = accuracy_score(y_test, y_pred)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    duree = time.time() - debut

    # Log the space that was actually searched, taken from the search object
    # rather than from a notebook-level global that may have changed since.
    espace_recherche = getattr(
        grid_search_cv,
        "param_grid",
        getattr(grid_search_cv, "param_distributions", None),
    )
    model_scores = {
            "Max_features": f"{max_features}",
            "K_best":f"{k_best}",
            "Grid": f"{grid_search_cv}",
            "Hyperparamètres":f"{espace_recherche}",
            "Best_param":f"{best_params}",
            "Accuracy": accuracy,
            "F1_weighted": f1_weighted,
            "F1_macro": f1_macro,
            "Duree en sec": duree}
    df_score = pd.DataFrame([model_scores])
    return pd.concat([df_import, df_score], ignore_index=True)

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Base forest handed to the grid search (fixed seed, all CPU cores).
rdf = RandomForestClassifier(random_state=23, n_jobs=-1)

# Search space: only the number of trees varies; every other setting is
# pinned to a single value.
param_grid = dict(
    n_estimators=[50, 100, 200, 300],  # number of trees in the forest
    criterion=["log_loss"],  # split-quality measure
    max_depth=[None],  # maximum depth of each tree
    min_samples_split=[5],  # minimum samples required to split a node
    min_samples_leaf=[1],  # minimum samples required at a leaf
    max_features=['log2'],  # number of features considered per split
    bootstrap=[False],  # whether bootstrap sampling is used
    class_weight=['balanced'],  # class weights correcting the imbalance
    random_state=[23],  # seed for reproducibility
    n_jobs=[-1],  # use every CPU core
)

# 5-fold grid search ranked by weighted F1.
gcv = GridSearchCV(rdf, param_grid, scoring="f1_weighted", n_jobs=-1, cv=5)




In [None]:
# Exhaustive search space kept for reference (2592 combinations, so very
# costly with 5-fold CV). It is bound to its own name so that running this
# cell cannot overwrite the `param_grid` used by the grid search.
param_grid_exhaustif = {
    'n_estimators': [100, 200, 300],  # number of trees
    'criterion': ['gini', 'entropy'],  # split-quality measure
    'max_depth': [None, 10, 20, 30],  # maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # minimum samples required at a leaf
    # 'auto' removed: recent scikit-learn releases reject it for
    # RandomForestClassifier, where it was an alias of 'sqrt'.
    'max_features': ['sqrt', 'log2'],  # number of features considered per split
    'bootstrap': [True, False],  # whether bootstrap sampling is used
    'class_weight': [None, 'balanced', 'balanced_subsample'],  # class weights correcting the imbalance
    'random_state': [23],  # seed for reproducibility
    'n_jobs': [-1],  # use every CPU core
}

In [19]:
# Run configuration: input dataset, tracking file, text column, feature sizes.
df_post_traduction = r"C:\Users\franc\AutoML\df_train_post_trad.joblib"
joblib_path_suivi_metrique = r"C:\Users\franc\AutoML\df_optimisation.joblib"
colonne_a_vectoriser = "mots_stem_sans_chiffres"
max_features, k_best = 4000, 3000
grid_search_cv = gcv

# Run the search and append its scores to the tracking table.
df_optimisation = optimisation_model(
    df_post_traduction=df_post_traduction,
    colonne_a_vectoriser=colonne_a_vectoriser,
    max_features=max_features,
    k_best=k_best,
    joblib_path_suivi_metrique=joblib_path_suivi_metrique,
    grid_search_cv=grid_search_cv,
)
# Save the updated table back to the tracking file; the list of written
# files returned by dump is the cell output.
dump(df_optimisation, joblib_path_suivi_metrique)

récupération du df existant
debut du grid: GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1, random_state=23),
             n_jobs=-1,
             param_grid={'bootstrap': [False], 'class_weight': ['balanced'],
                         'criterion': ['log_loss'], 'max_depth': [None],
                         'max_features': ['log2'], 'min_samples_leaf': [1],
                         'min_samples_split': [5],
                         'n_estimators': [50, 100, 200, 300], 'n_jobs': [-1],
                         'random_state': [23]},
             scoring='f1_weighted')
fin du grid: GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1, random_state=23),
             n_jobs=-1,
             param_grid={'bootstrap': [False], 'class_weight': ['balanced'],
                         'criterion': ['log_loss'], 'max_depth': [None],
                         'max_features': ['log2'], 'min_samples_leaf': [1],
                         'min_samples_split': [5],
                  

['C:\\Users\\franc\\AutoML\\df_optimisation.joblib']

In [12]:
# Reload the metrics-tracking table from its joblib file and display it.
joblib_path_suivi_metrique = r"C:\Users\franc\AutoML\df_optimisation.joblib"
df_optimisation = load(joblib_path_suivi_metrique)
# Bare last expression: rendered as the cell output.
df_optimisation

Unnamed: 0,Max_features,K_best,Grid,Hyperparamètres,Best_param,Accuracy,F1_weighted,F1_macro,Duree en sec
0,4000,3000,"GridSearchCV(cv=5, estimator=RandomForestClass...","{'class_weight': ['balanced'], 'random_state':...","{'class_weight': 'balanced', 'criterion': 'gin...",0.772136,0.772812,0.750719,1072.721818
1,4000,3000,"GridSearchCV(cv=5, estimator=RandomForestClass...","{'class_weight': ['balanced'], 'random_state':...","{'class_weight': 'balanced', 'criterion': 'gin...",0.772688,0.773371,0.750832,1872.766705
2,4000,3000,"GridSearchCV(cv=5, estimator=RandomForestClass...","{'class_weight': ['balanced'], 'random_state':...","{'class_weight': 'balanced', 'criterion': 'gin...",0.772136,0.772812,0.750719,1176.132017
3,4000,3000,"GridSearchCV(cv=5, estimator=RandomForestClass...","{'class_weight': ['balanced'], 'random_state':...","{'class_weight': 'balanced', 'criterion': 'gin...",0.779381,0.780637,0.757437,1647.827343
4,4000,3000,"GridSearchCV(cv=5, estimator=RandomForestClass...","{'class_weight': ['balanced'], 'random_state':...","{'class_weight': 'balanced', 'criterion': 'gin...",0.779381,0.780637,0.757437,1703.849127
5,4000,3000,"GridSearchCV(cv=5, estimator=RandomForestClass...","{'class_weight': ['balanced'], 'random_state':...","{'class_weight': 'balanced', 'criterion': 'gin...",0.782758,0.782434,0.764278,773.802725
6,4000,3000,"GridSearchCV(cv=5, estimator=RandomForestClass...","{'class_weight': ['balanced'], 'random_state':...","{'bootstrap': False, 'class_weight': 'balanced...",0.782881,0.782675,0.764457,714.752244
7,4000,3000,"GridSearchCV(cv=5, estimator=RandomForestClass...","{'class_weight': [None, 'balanced', 'balanced_...","{'bootstrap': False, 'class_weight': 'balanced...",0.782881,0.782675,0.764457,1777.47843
8,4000,3000,"GridSearchCV(cv=5, estimator=RandomForestClass...","{'n_estimators': [100, 200, 300], 'criterion':...","{'criterion': 'gini', 'n_estimators': 300}",0.772013,0.771371,0.750619,5582.291008
9,4000,3000,"GridSearchCV(cv=5, estimator=RandomForestClass...","{'n_estimators': [50, 100, 200, 300], 'criteri...","{'bootstrap': False, 'class_weight': 'balanced...",0.775636,0.775725,0.75755,2151.491883


In [18]:
# Hyper-parameter sets of the run(s) with the highest weighted F1
# (every tied row is returned).
df_optimisation.loc[
    df_optimisation["F1_weighted"].eq(df_optimisation["F1_weighted"].max()),
    "Best_param",
].to_numpy()

array(["{'bootstrap': False, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100, 'n_jobs': -1, 'random_state': 23}",
       "{'bootstrap': False, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100, 'n_jobs': -1, 'random_state': 23}"],
      dtype=object)