In [1]:
# Turn a text stored as a list of tokens into a single string.
def jonction(liste):
    """Concatenate the tokens of ``liste`` into one string.

    Every token is followed by a single space, so a non-empty input yields a
    string that ends with a trailing space and an empty input yields ``""``.

    Parameters
    ----------
    liste : iterable of str
        Tokens to concatenate, in order.

    Returns
    -------
    str
        The tokens joined together, each one followed by ``" "``.
    """
    # str.join builds the result in one pass instead of re-creating the
    # accumulated string on every iteration.
    return "".join(token + " " for token in liste)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from joblib import load,dump
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
import time
import os
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def test_model(df_post_traduction,colonne_a_vectoriser,max_features,k_best,joblib_path_suivi_metrique,liste_modele):
    """Fit and score several classifiers on a bag-of-words view of one text column.

    Steps: load the dataframe, join each token list of ``colonne_a_vectoriser``
    into a string, make a stratified 80/20 train/test split, vectorize with
    ``CountVectorizer``, standardize, keep ``k_best`` features with
    ``SelectKBest``, then fit every model and compute its metrics on the test
    split. The vectorizer, scaler and selector are fitted on the train split only.

    Parameters
    ----------
    df_post_traduction : str
        Path of a joblib file holding a DataFrame that contains
        ``colonne_a_vectoriser`` and the target column ``"prdtypecode"``.
    colonne_a_vectoriser : str
        Name of the column holding the token lists to vectorize.
    max_features : int
        ``max_features`` passed to ``CountVectorizer``.
    k_best : int
        ``k`` passed to ``SelectKBest``.
    joblib_path_suivi_metrique : str
        Path of the joblib file holding previously recorded scores. It is only
        read here (when it exists); writing the result back is up to the caller.
    liste_modele : iterable
        Estimators exposing ``fit`` and ``predict``. They are fitted in place.

    Returns
    -------
    pandas.DataFrame
        Previously recorded rows (if any) followed by one new row per model with
        the columns Max_features, K_best, Model, Accuracy, F1_weighted, F1_macro
        and "Duree en sec" (fit + predict wall time, in seconds).
    """
    score_columns = ["Max_features","K_best","Model", "Accuracy", "F1_weighted", "F1_macro", "Duree en sec"]

    # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
    df = load(df_post_traduction)
    # Features: token lists turned into strings (the loaded frame is left untouched).
    data = df[colonne_a_vectoriser].apply(jonction)
    target = df["prdtypecode"]
    # Stratified train/test split.
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42, stratify=target)

    # Vectorize on the max_features most frequent words. toarray() gives a plain
    # ndarray directly (todense() returns numpy.matrix, which then has to be
    # converted with np.asarray before every scikit-learn call).
    vectorizer = CountVectorizer(max_features=max_features)
    X_train = vectorizer.fit_transform(X_train).toarray()
    X_test = vectorizer.transform(X_test).toarray()

    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    sel = SelectKBest(k=k_best).fit(X_train, y_train)
    X_train = sel.transform(X_train)
    X_test = sel.transform(X_test)

    if os.path.exists(joblib_path_suivi_metrique):
        df_import = load(joblib_path_suivi_metrique)
        print("récupération du df existant")
    else:
        df_import = pd.DataFrame(columns=score_columns)
        print("création d'un dataframe")

    score = []
    for model in liste_modele:
        print("debut du modèle:",model)
        debut = time.perf_counter()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Stop the clock before scoring so the duration covers fit + predict only.
        duree = time.perf_counter() - debut
        print("fin du modèle:",model)
        score.append({
            "Max_features": f"{max_features}",
            "K_best": f"{k_best}",
            "Model": f"{model}",
            "Accuracy": accuracy_score(y_test, y_pred),
            "F1_weighted": f1_score(y_test, y_pred, average='weighted'),
            "F1_macro": f1_score(y_test, y_pred, average='macro'),
            "Duree en sec": duree})

    df_score = pd.DataFrame(score, columns=score_columns)
    # Concatenating empty frames is deprecated in recent pandas (their dtypes
    # will start to influence the result), so only non-empty frames are combined.
    frames = [frame for frame in (df_import, df_score) if not frame.empty]
    if not frames:
        return df_score
    return pd.concat(frames, ignore_index=True)


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Candidate classifiers. random_state is fixed at 23 wherever the estimator
# accepts one so that repeated runs are comparable.
# Earlier experiments, now removed from this cell, used class_weight="balanced"
# on lr, dt, svc, rdf and brdf (the SVC variant was marked as problematic), and
# LogisticRegression with default settings, then with max_iter=1000 only.
lr = LogisticRegression(random_state=23, max_iter=1000,solver='saga')
svc = svm.SVC(C=0.1,kernel = "linear", random_state=23)
knn = KNeighborsClassifier(n_jobs=-1)
dt = DecisionTreeClassifier(random_state=23)
rdf = RandomForestClassifier(n_jobs = -1,random_state = 23)
gbc = GradientBoostingClassifier(random_state = 23)
brdf = BalancedRandomForestClassifier(random_state = 23)

In [None]:
# Run configuration: input dataframe, column to vectorize, vocabulary size,
# number of selected features, score-history file and models to evaluate.
df_post_traduction = r"C:\Users\franc\AutoML\df_train_post_trad.joblib"
colonne_a_vectoriser = "mots_stem_sans_chiffres"
max_features = 4000
k_best = 3000
joblib_path_suivi_metrique = r"C:\Users\franc\AutoML\df_score.joblib"
liste_modele = [lr,svc]
df_score = test_model(df_post_traduction,colonne_a_vectoriser,max_features,k_best,joblib_path_suivi_metrique,liste_modele)
# Write back to the same file the history was read from; reusing the variable
# keeps the read and write targets from drifting apart.
dump(df_score,joblib_path_suivi_metrique)

récupération du df existant
debut du modèle: LogisticRegression(max_iter=1000, random_state=23, solver='saga')


In [12]:
# Reload the score table persisted with joblib and show it as the cell output.
df= load(r"C:\Users\franc\AutoML\df_score.joblib")
df

Unnamed: 0,Max_features,K_best,Model,Accuracy,F1_weighted,F1_macro,Duree en sec
0,4000,3000,"LogisticRegression(class_weight='balanced', ra...",0.737689,0.742634,0.713987,33.074111
1,4000,3000,KNeighborsClassifier(n_jobs=-1),0.57356,0.580204,0.539838,45.720312
2,4000,3000,DecisionTreeClassifier(class_weight='balanced'...,0.699067,0.702138,0.673113,83.353033
3,4000,3000,RandomForestClassifier(class_weight='balanced'...,0.772136,0.772812,0.750719,77.130688
4,4000,3000,BalancedRandomForestClassifier(class_weight='b...,0.709382,0.718596,0.679694,145.13595
5,4000,3000,LogisticRegression(random_state=23),0.744013,0.746604,0.726534,33.781977
6,4000,3000,DecisionTreeClassifier(random_state=23),0.706803,0.708297,0.679179,80.784523
7,4000,3000,"RandomForestClassifier(n_jobs=-1, random_state...",0.768758,0.767993,0.74748,78.34878
8,4000,3000,BalancedRandomForestClassifier(random_state=23),0.737075,0.74137,0.712791,136.097793
9,4000,3000,"LogisticRegression(max_iter=1000, random_state...",0.729277,0.731983,0.706516,318.753879
