# 3. Modeling avec GridSearchCV

Ce notebook entra√Æne plusieurs mod√®les de classification, optimise leurs hyperparam√®tres avec GridSearchCV, compare leurs performances et s√©lectionne automatiquement le meilleur mod√®le

## Import des biblioth√®ques

In [3]:
import pandas as pd
import numpy as np
import pickle
import time
from datetime import datetime

# Mod√®les de classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# GridSearchCV pour optimisation
from sklearn.model_selection import GridSearchCV

# M√©triques
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    classification_report,
    confusion_matrix,
    make_scorer
)

import warnings
warnings.filterwarnings('ignore')

print("Biblioth√®ques import√©es avec succ√®s")

Biblioth√®ques import√©es avec succ√®s


## Chargement des features

In [4]:
# Charger les ensembles de features
with open('../data/pkl/feature_sets.pkl', 'rb') as f:
    feature_sets = pickle.load(f)

# Extraire les features et labels
X_train_tfidf = feature_sets['X_train_tfidf']
X_test_tfidf = feature_sets['X_test_tfidf']
X_train_count = feature_sets['X_train_count']
X_test_count = feature_sets['X_test_count']
X_train_svd = feature_sets['X_train_svd']
X_test_svd = feature_sets['X_test_svd']
X_train_combined = feature_sets['X_train_combined']
X_test_combined = feature_sets['X_test_combined']

y_train = feature_sets['y_train']
y_test = feature_sets['y_test']
label_encoder = feature_sets['label_encoder']

print("Features charg√©es avec succ√®s")
print(f"Nombre de classes: {len(label_encoder.classes_)}")

Features charg√©es avec succ√®s
Nombre de classes: 119


## D√©finition des mod√®les (sans Gradient Boosting)

In [5]:
# D√©finir les mod√®les de base (sans optimisation)
base_models = {
    'Logistic_Regression': LogisticRegression(
        max_iter=1000, 
        random_state=42,
        n_jobs=-1,
        solver='saga'
    ),
    'Multinomial_NB': MultinomialNB(),
    'Linear_SVC': LinearSVC(
        max_iter=1000, 
        random_state=42
    ),
    'Random_Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        random_state=42,
        n_jobs=-1
    ),
    'KNN': KNeighborsClassifier(
        n_neighbors=5,
        n_jobs=-1
    ),
    'Decision_Tree': DecisionTreeClassifier(
        max_depth=20,
        random_state=42
    )
}

print(f"Nombre de mod√®les d√©finis: {len(base_models)}")
print(f"Mod√®les: {list(base_models.keys())}")

Nombre de mod√®les d√©finis: 6
Mod√®les: ['Logistic_Regression', 'Multinomial_NB', 'Linear_SVC', 'Random_Forest', 'KNN', 'Decision_Tree']


## D√©finition des grilles d'hyperparam√®tres pour GridSearchCV

In [6]:
# Grilles d'hyperparam√®tres pour chaque mod√®le
param_grids = {
    'Logistic_Regression': {
        'C': [0.1, 1, 10, 100],
        'solver': ['saga', 'liblinear'],
        'max_iter': [1000, 2000]
    },
    'Multinomial_NB': {
        'alpha': [0.01, 0.1, 0.5, 1.0, 2.0]
    },
    'Linear_SVC': {
        'C': [0.1, 1, 10, 100],
        'max_iter': [1000, 2000]
    },
    'Random_Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5]
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    },
    'Decision_Tree': {
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10]
    }
}

print("Grilles d'hyperparam√®tres d√©finies:")
for model_name, params in param_grids.items():
    n_combinations = np.prod([len(v) for v in params.values()])
    print(f"  {model_name}: {n_combinations} combinaisons")

Grilles d'hyperparam√®tres d√©finies:
  Logistic_Regression: 16 combinaisons
  Multinomial_NB: 5 combinaisons
  Linear_SVC: 8 combinaisons
  Random_Forest: 18 combinaisons
  KNN: 8 combinaisons
  Decision_Tree: 12 combinaisons


In [7]:
# D√©finir les configurations de features
feature_configurations = {
    'TF-IDF': (X_train_tfidf, X_test_tfidf),
    'Count': (X_train_count, X_test_count),
    'SVD': (X_train_svd, X_test_svd),
    'Combined': (X_train_combined, X_test_combined)
}

# Mod√®les √† ignorer pour Combined (n√©cessitent normalisation)
skip_combined = ['Logistic_Regression', 'Linear_SVC', 'KNN']

print(f"\nNombre de configurations de features: {len(feature_configurations)}")
print(f"Configurations: {list(feature_configurations.keys())}")
print(f"\nMod√®les ignor√©s pour 'Combined': {skip_combined}")


Nombre de configurations de features: 4
Configurations: ['TF-IDF', 'Count', 'SVD', 'Combined']

Mod√®les ignor√©s pour 'Combined': ['Logistic_Regression', 'Linear_SVC', 'KNN']


## Fonction d'√©valuation avec GridSearchCV

In [8]:
def evaluate_model_with_gridsearch(model, param_grid, X_train, X_test, y_train, y_test, model_name):
    """
    Entra√Æne et √©value un mod√®le avec GridSearchCV
    Retourne les m√©triques, le meilleur mod√®le et les meilleurs param√®tres
    """
    print(f"    üîç GridSearchCV en cours...")
    
    # D√©finir le scorer
    scorer = make_scorer(f1_score, average='weighted', zero_division=0)
    
    # GridSearchCV
    start_time = time.time()
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=3,  # 3-fold cross-validation
        scoring=scorer,
        n_jobs=-1,
        verbose=0
    )
    
    grid_search.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    # Meilleur mod√®le
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_cv_score = grid_search.best_score_
    
    # Pr√©dictions sur le test
    start_time = time.time()
    y_pred = best_model.predict(X_test)
    prediction_time = time.time() - start_time
    
    # Calcul des m√©triques
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision_weighted': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'recall_weighted': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'f1_weighted': f1_score(y_test, y_pred, average='weighted', zero_division=0),
        'precision_macro': precision_score(y_test, y_pred, average='macro', zero_division=0),
        'recall_macro': recall_score(y_test, y_pred, average='macro', zero_division=0),
        'f1_macro': f1_score(y_test, y_pred, average='macro', zero_division=0),
        'training_time': training_time,
        'prediction_time': prediction_time,
        'best_cv_score': best_cv_score
    }
    
    return metrics, best_model, y_pred, best_params

print("Fonction d'√©valuation avec GridSearchCV d√©finie")

Fonction d'√©valuation avec GridSearchCV d√©finie


## Entra√Ænement et √©valuation de tous les mod√®les avec GridSearchCV

In [9]:
# Dictionnaire pour stocker tous les r√©sultats
all_results = []
trained_models = {}
best_params_dict = {}

print("="*80)
print("D√âBUT DE L'ENTRA√éNEMENT DES MOD√àLES AVEC GRIDSEARCHCV")
print("="*80)
print("\n‚ö†Ô∏è  NOTE: GridSearchCV prend plus de temps mais optimise les hyperparam√®tres")
print("    Chaque mod√®le teste plusieurs combinaisons de param√®tres avec CV 3-fold\n")

total_combinations = sum(
    1 for feature_name in feature_configurations 
    for model_name in base_models 
    if not (feature_name == 'Combined' and model_name in skip_combined)
)
current_combination = 0

# Boucle sur toutes les combinaisons mod√®le-features
for feature_name, (X_train_feat, X_test_feat) in feature_configurations.items():
    print(f"\n{'='*80}")
    print(f"CONFIGURATION DE FEATURES: {feature_name}")
    print(f"{'='*80}")
    
    for model_name, model in base_models.items():
        
        # Ignorer Combined pour certains mod√®les
        if feature_name == 'Combined' and model_name in skip_combined:
            print(f"\n‚è≠Ô∏è  [{model_name}] Ignor√© pour {feature_name} (n√©cessite normalisation)")
            continue
        
        # Ignorer SVD pour Multinomial_NB (valeurs n√©gatives)
        if feature_name == 'SVD' and model_name == 'Multinomial_NB':
            print(f"\n‚è≠Ô∏è  [{model_name}] Ignor√© pour {feature_name} (valeurs n√©gatives)")
            continue
        
        current_combination += 1
        print(f"\n[{current_combination}/{total_combinations}] Entra√Ænement: {model_name} avec {feature_name}")
        
        try:
            # √âvaluer le mod√®le avec GridSearchCV
            metrics, trained_model, y_pred, best_params = evaluate_model_with_gridsearch(
                model, param_grids[model_name], X_train_feat, X_test_feat, y_train, y_test, model_name
            )
            
            # Stocker les r√©sultats
            result = {
                'model_name': model_name,
                'feature_config': feature_name,
                'combination': f"{model_name}_{feature_name}",
                **metrics
            }
            all_results.append(result)
            
            # Stocker le mod√®le entra√Æn√©
            model_key = f"{model_name}_{feature_name}"
            trained_models[model_key] = {
                'model': trained_model,
                'metrics': metrics,
                'predictions': y_pred
            }
            
            # Stocker les meilleurs param√®tres
            best_params_dict[model_key] = best_params
            
            # Afficher les r√©sultats
            print(f"    ‚úì Meilleurs param√®tres: {best_params}")
            print(f"    ‚úì CV Score: {metrics['best_cv_score']:.4f}")
            print(f"    ‚úì Test Accuracy: {metrics['accuracy']:.4f}")
            print(f"    ‚úì Test F1-Score (weighted): {metrics['f1_weighted']:.4f}")
            print(f"    ‚úì Test F1-Score (macro): {metrics['f1_macro']:.4f}")
            print(f"    ‚úì Temps d'entra√Ænement: {metrics['training_time']:.2f}s")
            
        except Exception as e:
            print(f"    ‚ùå ERREUR: {str(e)}")
            continue

print(f"\n{'='*80}")
print("ENTRA√éNEMENT TERMIN√â")
print(f"{'='*80}")

D√âBUT DE L'ENTRA√éNEMENT DES MOD√àLES AVEC GRIDSEARCHCV

‚ö†Ô∏è  NOTE: GridSearchCV prend plus de temps mais optimise les hyperparam√®tres
    Chaque mod√®le teste plusieurs combinaisons de param√®tres avec CV 3-fold


CONFIGURATION DE FEATURES: TF-IDF

[1/21] Entra√Ænement: Logistic_Regression avec TF-IDF
    üîç GridSearchCV en cours...
    ‚úì Meilleurs param√®tres: {'C': 100, 'max_iter': 1000, 'solver': 'saga'}
    ‚úì CV Score: 0.7213
    ‚úì Test Accuracy: 0.7337
    ‚úì Test F1-Score (weighted): 0.7340
    ‚úì Test F1-Score (macro): 0.7265
    ‚úì Temps d'entra√Ænement: 67.58s

[2/21] Entra√Ænement: Multinomial_NB avec TF-IDF
    üîç GridSearchCV en cours...
    ‚úì Meilleurs param√®tres: {'alpha': 0.1}
    ‚úì CV Score: 0.6191
    ‚úì Test Accuracy: 0.6443
    ‚úì Test F1-Score (weighted): 0.6330
    ‚úì Test F1-Score (macro): 0.6444
    ‚úì Temps d'entra√Ænement: 0.29s

[3/21] Entra√Ænement: Linear_SVC avec TF-IDF
    üîç GridSearchCV en cours...
    ‚úì Meilleurs param√®t

## Comparaison des performances

In [10]:
# Cr√©er un DataFrame avec tous les r√©sultats
results_df = pd.DataFrame(all_results)

# Trier par F1-score weighted (m√©trique principale)
results_df = results_df.sort_values('f1_weighted', ascending=False)

print("\n" + "="*80)
print("COMPARAISON DES PERFORMANCES (Top 10)")
print("="*80)
print(results_df[[
    'combination', 'accuracy', 'f1_weighted', 'f1_macro', 'best_cv_score'
]].head(10).to_string(index=False))


COMPARAISON DES PERFORMANCES (Top 10)
               combination  accuracy  f1_weighted  f1_macro  best_cv_score
       Random_Forest_Count  0.756098     0.744871  0.748154       0.705734
 Logistic_Regression_Count  0.739837     0.737932  0.728253       0.726695
Logistic_Regression_TF-IDF  0.733740     0.733996  0.726542       0.721303
    Random_Forest_Combined  0.747967     0.733599  0.728592       0.715459
   Logistic_Regression_SVD  0.733740     0.733250  0.733212       0.708703
          Linear_SVC_Count  0.731707     0.727082  0.702039       0.726928
      Random_Forest_TF-IDF  0.737805     0.721408  0.699610       0.695688
            Linear_SVC_SVD  0.727642     0.720357  0.713030       0.700445
         Linear_SVC_TF-IDF  0.723577     0.719650  0.712294       0.723389
    Decision_Tree_Combined  0.713415     0.709899  0.701447       0.605057


## S√©lection automatique du meilleur mod√®le

In [11]:
# S√©lectionner le meilleur mod√®le bas√© sur le F1-score weighted
best_result = results_df.iloc[0]
best_model_key = best_result['combination']
best_model_info = trained_models[best_model_key]
best_params = best_params_dict[best_model_key]

print("="*80)
print("MEILLEUR MOD√àLE S√âLECTIONN√â AUTOMATIQUEMENT (OPTIMIS√â AVEC GRIDSEARCHCV)")
print("="*80)
print(f"\nCombination: {best_result['combination']}")
print(f"Mod√®le: {best_result['model_name']}")
print(f"Features: {best_result['feature_config']}")
print(f"\nMEILLEURS HYPERPARAM√àTRES:")
for param, value in best_params.items():
    print(f"  {param}: {value}")
print(f"\nM√âTRIQUES DE PERFORMANCE:")
print(f"  CV Score (3-fold): {best_result['best_cv_score']:.4f}")
print(f"  Test Accuracy: {best_result['accuracy']:.4f}")
print(f"  Test Precision (weighted): {best_result['precision_weighted']:.4f}")
print(f"  Test Recall (weighted): {best_result['recall_weighted']:.4f}")
print(f"  Test F1-Score (weighted): {best_result['f1_weighted']:.4f}")
print(f"  Test F1-Score (macro): {best_result['f1_macro']:.4f}")
print(f"\nTEMPS:")
print(f"  Entra√Ænement (avec GridSearch): {best_result['training_time']:.2f}s")
print(f"  Pr√©diction: {best_result['prediction_time']:.4f}s")

MEILLEUR MOD√àLE S√âLECTIONN√â AUTOMATIQUEMENT (OPTIMIS√â AVEC GRIDSEARCHCV)

Combination: Random_Forest_Count
Mod√®le: Random_Forest
Features: Count

MEILLEURS HYPERPARAM√àTRES:
  max_depth: 30
  min_samples_split: 2
  n_estimators: 200

M√âTRIQUES DE PERFORMANCE:
  CV Score (3-fold): 0.7057
  Test Accuracy: 0.7561
  Test Precision (weighted): 0.7734
  Test Recall (weighted): 0.7561
  Test F1-Score (weighted): 0.7449
  Test F1-Score (macro): 0.7482

TEMPS:
  Entra√Ænement (avec GridSearch): 10.51s
  Pr√©diction: 0.1445s


## Rapport d√©taill√© pour le meilleur mod√®le

In [12]:
# G√©n√©rer le rapport de classification pour le meilleur mod√®le
best_predictions = best_model_info['predictions']

print("\n" + "="*80)
print("RAPPORT DE CLASSIFICATION D√âTAILL√â (Meilleur Mod√®le Optimis√©)")
print("="*80)

# Ne garder que les classes r√©ellement pr√©sentes dans y_test ou dans les pr√©dictions
present_labels = np.unique(np.concatenate([y_test.values, best_predictions]))
target_names = label_encoder.inverse_transform(present_labels)

print(classification_report(
    y_test,
    best_predictions,
    labels=present_labels,
    target_names=target_names,
    zero_division=0
))


RAPPORT DE CLASSIFICATION D√âTAILL√â (Meilleur Mod√®le Optimis√©)
                                              precision    recall  f1-score   support

                               AI Researcher       0.80      0.80      0.80         5
                        Academic Coordinator       1.00      1.00      1.00         3
                           Account Executive       0.67      1.00      0.80         2
                             Account Manager       1.00      1.00      1.00         2
                                  Accountant       0.50      0.88      0.64         8
                    Administrative Assistant       1.00      1.00      1.00         3
                                Art Director       1.00      0.67      0.80         3
                                    Attorney       1.00      1.00      1.00         4
                                  BI Analyst       0.83      1.00      0.91         5
                           Backend Developer       0.45      0.42      0

## Tableau des meilleurs hyperparam√®tres par mod√®le

In [13]:
print("\n" + "="*80)
print("MEILLEURS HYPERPARAM√àTRES TROUV√âS PAR GRIDSEARCHCV")
print("="*80)

for model_key, params in best_params_dict.items():
    model_result = results_df[results_df['combination'] == model_key].iloc[0]
    print(f"\n{model_key}:")
    print(f"  F1-Score: {model_result['f1_weighted']:.4f}")
    print(f"  Param√®tres:")
    for param, value in params.items():
        print(f"    - {param}: {value}")


MEILLEURS HYPERPARAM√àTRES TROUV√âS PAR GRIDSEARCHCV

Logistic_Regression_TF-IDF:
  F1-Score: 0.7340
  Param√®tres:
    - C: 100
    - max_iter: 1000
    - solver: saga

Multinomial_NB_TF-IDF:
  F1-Score: 0.6330
  Param√®tres:
    - alpha: 0.1

Linear_SVC_TF-IDF:
  F1-Score: 0.7197
  Param√®tres:
    - C: 10
    - max_iter: 1000

Random_Forest_TF-IDF:
  F1-Score: 0.7214
  Param√®tres:
    - max_depth: 30
    - min_samples_split: 2
    - n_estimators: 200

KNN_TF-IDF:
  F1-Score: 0.6318
  Param√®tres:
    - n_neighbors: 9
    - weights: distance

Decision_Tree_TF-IDF:
  F1-Score: 0.6425
  Param√®tres:
    - max_depth: None
    - min_samples_split: 5

Logistic_Regression_Count:
  F1-Score: 0.7379
  Param√®tres:
    - C: 10
    - max_iter: 1000
    - solver: liblinear

Multinomial_NB_Count:
  F1-Score: 0.6416
  Param√®tres:
    - alpha: 1.0

Linear_SVC_Count:
  F1-Score: 0.7271
  Param√®tres:
    - C: 0.1
    - max_iter: 1000

Random_Forest_Count:
  F1-Score: 0.7449
  Param√®tres:
    - 

## Analyse par type de mod√®le

In [14]:
# Performance moyenne par mod√®le
print("\n" + "="*80)
print("PERFORMANCE MOYENNE PAR TYPE DE MOD√àLE (APR√àS OPTIMISATION)")
print("="*80)

model_performance = results_df.groupby('model_name')[[
    'accuracy', 'f1_weighted', 'f1_macro'
]].agg(['mean', 'max']).round(4)

print(model_performance)


PERFORMANCE MOYENNE PAR TYPE DE MOD√àLE (APR√àS OPTIMISATION)
                    accuracy         f1_weighted         f1_macro        
                        mean     max        mean     max     mean     max
model_name                                                               
Decision_Tree         0.6260  0.7134      0.6214  0.7099   0.6156  0.7014
KNN                   0.6104  0.6504      0.6001  0.6318   0.6138  0.6334
Linear_SVC            0.7276  0.7317      0.7224  0.7271   0.7091  0.7130
Logistic_Regression   0.7358  0.7398      0.7351  0.7379   0.7293  0.7332
Multinomial_NB        0.6457  0.6565      0.6340  0.6416   0.6422  0.6532
Random_Forest         0.7292  0.7561      0.7170  0.7449   0.7130  0.7482


In [15]:
# Performance moyenne par configuration de features
print("\n" + "="*80)
print("PERFORMANCE MOYENNE PAR CONFIGURATION DE FEATURES")
print("="*80)

feature_performance = results_df.groupby('feature_config')[[
    'accuracy', 'f1_weighted', 'f1_macro'
]].agg(['mean', 'max']).round(4)

print(feature_performance)


PERFORMANCE MOYENNE PAR CONFIGURATION DE FEATURES
               accuracy         f1_weighted         f1_macro        
                   mean     max        mean     max     mean     max
feature_config                                                      
Combined         0.6992  0.7480      0.6903  0.7336   0.6944  0.7286
Count            0.6880  0.7561      0.6808  0.7449   0.6767  0.7482
SVD              0.6427  0.7337      0.6380  0.7333   0.6418  0.7332
TF-IDF           0.6900  0.7378      0.6804  0.7340   0.6742  0.7265


## Comparaison: Am√©lioration apport√©e par GridSearchCV

In [16]:
print("\n" + "="*80)
print("AM√âLIORATION CV vs TEST")
print("="*80)

# Comparer CV score et test score
comparison_df = results_df[['combination', 'best_cv_score', 'f1_weighted']].copy()
comparison_df['difference'] = comparison_df['f1_weighted'] - comparison_df['best_cv_score']
comparison_df = comparison_df.sort_values('f1_weighted', ascending=False)

print("\nTop 10 mod√®les - Comparaison CV vs Test:")
print(comparison_df.head(10).to_string(index=False))

print("\nüìä Statistiques:")
print(f"  Diff√©rence moyenne (Test - CV): {comparison_df['difference'].mean():.4f}")
print(f"  Diff√©rence m√©diane: {comparison_df['difference'].median():.4f}")
print(f"  √âcart-type: {comparison_df['difference'].std():.4f}")


AM√âLIORATION CV vs TEST

Top 10 mod√®les - Comparaison CV vs Test:
               combination  best_cv_score  f1_weighted  difference
       Random_Forest_Count       0.705734     0.744871    0.039138
 Logistic_Regression_Count       0.726695     0.737932    0.011237
Logistic_Regression_TF-IDF       0.721303     0.733996    0.012693
    Random_Forest_Combined       0.715459     0.733599    0.018140
   Logistic_Regression_SVD       0.708703     0.733250    0.024547
          Linear_SVC_Count       0.726928     0.727082    0.000154
      Random_Forest_TF-IDF       0.695688     0.721408    0.025720
            Linear_SVC_SVD       0.700445     0.720357    0.019912
         Linear_SVC_TF-IDF       0.723389     0.719650   -0.003739
    Decision_Tree_Combined       0.605057     0.709899    0.104842

üìä Statistiques:
  Diff√©rence moyenne (Test - CV): 0.0244
  Diff√©rence m√©diane: 0.0205
  √âcart-type: 0.0233


## Sauvegarde des r√©sultats et mod√®les

In [18]:
# Sauvegarder tous les r√©sultats
modeling_results = {
    'all_results': results_df,
    'trained_models': trained_models,
    'best_params': best_params_dict,
    'best_model_key': best_model_key,
    'best_model': best_model_info['model'],
    'best_metrics': best_model_info['metrics'],
    'best_predictions': best_predictions,
    'best_hyperparams': best_params,
    'label_encoder': label_encoder,
    'y_test': y_test,
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'gridsearch_used': True
}

with open('../data/pkl/modeling_results_gridsearch.pkl', 'wb') as f:
    pickle.dump(modeling_results, f)

print("R√©sultats du modeling avec GridSearch sauvegard√©s: pkl/modeling_results_gridsearch.pkl")

R√©sultats du modeling avec GridSearch sauvegard√©s: pkl/modeling_results_gridsearch.pkl


## R√©sum√© du Modeling avec GridSearchCV

In [19]:
print("\n" + "="*80)
print("R√âSUM√â DU MODELING AVEC GRIDSEARCHCV")
print("="*80)

print(f"\n1. MOD√àLES ENTRA√éN√âS:")
print(f"   - Nombre de types de mod√®les: {len(base_models)}")
print(f"   - Nombre de configurations de features: {len(feature_configurations)}")
print(f"   - Total de combinaisons test√©es: {len(results_df)}")
print(f"   - Mod√®les ignor√©s pour 'Combined': {len(skip_combined)}")

print(f"\n2. OPTIMISATION GRIDSEARCHCV:")
print(f"   - Cross-validation: 3-fold")
print(f"   - M√©trique d'optimisation: F1-Score weighted")
print(f"   - Hyperparam√®tres optimis√©s pour chaque mod√®le: Oui")

print(f"\n3. MEILLEUR MOD√àLE:")
print(f"   - Nom: {best_result['model_name']}")
print(f"   - Features: {best_result['feature_config']}")
print(f"   - CV F1-Score: {best_result['best_cv_score']:.4f}")
print(f"   - Test F1-Score (weighted): {best_result['f1_weighted']:.4f}")
print(f"   - Test Accuracy: {best_result['accuracy']:.4f}")

print(f"\n4. TOP 3 MOD√àLES:")
for idx, row in results_df.head(3).iterrows():
    print(f"   {idx + 1}. {row['combination']} - F1: {row['f1_weighted']:.4f}")

print(f"\n5. FICHIERS SAUVEGARD√âS:")
print(f"   - pkl/modeling_results_gridsearch.pkl (tous les mod√®les et r√©sultats)")

print(f"\n6. AM√âLIORATION vs MOD√àLES DE BASE:")
print(f"   - GridSearchCV a optimis√© les hyperparam√®tres")
print(f"   - Cross-validation assure la g√©n√©ralisation")
print(f"   - Performance valid√©e sur ensemble de test")

print("\n" + "="*80)
print("MODELING AVEC GRIDSEARCHCV TERMIN√â AVEC SUCC√àS")
print("="*80)


R√âSUM√â DU MODELING AVEC GRIDSEARCHCV

1. MOD√àLES ENTRA√éN√âS:
   - Nombre de types de mod√®les: 6
   - Nombre de configurations de features: 4
   - Total de combinaisons test√©es: 20
   - Mod√®les ignor√©s pour 'Combined': 3

2. OPTIMISATION GRIDSEARCHCV:
   - Cross-validation: 3-fold
   - M√©trique d'optimisation: F1-Score weighted
   - Hyperparam√®tres optimis√©s pour chaque mod√®le: Oui

3. MEILLEUR MOD√àLE:
   - Nom: Random_Forest
   - Features: Count
   - CV F1-Score: 0.7057
   - Test F1-Score (weighted): 0.7449
   - Test Accuracy: 0.7561

4. TOP 3 MOD√àLES:
   10. Random_Forest_Count - F1: 0.7449
   7. Logistic_Regression_Count - F1: 0.7379
   1. Logistic_Regression_TF-IDF - F1: 0.7340

5. FICHIERS SAUVEGARD√âS:
   - pkl/modeling_results_gridsearch.pkl (tous les mod√®les et r√©sultats)

6. AM√âLIORATION vs MOD√àLES DE BASE:
   - GridSearchCV a optimis√© les hyperparam√®tres
   - Cross-validation assure la g√©n√©ralisation
   - Performance valid√©e sur ensemble de test

MODELI