In [7]:
# Bloc complet : pipeline complet pour la prédiction du Mode de Propulsion

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 1. Load the data
# NOTE: this is an absolute path on the original author's machine; point
# DATA_PATH at your own copy of the workbook when running elsewhere.
DATA_PATH = r'C:\Users\pieta\OneDrive\Bureau\tracéo_justin\Usual_analysis\data\Expdatatangedpoint05_07_22_testhadrien.xlsx'
df = pd.read_excel(DATA_PATH)

# 2. Basic cleaning
# Drop the columns that are not used anywhere in the rest of the analysis.
# Reassigning instead of using inplace=True avoids mutating the loaded frame in place.
df = df.drop(columns=['Tool ID', 'Shoot ID', 'Fracture ID', 'Type'])

# Numeric measurements: entries that cannot be parsed become NaN (errors='coerce').
for num_col in ['Propagation Phase Lenght', 'Penetration']:
    df[num_col] = pd.to_numeric(df[num_col], errors='coerce')

# 3. Categorical descriptors, stored as plain strings for the ordinal encoder.
cat_cols = [
    'Impacted material', 'State after shoot', 'Initation', 'Locus',
    'Location of initiation', 'Profile of initation', 'General Direction',
    'Location of termination', 'Termination', 'fracture composition',
    'Fracture part', 'Fracture group', 'enought traces for determination',
    'attribute group',
]
# A single cast to str is enough: the previous intermediate cast to 'category'
# was immediately overwritten by this conversion.
# NOTE(review): astype(str) typically renders missing cells as the literal text
# 'nan', so a later imputer will not see them as missing — confirm this is intended.
df[cat_cols] = df[cat_cols].astype(str)


# 4. Feature matrix and target vector
feature_cols = [*cat_cols, 'Penetration']
X = df.loc[:, feature_cols]
y = df.loc[:, 'Mode of Propulsion']

# 5. Integer-encode the class labels; `le` is kept to decode predictions later
le = LabelEncoder()
y_enc = le.fit_transform(y)

# 6. Stratified hold-out split (20 % test set, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    stratify=y_enc,
    random_state=42,
)

# 7. Preprocessing pipeline
# The categorical columns hold strings, so the placeholder for missing entries
# must be a string as well: an integer placeholder would mix str and int values
# in one column and make OrdinalEncoder fail as soon as a value is imputed.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # Categories never seen during fit are encoded as -1 instead of raising.
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])
# Columns not listed in cat_cols are forwarded unchanged, after the encoded ones.
# NOTE(review): 'Penetration' is passed through without imputation; confirm the
# installed scikit-learn version accepts NaN in the forest if the column has gaps.
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_cols)
], remainder='passthrough')

# 8. Random Forest model (shallow trees, fixed seed)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42))
])
pipeline.fit(X_train, y_train)

# 9. Feature-importance analysis
fitted_preprocessor = pipeline.named_steps['preprocessor']
fitted_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['encoder']

# Encoded categorical columns come first, followed by the passthrough 'Penetration'
encoded_names = fitted_encoder.get_feature_names_out(cat_cols)
all_feature_names = [*encoded_names, 'Penetration']

# One importance value per input feature, ranked from most to least important
importances = pipeline.named_steps['classifier'].feature_importances_
importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("\nTop 10 des variables les plus importantes:")
print(importance_df.head(10))

# 10. Final evaluation on the held-out test set
y_pred = pipeline.predict(X_test)

# Decode the integer codes back to the original class labels before reporting
true_labels = le.inverse_transform(y_test)
predicted_labels = le.inverse_transform(y_pred)

print("\nRapport de classification:")
print(classification_report(true_labels, predicted_labels))



Top 10 des variables les plus importantes:
                             Feature  Importance
14                       Penetration    0.310062
6                  General Direction    0.105277
8                        Termination    0.098073
12  enought traces for determination    0.080016
0                  Impacted material    0.077043
2                          Initation    0.066196
5               Profile of initation    0.049861
3                              Locus    0.043172
4             Location of initiation    0.041023
13                   attribute group    0.039253

Rapport de classification:
               precision    recall  f1-score   support

          Bow       1.00      0.25      0.40         4
Spear thrower       0.67      0.73      0.70        11
     Throwing       0.50      0.56      0.53         9
    Thrusting       0.77      0.83      0.80        12

     accuracy                           0.67        36
    macro avg       0.73      0.59      0.61        36
 weighted avg       0.70      0.67      0.66        36

In [11]:
# Bloc : Entraînement final du modèle avec les variables importantes uniquement

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Number of top-ranked features kept for the reduced model.
# Kept in one named constant so the comment and the code cannot drift apart.
N_TOP_FEATURES = 10

# Automatically select the most important columns from the importance ranking
ranked_features = importance_df.sort_values(by='Importance', ascending=False)
important_cols = ranked_features['Feature'].head(N_TOP_FEATURES).tolist()

# Rebuild X and y using only the selected columns
X_final = df[important_cols]
y_final = df['Mode of Propulsion']

# Encode the target; this refits the shared encoder `le` on the target column
y_final_enc = le.fit_transform(y_final)

# Final split (stratified, 20 % test set, fixed seed)
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_final, y_final_enc, stratify=y_final_enc, test_size=0.2, random_state=42)

# Categorical columns: every selected feature other than 'Penetration'
final_cat_cols = [name for name in important_cols if name != 'Penetration']

# Categorical branch: fill gaps with a 'missing' label, then integer-encode
# (categories unseen during fit are mapped to -1)
final_cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Columns outside final_cat_cols are passed through untouched
final_preprocessor = ColumnTransformer(
    transformers=[('cat', final_cat_pipeline, final_cat_cols)],
    remainder='passthrough',
)

# Preprocessing followed by a shallow random forest with a fixed seed
final_classifier = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
final_pipeline = Pipeline(steps=[
    ('preprocessor', final_preprocessor),
    ('classifier', final_classifier),
])

# Training
final_pipeline.fit(X_train_f, y_train_f)

# Predict on the held-out test set
final_pred = final_pipeline.predict(X_test_f)

# Decode true and predicted integer codes back to class labels
y_test_decoded, y_pred_decoded = (
    le.inverse_transform(codes) for codes in (y_test_f, final_pred)
)

# Final report
final_report = classification_report(y_test_decoded, y_pred_decoded)
print("\nRapport de classification (modèle final avec variables importantes):")
print(final_report)




Rapport de classification (modèle final avec variables importantes):
               precision    recall  f1-score   support

          Bow       1.00      0.50      0.67         4
Spear thrower       0.69      0.82      0.75        11
     Throwing       0.57      0.44      0.50         9
    Thrusting       0.79      0.92      0.85        12

     accuracy                           0.72        36
    macro avg       0.76      0.67      0.69        36
 weighted avg       0.73      0.72      0.71        36



Très bon score global, surtout pour un modèle réduit à 10 variables 🎯

---

### 🔍 Analyse rapide :

| Classe            | Precision | Recall | F1-score | Remarque                                                   |
| ----------------- | --------- | ------ | -------- | ---------------------------------------------------------- |
| **Bow**           | 1.00      | 0.50   | 0.67     | 🔥 Précision parfaite, mais manque de rappel (2/4 trouvés) |
| **Spear thrower** | 0.69      | 0.82   | 0.75     | ✅ Très bon résultat, stable                                |
| **Throwing**      | 0.57      | 0.44   | 0.50     | ⚠️ Plus difficile à distinguer                             |
| **Thrusting**     | 0.79      | 0.92   | 0.85     | ✅ Excellente performance                                   |

---

### 📊 Bilan :

* **Accuracy globale : 72%** → très solide pour un modèle réduit
* **Weighted F1 : 0.71** → bien équilibré
* La classe **"Throwing"** reste la plus faible (rappel de 0.44, soit 4 cas sur 9 retrouvés) : peut-être un manque de données ou une confusion sémantique

---

### ✅ Prochaines améliorations possibles :

* Ajouter 1 ou 2 variables de plus (top 12 au lieu de 10) pour voir si la classe “Throwing” s’améliore
* Suréchantillonnage léger de “Bow” ou “Throwing” si ce sont des classes minoritaires
* Tester `class_weight='balanced'` dans `RandomForestClassifier`

