In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import classification_report, f1_score, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler


In [14]:
acc_dtypes = {
    "place":"category",
    "catu":"category",
    "grav":"category",
    "sexe":"category",
    "trajet":"category",
    "locp":"category",
    "actp":"category",
    "etatp":"category",
    "secuUn":"category",
    "secuDeux":"category",
    "tranches_ages":"category",
    "catr":"category",
    "circ":"category",
    "vosp":"category",
    "prof":"category",
    "plan":"category",
    "surf":"category",
    "infra":"category",
    "situ":"category",
    "senc":"category",
    "obs":"category",
    "obsm":"category",
    "choc":"category",
    "manv":"category",
    "catv_Label":"category",
    "lum":"category",
    "agg":"category",
    "int":"category",
    "atm":"category",
    "col":"category",
    "jour_de_la_semaine":"category",
    "heure":"category",
    "dep": "category"
    }
df = pd.read_csv(r"C:\Users\maill\Documents\GitHub\SARA\data\fusion3.csv", low_memory=False)
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df = df.drop(['Unnamed: 0','num_acc','an_nais','an_naiss','age_acc_an','num_veh','senc','occutc','permis','secuDeux','date','com'], axis=1)
df['place'] = df['place'].astype('object')
df['dep'] = df['dep'].replace({'2A':201,'2B':202})
df['dep'] = df['dep'].astype('int64')
                               
df = df.dropna()

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2291745 entries, 0 to 2291796
Data columns (total 34 columns):
 #   Column              Dtype  
---  ------              -----  
 0   place               object 
 1   catu                object 
 2   grav                object 
 3   sexe                object 
 4   trajet              object 
 5   locp                object 
 6   actp                object 
 7   etatp               object 
 8   secuUn              object 
 9   tranches_ages       object 
 10  catr                object 
 11  circ                object 
 12  nbv                 float64
 13  vosp                object 
 14  prof                object 
 15  plan                object 
 16  surf                object 
 17  infra               object 
 18  situ                object 
 19  obs                 object 
 20  obsm                object 
 21  choc                object 
 22  manv                object 
 23  catv_Label          object 
 24  lum                 obje

In [11]:
X = df.drop('grav',axis=1)
y = df['grav']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
categorical_variable = make_column_selector(dtype_exclude=np.number)
tranformer = make_column_transformer((OneHotEncoder(), categorical_variable))

model = make_pipeline(tranformer,
                      RandomUnderSampler(),
                      RandomForestClassifier(random_state=42,verbose=2),verbose=2)

In [13]:
model.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
model.predict(X_test)

# Evaluation

In [None]:
def evaluation(model):
    model.fit(Xconfusion_matrixusion_matrixusion_matrixin, y_train)
    y_pred = model.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    N, train_score, val_score = learning_curve(model, X_train, y_train,
                                               cv=4, scoring='f1',
                                               train_sizes = np.linspace(0.1,1,10))
    plt.figure(figsize=(12,8))
    plt.plot(N, train_score.mean(axis=1), label='train_score')
    plt.plot(N, valcore.mean(axis=1), label='val_score')

In [None]:
pd.DataFrame(model.feature_importances_, index= X_train.colums).plot.bar()

# Optimisation

# Precision Recall Curve

In [None]:
precision, recall, threshold = precision_recall_curve(y_test, grid.best_estimator_.decision_function(X_test))

plt.plot(threshold, precision[:-1], label='precision')
plt.plot(threshold, recall[:-1], label='recall')
plt.legend()

In [None]:
def model_final(model, X, threshold=0):
    return model.decision_function(X) > threshold

In [None]:
y_pred = model_final(grid.best_estimator_, X_test, threshold=-1)

<h1>Encodage des variables</h1>

Les variables sont encodées une à une afin de pallier à un déficit de mémoire sur certaines machines.

In [4]:
df = pd.get_dummies(df, columns=['catu'])

In [5]:
le = LabelEncoder()
df['sexe'] = le.fit_transform(df['sexe'])

In [6]:
df = pd.get_dummies(df, columns = ['trajet'])

In [7]:
# Seuil pour le regroupement des catégories rares
#threshold = 0.02 

# Identifier les catégories rares
#rare_categories = df['locp'].value_counts(normalize=True)
#rare_categories = rare_categories[rare_categories < threshold].index

# Remplacer les catégories rares par 'Autre'
#df['locp'] = df['locp'].replace(rare_categories, 'Autre')

# One-Hot Encoding
df = pd.get_dummies(df, columns=['locp'])



In [8]:
# Seuil pour le regroupement des catégories rares
#threshold = 0.02 

# Identifier les catégories rares
#rare_categories = df['actp'].value_counts(normalize=True)
#rare_categories = rare_categories[rare_categories < threshold].index

# Remplacer les catégories rares par 'Autre'
#df['actp'] = df['actp'].replace(rare_categories, 'Autre')

# One-Hot Encoding
df = pd.get_dummies(df, columns=['actp'])

In [9]:
df = pd.get_dummies(df, columns = ['etatp'])

In [10]:
# Seuil pour le regroupement des catégories rares
#threshold = 0.01

# Identifier les catégories rares
#rare_categories = df['secuUn'].value_counts(normalize=True)
#rare_categories = rare_categories[rare_categories < threshold].index

# Remplacer les catégories rares par 'Autre'
#df['secuUn'] = df['secuUn'].replace(rare_categories, 'Autre')

# One-Hot Encoding
df = pd.get_dummies(df, columns=['secuUn'])

In [11]:
df = pd.get_dummies(df, columns = ['tranches_ages'])

In [12]:
# Seuil pour le regroupement des catégories rares
#threshold = 0.01

# Identifier les catégories rares
#rare_categories = df['catr'].value_counts(normalize=True)
#rare_categories = rare_categories[rare_categories < threshold].index

# Remplacer les catégories rares par 'Autre'
#df['catr'] = df['catr'].replace(rare_categories, 'Autre')

# One-Hot Encoding
df = pd.get_dummies(df, columns=['catr'])

In [13]:
# Seuil pour le regroupement des catégories rares
#threshold = 0.01

# Identifier les catégories rares
#rare_categories = df['circ'].value_counts(normalize=True)
#rare_categories = rare_categories[rare_categories < threshold].index

# Remplacer les catégories rares par 'Autre'
#df['circ'] = df['circ'].replace(rare_categories, 'Autre')

# One-Hot Encoding
df = pd.get_dummies(df, columns=['circ'])

In [14]:
# Binary Encoding
df['vosp'] = df['vosp'].apply(lambda x: 0 if x == 'Sans objet(0)' else 1)

In [15]:
# Seuil pour le regroupement des catégories rares
#threshold = 0.05

# Identifier les catégories rares
#rare_categories = df['prof'].value_counts(normalize=True)
#rare_categories = rare_categories[rare_categories < threshold].index

# Remplacer les catégories rares par 'Autre'
#df['prof'] = df['prof'].replace(rare_categories, 'Autre')

# One-Hot Encoding
df = pd.get_dummies(df, columns=['prof'])

In [16]:
# Seuil pour le regroupement des catégories rares
#threshold = 0.05

# Identifier les catégories rares
#rare_categories = df['plan'].value_counts(normalize=True)
#rare_categories = rare_categories[rare_categories < threshold].index

# Remplacer les catégories rares par 'Autre'
#df['plan'] = df['plan'].replace(rare_categories, 'Autre')

# One-Hot Encoding
df = pd.get_dummies(df, columns=['plan'])

In [17]:
# Seuil pour le regroupement des catégories rares
#threshold = 0.02

# Identifier les catégories rares
#rare_categories = df['surf'].value_counts(normalize=True)
#rare_categories = rare_categories[rare_categories < threshold].index

# Remplacer les catégories rares par 'Autre'
#df['surf'] = df['surf'].replace(rare_categories, 'Autre')

# One-Hot Encoding
df = pd.get_dummies(df, columns=['surf'])

In [18]:
# Seuil pour le regroupement des catégories rares
#threshold = 0.02

# Identifier les catégories rares
#rare_categories = df['infra'].value_counts(normalize=True)
#rare_categories = rare_categories[rare_categories < threshold].index

# Remplacer les catégories rares par 'Autre'
#df['infra'] = df['infra'].replace(rare_categories, 'Autre')

# One-Hot Encoding
df = pd.get_dummies(df, columns=['infra'])

In [19]:
# Seuil pour le regroupement des catégories rares
#threshold = 0.05

# Identifier les catégories rares
#rare_categories = df['situ'].value_counts(normalize=True)
#rare_categories = rare_categories[rare_categories < threshold].index

# Remplacer les catégories rares par 'Autre'
#df['situ'] = df['situ'].replace(rare_categories, 'Autre')

# One-Hot Encoding
df = pd.get_dummies(df, columns=['situ'])

In [20]:
# Seuil pour le regroupement des catégories rares
#threshold = 0.02

# Identifier les catégories rares
#rare_categories = df['obs'].value_counts(normalize=True)
#rare_categories = rare_categories[rare_categories < threshold].index

# Remplacer les catégories rares par 'Autre'
#df['obs'] = df['obs'].replace(rare_categories, 'Autre')

# One-Hot Encoding
df = pd.get_dummies(df, columns=['obs'])

In [21]:
# Seuil pour le regroupement des catégories rares
#threshold = 0.01

# Identifier les catégories rares
#rare_categories = df['obsm'].value_counts(normalize=True)
#rare_categories = rare_categories[rare_categories < threshold].index

# Remplacer les catégories rares par 'Autre'
#df['obsm'] = df['obsm'].replace(rare_categories, 'Autre')

# One-Hot Encoding
df = pd.get_dummies(df, columns=['obsm'])

In [22]:
# Regrouper la catégorie "Non renseigné" avec "Aucun"
df['choc'] = df['choc'].replace('Non renseigné', 'Aucun')

# Effectuer le one-hot encoding
df = pd.get_dummies(df, columns=['choc'])


In [23]:
frequency_encoding = df['manv'].value_counts(normalize=True)
df['manv'] = df['manv'].map(frequency_encoding)

In [24]:
frequency_encoding = df['catv_Label'].value_counts(normalize=True)
df['catv_Label'] = df['catv_Label'].map(frequency_encoding)

In [25]:
frequency_encoding = df['dep'].value_counts(normalize=True)
df['dep'] = df['dep'].map(frequency_encoding)

In [26]:
frequency_encoding = df['com'].value_counts(normalize=True)
df['com'] = df['com'].map(frequency_encoding)

In [27]:
df = pd.get_dummies(df, columns = ['lum'])

In [28]:
df['agg'] = df['agg'].replace({'En agglomération': 1, 'Hors agglomération': 0})

In [29]:
df = pd.get_dummies(df, columns = ['int'])

In [30]:
df = pd.get_dummies(df, columns = ['atm'])

In [31]:
df = pd.get_dummies(df, columns = ['col'])

In [32]:
df = pd.get_dummies(df, columns = ['jour_de_la_semaine'])

In [33]:
df.shape

(2291739, 195)

In [34]:
df.columns

Index(['place', 'grav', 'sexe', 'nbv', 'vosp', 'manv', 'catv_Label', 'agg',
       'com', 'dep',
       ...
       'col_Sans collision',
       'col_Trois véhicules et plus - collisions multiples',
       'col_Trois véhicules et plus – en chaîne',
       'jour_de_la_semaine_Dimanche', 'jour_de_la_semaine_Jeudi',
       'jour_de_la_semaine_Lundi', 'jour_de_la_semaine_Mardi',
       'jour_de_la_semaine_Mercredi', 'jour_de_la_semaine_Samedi',
       'jour_de_la_semaine_Vendredi'],
      dtype='object', length=195)

In [35]:
X = df.drop('grav',axis=1)
y = df['grav']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<h1>Rééquilibrage de classe</h1>

In [72]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)

# Rééchantillonnage de l'ensemble d'entraînement
X_train, y_train = smote.fit_resample(X_train, y_train)

<h1>Réduction de dimension</h1>

<h1>Entrainement du modèle</h1>

In [36]:
#Utilisation des poids de classe pour l'entrainement du modèle RandomForest
# Calculer la fréquence des classes
class_counts = y_train.value_counts()
class_freq = class_counts / len(y_train)

# Inverser la fréquence pour obtenir les poids
class_weights = 1 / class_freq

# Créer un dictionnaire des poids
weights_dict = class_weights.to_dict()
print(weights_dict)

{'Indemne': 2.44364813639926, 'Blessé léger': 2.776838562713179, 'Blessé hospitalisé': 4.903570310974648, 'Tué': 37.42378036333945}


In [74]:
# Entrainement du modèle
from time import time

t0 = time()
model = RandomForestClassifier(class_weight=weights_dict, n_jobs= -1, random_state=42)

model.fit(X_train, y_train)
t1 = time() - t0
print("Réalisé en {} secondes".format(round(t1,3)))

Réalisé en 2025.986 secondes


<h1>Calcul des métriques d'évaluation</h1>

In [75]:
model.score(X_test, y_test)

0.6654725230610802

In [76]:
model.score(X_train, y_train)

0.998942377923622

In [77]:
t0 = time()
y_pred = model.predict(X_test)
t1 = time() - t0
print("Réalisé en {} secondes".format(round(t1,3)))

print(pd.crosstab(y_test, y_pred, rownames=['Classe réelle'], colnames=['Classe prédite']))
print(classification_report(y_test, y_pred))

Réalisé en 7920.002 secondes
Classe prédite      Blessé hospitalisé  Blessé léger  Indemne   Tué
Classe réelle                                                      
Blessé hospitalisé               48208         29870    14444  1122
Blessé léger                     24776         99783    40171   312
Indemne                          10233         20848   156163   231
Tué                               8158          1740     1425   864
                    precision    recall  f1-score   support

Blessé hospitalisé       0.53      0.51      0.52     93644
      Blessé léger       0.66      0.60      0.63    165042
           Indemne       0.74      0.83      0.78    187475
               Tué       0.34      0.07      0.12     12187

          accuracy                           0.67    458348
         macro avg       0.57      0.51      0.51    458348
      weighted avg       0.65      0.67      0.66    458348



In [78]:
from imblearn.metrics import geometric_mean_score

geometric_mean_score(y_test, y_pred)

0.3682037755535663

In [79]:
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.5058169767025149

In [80]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

score = cross_val_score(model, X, y, cv=cv, scoring='roc_auc_ovo', verbose=2, n_jobs = -1)

print(f"scores ROC AUC OvO pour chaque pli: {score}")
print(f"Moyenne des scores ROC AUC OvO : {score.mean()}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 11.7min remaining: 17.6min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 12.3min finished
5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\maill\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\maill\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 327, in fit
    X, y = self._validate_data(
  File "C:\Users\maill\anaconda3\lib\site-packages\sklearn\ba

scores ROC AUC OvO pour chaque pli: [nan nan nan nan nan]
Moyenne des scores ROC AUC OvO : nan


In [37]:
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

steps = [('under', RandomUnderSampler()), ('model', RandomForestClassifier(class_weight=weights_dict, n_jobs= -1, random_state=42))]
pipeline = Pipeline(steps=steps)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='roc_auc_ovo', cv=cv, verbose=2, n_jobs=-1)
score = mean(scores)
print('ROC_AUC score: %.3f' % score)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 218.7min finished


ROC_AUC score: 0.819


In [38]:
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

steps = [('under', RandomUnderSampler()), ('model', RandomForestClassifier(class_weight=weights_dict, n_jobs= -1, random_state=42))]
pipeline = Pipeline(steps=steps)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='f1_micro', cv=cv, verbose=2, n_jobs=-1)
score = mean(scores)
print('f1_micro score: %.3f' % score)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 189.7min finished


f1_micro score: 0.585


In [40]:
print(pipeline.fit(X_train, y_train).score(X_train, y_train))
print(pipeline.score(X_test, y_test))

0.6291227566841988
0.5835718711546685
