In [1]:
import os 
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import models
from tensorflow.keras import optimizers, metrics
from tensorflow.keras.layers.experimental.preprocessing import Normalization

In [2]:
def charac_clean_data_12_18():
    """ Fonction qui nettoies les données des années 2011 à 2018
        et retourne un data frame de toutes ces données
    """
    def update_format(dep):
        if dep > 90:
            dep = str(dep)
            return dep[:2]
        dep = str(dep)
        return dep[:1]
    directory = "data/raw_data/carasteristiques/1"
    files = [file for file in os.listdir(directory) if file.startswith("caracteristiques_") and file.endswith(".csv")]
    data = pd.DataFrame()
    for file in files:
        df = pd.read_csv(os.path.join(directory, file), sep=',', encoding='ISO-8859-1', engine='python')
        data = pd.concat([data, df], ignore_index=True)
    data = data.drop(columns=["hrmn", "com", "adr", "lat", "long", "gps"])
    data["dep"] = data["dep"].apply(update_format)
    data = data.astype({'dep': int})
    deps_to_delete = [971, 972, 973, 974, 976, 201, 202, 97]
    data = data.drop(data[data['dep'].isin(deps_to_delete)].index)
    print("Cleaning Caractéristique de 2012 à 2018 -> Done")
    return data

def charac_clean_data_19_21():
    """ Fonction qui nettoies les données des années 2019 à 2021
        et retourne un data frame de toutes ces données
    """
    print("Cleaning Characteristics de 2019 à 2021 ...")
    directory = "data/raw_data/carasteristiques/2"
    files = [file for file in os.listdir(directory) if file.startswith("caracteristiques_") and file.endswith(".csv")]
    data = pd.DataFrame()

    for file in files:
        df2 = pd.read_csv(os.path.join(directory, file), sep=';', encoding='ISO-8859-1', engine='python')
        data = pd.concat([data, df2], ignore_index=True)

    data = data.drop(columns=[ "hrmn", "com", "adr", "lat", "long"])
    deps_to_delete = ["972", "2B", "973", "2A", "987", "986", "971", "977", "978", "975", "988", "976", "974" ]
    data = data.drop(data[data['dep'].isin(deps_to_delete)].index)
    data = data.astype({'dep': int})
    print("Cleaning Characteristics de 2019 à 2021 -> Done")
    return data

def concatenate_function(name, data_1 :pd.DataFrame, data_2 :pd.DataFrame) -> pd.DataFrame:
    """ Fusionne les deux data fram de caracéristique et retourne le
        data farm avec les accidents dans lo'dre d'arrivé
    """
    print(f"Fusion des Dataframe {name} 2011-2018 et 2019-2021 ...")
    return pd.concat([data_1, data_2], ignore_index=True).sort_values(by="Num_Acc")

def clean_characteristics_data():
    data_carac = concatenate_function("Characteristics", charac_clean_data_12_18(), charac_clean_data_19_21())
    print(f"Fusion des Dataframe Characteristics 2011-2018 et 2019-2021 -> Done")
    return data_carac

In [3]:
def prepare_data_for_model(data:pd.DataFrame):
    print("Preparing data for the model -> ...")
    data.an = data.an.map({11:11,
                           12:12,
                           13:13,
                           14:14,
                           15:15,
                           16:16,
                           17:17,
                           18:18,
                           2019:19,
                           2020:20,
                           2021:21})
    data['date'] = pd.to_datetime(data['an']*10000 + data['mois']*100 + data['jour'], format='%y%m%d')
    data = data.drop(columns=['an', 'mois','jour', 'lum', 'agg', 'int', 'atm', 'col'])
    print("Preparing data for the model -> Done")

    return data

In [4]:
data = clean_characteristics_data()

Cleaning Caractéristique de 2012 à 2018 -> Done
Cleaning Characteristics de 2019 à 2021 ...
Cleaning Characteristics de 2019 à 2021 -> Done
Fusion des Dataframe Characteristics 2011-2018 et 2019-2021 ...
Fusion des Dataframe Characteristics 2011-2018 et 2019-2021 -> Done


In [5]:
data = prepare_data_for_model(data)

Preparing data for the model -> ...
Preparing data for the model -> Done


In [6]:
data =pd.pivot_table(data, values='Num_Acc', index='date', columns='dep', aggfunc='count')

In [7]:
data =data.reset_index().resample('W', on='date').count()

In [8]:
def create_split(data, split_value):   
    # Calculer le nombre de lignes à inclure dans l'ensemble d'entraînement
    train_size = int(len(data) * split_value)

    # Diviser le dataframe en ensembles d'entraînement et de test en utilisant iloc
    train_df = data.iloc[:train_size,:]
    test_df = data.iloc[train_size:,:]

    return train_df, test_df

In [9]:
data_train, data_test = create_split(data, 0.7)

In [10]:
data_train

dep,1,2,3,4,5,6,7,8,9,10,...,86,87,88,89,90,91,92,93,94,95
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-02,1,1,0,0,0,2,1,0,0,0,...,0,1,0,0,0,2,1,2,2,2
2011-01-09,5,3,1,2,0,7,1,1,0,3,...,6,7,3,4,5,7,7,7,7,7
2011-01-16,3,3,5,1,3,7,2,3,2,3,...,4,6,2,3,3,7,7,7,7,6
2011-01-23,4,4,3,3,1,6,2,2,1,6,...,4,5,3,4,3,7,7,7,7,7
2011-01-30,4,4,4,2,0,7,1,0,2,2,...,3,2,2,6,2,6,7,7,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-08-12,4,4,3,3,1,7,4,2,1,4,...,6,4,4,3,1,5,7,7,7,4
2018-08-19,4,3,3,5,4,7,3,4,0,5,...,4,4,5,4,1,7,7,7,7,4
2018-08-26,4,1,4,5,5,7,3,1,3,5,...,2,5,2,2,2,5,7,7,7,3
2018-09-02,5,4,2,4,3,7,4,3,2,3,...,5,7,1,3,1,7,7,7,7,6


In [11]:
data_test

dep,1,2,3,4,5,6,7,8,9,10,...,86,87,88,89,90,91,92,93,94,95
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-09-16,5,2,3,4,3,7,3,4,4,7,...,4,4,2,4,0,7,7,7,7,7
2018-09-23,6,2,3,1,6,7,3,1,2,6,...,3,5,2,3,1,7,7,7,7,7
2018-09-30,4,4,1,4,2,7,3,3,3,7,...,4,6,5,3,0,7,7,7,7,6
2018-10-07,5,6,2,4,3,7,2,3,3,5,...,2,3,3,2,2,6,7,7,7,7
2018-10-14,6,5,4,4,5,7,6,1,1,5,...,3,3,2,5,2,6,6,7,7,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-05,5,2,4,2,3,6,2,2,2,5,...,3,5,1,4,0,7,7,7,7,7
2021-12-12,5,4,5,2,4,6,2,1,3,5,...,3,6,1,2,0,7,7,7,7,7
2021-12-19,5,4,4,4,4,6,3,3,3,6,...,2,6,1,2,0,7,7,7,7,7
2021-12-26,4,5,2,3,3,6,3,1,3,3,...,3,4,2,1,2,6,7,7,7,7


In [12]:
def create_sequences(data, dep, input_length=104, output_length=52, gap=0):
    """
    Découpe les données en séquences de taille spécifiée avec un décalage d'un mois à chaque fois.

    :param data: DataFrame contenant les données à découper en séquences
    :param sequence_length: Longueur des séquences
    :return: Numpy array contenant les séquences
    """
    data = data[[dep]]
    X, y = [], []
    
    
    for i in range(len(data) - (input_length + output_length + gap)): # Pour i = 0
        Xi = data.iloc[i:i + input_length].values # [0:24]
        X.append(Xi)
        yi = data.iloc[i + input_length + gap : i + input_length + gap + output_length].values # [24+0:36]
        y.append(yi)
    

    return np.array(X),np.array(y)

In [13]:
def get_Xy(data, shuffle = True, input_length=12, output_length=4, gap=0):
    X, y = [], []
    for i in range(1, 96):
        Xi, yi = create_sequences(data, i, input_length, output_length, gap)
        # if len(yi) < output_length:
        #     break
        X.append(Xi)
        y.append(yi)           
        
    X = np.array(X)
    y = np.array(y)
    y = np.squeeze(y)
    
    X = X.reshape(-1, input_length, 1)
    y = y.reshape(-1, output_length)

    if shuffle:
        idx = np.arange(len(X))
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]
        
    return X, y

In [14]:
X_train, y_train = get_Xy(data_train)

In [15]:
X_train.shape

(23370, 104, 1)

In [16]:
X_test, y_test = get_Xy(data_test)

In [17]:
X_test.shape

(1615, 104, 1)

In [18]:
def dummy_predictor(X,output_length=52):
    X =  X.mean(axis=1)
    if output_length==1 : return X
    res = np.append(X,X,axis=1)
    for _ in range(output_length-2):
        res = np.append(res,X,axis=1)
    return res

y_pred_baseline=dummy_predictor(X_train)
print(y_pred_baseline.shape)
print(y_train.shape)
print(f"mae : {mean_absolute_error(y_pred_baseline,y_train)}")
print(f"mape : {mean_absolute_percentage_error(y_pred_baseline,y_train)}")

y_pred_baseline=dummy_predictor(X_test)
print(y_pred_baseline.shape)
print(y_test.shape)
print(f"mae : {mean_absolute_error(y_pred_baseline,y_test)}")
print(f"mape : {mean_absolute_percentage_error(y_pred_baseline,y_test)}")

(23370, 52)
(23370, 52)
mae : 0.926076409724701
mape : 0.2969990027591004
(1615, 52)
(1615, 52)
mae : 0.9950361349771917
mape : 0.33103977003182933


In [20]:
def init_model(X_train, y_train):    
    # 0 - Normalization
    # ======================    
    normalizer = Normalization(input_shape=(104,1))
    normalizer.adapt(X_train)
    
    # 1 - RNN architecture
    # ======================    
    model = models.Sequential()

    model.add(normalizer)
    # ## 1.1 - Recurrent Layer
    model.add(layers.LSTM(64, 
                          activation='tanh', 
                          return_sequences = False,
                          recurrent_dropout = 0.3))
    ## 1.2 - Predictive Dense Layers
    output_length = y_train.shape[1]
    model.add(layers.Dense(output_length, activation='linear'))

    # # 2 - Compiler
    # # ======================    
    rmsprop = optimizers.RMSprop(learning_rate=0.02)    
    model.compile(loss='mse', optimizer=rmsprop, metrics=["mae","mape"])
    
    return model

model = init_model(X_train, y_train)
model.summary()

es = EarlyStopping(patience=10, restore_best_weights=True)
model.fit(X_train, y_train, epochs=100, batch_size=16, verbose=1, validation_split=0.2, callbacks = [es])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization_1 (Normalizat  (None, 104, 1)           3         
 ion)                                                            
                                                                 
 lstm_1 (LSTM)               (None, 64)                16896     
                                                                 
 dense_1 (Dense)             (None, 52)                3380      
                                                                 
Total params: 20,279
Trainable params: 20,276
Non-trainable params: 3
_________________________________________________________________
Epoch 1/100
Epoch 2/100

KeyboardInterrupt: 