In [57]:
import pandas as pd

import mysql.connector

from sqlalchemy import create_engine

from skimpy import skim

from datetime import datetime


# 0- importation des données en df

In [58]:
username = 'postgres'
password = 'admin'
host = '127.0.0.1'  # ou l'adresse IP de votre serveur PostgreSQL
port = '5432'  # le port par défaut pour PostgreSQL est 5432
database = 'preventive_maintenance'

# Créer l'URL de connexion
db_url = f'postgresql+psycopg2://{username}:{password}@{host}:{port}/{database}'

# Créer le moteur SQLAlchemy
engine = create_engine(db_url)

# Se connecter à la base de données
connexion = engine.connect()


In [59]:
df_aero = pd.read_sql_query('''
                  SELECT * 
                  FROM aeronefs
                  ''', connexion)


In [60]:
df_composants = pd.read_sql_query('''
                  SELECT * 
                  FROM composants
                  ''', connexion)

In [61]:
df_degradations = pd.read_sql_query('''
                  SELECT * 
                  FROM degradations
                  ''', connexion)

In [62]:
df_logs_vols = pd.read_sql_query('''
                  SELECT * 
                  FROM logs_vols
                  ''', connexion)

In [63]:
connexion.close()

# 1- merge des dataframes

In [64]:
# merge du degradations et composants en left join avec degradations en left

df_merge_deg_compo = pd.merge(df_degradations, df_composants, how='inner', left_on='compo_concerned', right_on='ref_compo')

In [65]:
df_merge_deg_compo.head(1)

Unnamed: 0,clef,ref_deg,linked_aero,compo_concerned,usure_nouvelle,measure_day,need_replacement,ref_compo,categorie,aero,desc_compo,lifespan,taux_usure_actuel,cout
0,D003661E170_6353REAE170-E170_6353-02024-06-04,D003661,E170_6353,REAE170-E170_6353-0,54.20848,2024-06-04,False,REAE170-E170_6353-0,Composants Critiques,E170_6353,Réacteur gauche,13045,53.308375,15552


In [66]:
# faire un group by par categoriee et ref_aero
df_merge_deg_compo_gb = df_merge_deg_compo.groupby(['aero', 'categorie', 'measure_day']).agg({'usure_nouvelle': ['max', 'mean']}).reset_index()

# Aplatir les colonnes
df_merge_deg_compo_gb.columns = ['_'.join(col).strip().replace(' ', '_') if col[1] else col[0] for col in df_merge_deg_compo_gb.columns.values]


In [67]:
# faire un pivot pour obtenir une seule ligne par avion
df_pivot = df_merge_deg_compo_gb.pivot_table(index=['aero', 'measure_day'], columns=['categorie'], values=['usure_nouvelle_max', 'usure_nouvelle_mean'], aggfunc='first').reset_index()

df_pivot.columns = ['_'.join([str(i) for i in col]).strip().replace(' ', '_')  for col in df_pivot.columns.values]


In [68]:
df_pivot['measure_day_'] = df_pivot['measure_day_'].astype('str')

In [69]:
# ajout d'une clé aero et mesure_day
df_pivot['cle'] = df_pivot.apply(lambda x: x['aero_'] + '_' + x['measure_day_'], axis=1)

In [70]:
df_logs_vols

Unnamed: 0,ref_vol,aero_linked,jour_vol,time_en_air,sensor_data,etat_voyant,temp,temp_unit,pressure,pressure_unit,vibrations,vibrations_unit,prediction_etat_voyant
0,V06783026,A320_6242,2024-06-04,6.2,"{'temp': '-14.7°C', 'pressure': '1009.1 hPa', ...",1,-14.7,°C,1009.1,hPa,0.015340,m/s²,0
1,V06810954,A380_5199,2024-06-04,2.1,"{'temp': '2.9°C', 'pressure': '1015.8 hPa', 'v...",0,2.9,°C,1015.8,hPa,2.317764,m/s²,0
2,V05201226,A350_3122,2024-06-04,7.7,"{'temp': '8.0°C', 'pressure': '934.2 hPa', 'vi...",0,8.0,°C,934.2,hPa,1.999378,m/s²,0
3,V05582404,B747_3165,2024-06-04,8.3,"{'temp': '7.3°C', 'pressure': '999.9 hPa', 'vi...",0,7.3,°C,999.9,hPa,0.875383,m/s²,0
4,V01713095,A350_6452,2024-06-04,9.8,"{'temp': '-11.4°C', 'pressure': '946.7 hPa', '...",0,-11.4,°C,946.7,hPa,1.551531,m/s²,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237,V08907808,A350_6495,2024-07-25,6.6,"{'temp': '13.8°C', 'pressure': '899.0 hPa', 'v...",0,13.8,°C,899.0,hPa,1.876524,m/s²,0
5238,V01846975,E175_5414,2024-07-25,2.7,"{'temp': '-24.2°C', 'pressure': '1051.4 hPa', ...",1,-24.2,°C,1051.4,hPa,0.888896,m/s²,0
5239,V09385402,E175_0620,2024-07-25,5.8,"{'temp': '1.9°C', 'pressure': '907.5 hPa', 'vi...",0,1.9,°C,907.5,hPa,3.648072,m/s²,0
5240,V04742938,B747_1460,2024-07-25,3.7,"{'temp': '-20.2°C', 'pressure': '1074.7 hPa', ...",1,-20.2,°C,1074.7,hPa,2.406679,m/s²,0


In [71]:
df_logs_vols['jour_vol'] = df_logs_vols['jour_vol'].astype('str')

In [72]:
# ajout d'une clé aero et mesure_day sur le df_logs_vols
df_logs_vols['cle'] = df_logs_vols.apply(lambda x: x['aero_linked'] + '_' + x['jour_vol'], axis=1)

In [73]:
# merge de df_log_vol et df_pivot

df_merge_pivot_logvol = pd.merge(df_pivot, df_logs_vols, how='inner', left_on='cle', right_on='cle')

In [74]:
# merge de df_aero et df_merge_pivot_logvol

df_merge_tot = pd.merge(df_merge_pivot_logvol, df_aero, how='inner', left_on='aero_linked', right_on='ref_aero')

In [75]:
df_merge_tot.shape

(5216, 28)

In [76]:
del df_aero, df_logs_vols, df_merge_pivot_logvol, df_degradations, df_composants

# 2- Nettoayage du df

In [77]:
# df_merge_tot.columns

In [78]:
# suppression des colonne n'apportant pas d'informations
df_merge_tot = df_merge_tot.drop(columns=[
                            'cle', 'ref_vol',
                            'aero_linked', 'jour_vol', 'sensor_data', 
                            'temp_unit', 'pressure_unit',
                            'vibrations_unit', 'ref_aero', 
                            'end_maint'])

In [79]:
pd.set_option("display.max_columns", None)

In [80]:
# skim(df_merge_total)

In [81]:
# modification du type pour date 
df_merge_tot['debut_service'] = pd.to_datetime(df_merge_tot['debut_service'], format='%Y-%m-%d')
df_merge_tot['last_maint'] = pd.to_datetime(df_merge_tot['last_maint'], format='%Y-%m-%d')
df_merge_tot['measure_day'] = pd.to_datetime(df_merge_tot['measure_day_'], format='%Y-%m-%d')

In [82]:
# modif type category
df_merge_tot['etat_voyant'] = df_merge_tot['etat_voyant'].astype('category')  

In [83]:
# Ajout d'une colonne temps de vol cumulé depuis la derniere maintenance

# Assumons que df_merge_total est le DataFrame fourni et qu'il contient les colonnes 'temps_de_vol', 'last_maint', 'jour_vol', et 'ref_aero'
# Trier le DataFrame par avion et par date de vol
df_merge_total_sort = df_merge_tot.sort_values(by=['aero_', 'measure_day']).reset_index(drop=True)

# Initialiser la colonne 'temps_de_vol_cumule' et 'etat_voyant_suivant' avec des zéros
df_merge_total_sort['temps_de_vol_cumule'] = 0
df_merge_total_sort['etat_voyant_suivant'] = 0

# Parcourir chaque avion
for avion in df_merge_total_sort['aero_'].unique():
    
    # Filtrer les données pour l'avion actuel
    df_avion = df_merge_total_sort[df_merge_total_sort['aero_'] == avion]
    
    # Variable pour stocker le temps de vol cumulé
    temps_de_vol_cumule = 0
    
    # Parcourir chaque ligne du DataFrame filtré pour l'avion actuel
    for i in df_avion.index:
      
        # Si la maintenance a eu lieu (en supposant que 'last_maint' est un booléen ou indicatif)
        if (i != 0 and ((df_merge_total_sort.loc[i, 'last_maint'].date() == df_merge_total_sort.loc[i, 'measure_day'].date()) or
            (df_merge_total_sort.loc[i-1, 'etat_voyant'] != 0))):
            # Réinitialiser le temps de vol cumulé
            temps_de_vol_cumule = 0
        
        # Ajouter le temps de vol actuel au temps de vol cumulé
        temps_de_vol_cumule += df_merge_total_sort.loc[i, 'time_en_air']
        
        # Mettre à jour la colonne 'temps_de_vol_cumule'
        df_merge_total_sort.loc[i, 'temps_de_vol_cumule'] = temps_de_vol_cumule

        if i < len(df_merge_total_sort)-1:
            df_merge_total_sort.loc[i,'etat_voyant_suivant'] = df_merge_total_sort.loc[i+1, 'etat_voyant'] 








  df_merge_total_sort.loc[i, 'temps_de_vol_cumule'] = temps_de_vol_cumule


In [84]:
df_merge_total_sort.head(5)

Unnamed: 0,aero_,measure_day_,usure_nouvelle_max_Composants_Critiques,usure_nouvelle_max_Composants_Majeurs,usure_nouvelle_max_Composants_Secondaires,usure_nouvelle_mean_Composants_Critiques,usure_nouvelle_mean_Composants_Majeurs,usure_nouvelle_mean_Composants_Secondaires,time_en_air,etat_voyant,temp,pressure,vibrations,prediction_etat_voyant,type_model,debut_service,last_maint,en_maintenance,measure_day,temps_de_vol_cumule,etat_voyant_suivant
0,A320_1884,2024-06-04,61.12666,60.78174,59.68462,28.15746,32.796355,29.171363,9.8,0,4.3,907.3,1.763677,0,A320,2015-12-26,2024-06-01,False,2024-06-04,9.8,1
1,A320_1884,2024-06-05,62.128398,61.547122,60.747186,29.045238,33.699044,30.416709,7.4,1,-24.3,905.9,3.428686,0,A320,2015-12-26,2024-06-01,False,2024-06-05,17.2,0
2,A320_1884,2024-06-08,62.466823,61.805697,61.106161,29.345163,34.004007,30.837434,2.5,0,-8.4,962.5,1.290613,0,A320,2015-12-26,2024-06-01,False,2024-06-08,2.5,0
3,A320_1884,2024-06-10,62.548045,61.867755,61.192315,29.417145,34.077197,30.938408,0.6,0,17.4,872.1,3.525162,0,A320,2015-12-26,2024-06-01,False,2024-06-10,3.1,0
4,A320_1884,2024-06-14,63.035377,62.240103,61.709239,29.849037,34.516343,31.544252,3.6,0,18.3,986.5,0.973155,0,A320,2015-12-26,2024-06-01,False,2024-06-14,6.7,0


In [85]:
# skim(df_merge_total)

In [86]:
df_merge_total_sort.to_parquet('data_preclean_avion.parquet')