In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Charger les fichiers airlines et airports
airlines = pd.read_csv("../data/airlines.csv")  # Suppose une colonne "AIRLINE" existe
airports = pd.read_csv("../data/airports.csv")  # Suppose une colonne "AIRPORT" existe

# Obtenir les listes d'airlines et d'airports
airline_list = airlines['IATA_CODE'].tolist()
airport_list = airports['IATA_CODE'].tolist()

# Fonction pour générer une ligne de données
def generate_row():
    month = random.randint(1, 12)  # Mois entre 1 et 12
    origin_airport = random.choice(airport_list)
    destination_airport = random.choice(airport_list)
    while destination_airport == origin_airport:
        destination_airport = random.choice(airport_list)
    
    departure_delay = round(random.uniform(-20.0, 120.0), 1)  # Retard au départ en minutes
    scheduled_time = round(random.uniform(30.0, 300.0), 1)  # Durée prévue du vol en minutes
    scheduled_arrival = datetime(2024, month, random.randint(1, 28), 
                              random.randint(0, 23), random.randint(0, 59))  # Date et heure départ
    air_system_delay = round(random.uniform(0.0, 30.0), 1)  # Retard système
    security_delay = round(random.uniform(0.0, 15.0), 1)  # Retard sécurité
    airline_delay = round(random.uniform(-10.0, 30.0), 1)  # Retard de la compagnie
    late_aircraft_delay = round(random.uniform(0.0, 20.0), 1)  # Retard avion précédent
    weather_delay = round(random.uniform(0.0, 10.0), 1)  # Retard météo
    airline = random.choice(airline_list)
    departure_time = datetime(2024, month, random.randint(1, 28), 
                              random.randint(0, 23), random.randint(0, 59))  # Date et heure départ
    


    ['MONTH', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'DEPARTURE_DELAY',
       'SCHEDULED_TIME', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DEPARTURE_TIME_HOUR',
       'DEPARTURE_TIME_MINUTE', 'SCHEDULED_ARRIVAL_HOUR',
       'SCHEDULED_ARRIVAL_MINUTE', 'AIRLINE_AA', 'AIRLINE_AS', 'AIRLINE_B6',
       'AIRLINE_DL', 'AIRLINE_EV', 'AIRLINE_F9', 'AIRLINE_HA', 'AIRLINE_MQ',
       'AIRLINE_NK', 'AIRLINE_OO', 'AIRLINE_UA', 'AIRLINE_US', 'AIRLINE_VX',
       'AIRLINE_WN']
    return {
        "MONTH": month,
        "ORIGIN_AIRPORT": origin_airport,
        "DESTINATION_AIRPORT": destination_airport,
        "DEPARTURE_DELAY": departure_delay,
        "SCHEDULED_TIME": scheduled_time,
        "AIR_SYSTEM_DELAY": air_system_delay,
        "SECURITY_DELAY": security_delay,
        "AIRLINE_DELAY": airline_delay,
        "LATE_AIRCRAFT_DELAY": late_aircraft_delay,
        "WEATHER_DELAY": weather_delay,
        "DEPARTURE_TIME": departure_time,
        "SCHEDULED_ARRIVAL": scheduled_arrival,
        "AIRLINE": airline
    }

# Générer plusieurs lignes
def generate_data(n=100):
    data = [generate_row() for _ in range(n)]
    return pd.DataFrame(data)

# Génération des données
synthetic_data = generate_data(1)  # Exemple pour 10 lignes
print(synthetic_data)




   MONTH ORIGIN_AIRPORT DESTINATION_AIRPORT  DEPARTURE_DELAY  SCHEDULED_TIME  \
0      5            PBI                 GTR            -13.6           265.6   

   AIR_SYSTEM_DELAY  SECURITY_DELAY  AIRLINE_DELAY  LATE_AIRCRAFT_DELAY  \
0              28.8             3.8            0.5                  6.3   

   WEATHER_DELAY      DEPARTURE_TIME SCHEDULED_ARRIVAL AIRLINE  
0            2.2 2024-05-28 11:22:00        2024-05-04      HA  


In [2]:
import sys  
#sys.path.insert(0, 'C:/Users/debgn/MLOPS/backend/src/')
sys.path.insert(0, '../backend/src/')
import preprocessing
from preprocessing import preprocess
a= preprocess(synthetic_data)

Données encodées :    MONTH  ORIGIN_AIRPORT  DESTINATION_AIRPORT  DEPARTURE_DELAY  \
0      5             542                  442            -13.6   

   SCHEDULED_TIME  AIR_SYSTEM_DELAY  SECURITY_DELAY  AIRLINE_DELAY  \
0           265.6              28.8             3.8            0.5   

   LATE_AIRCRAFT_DELAY  WEATHER_DELAY  ...  AIRLINE_EV  AIRLINE_F9  \
0                  6.3            2.2  ...           0           0   

   AIRLINE_HA  AIRLINE_MQ  AIRLINE_NK  AIRLINE_OO  AIRLINE_UA  AIRLINE_US  \
0         1.0           0           0           0           0           0   

   AIRLINE_VX  AIRLINE_WN  
0           0           0  

[1 rows x 28 columns]


In [3]:
a.columns

Index(['MONTH', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'DEPARTURE_DELAY',
       'SCHEDULED_TIME', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DEPARTURE_TIME_HOUR',
       'DEPARTURE_TIME_MINUTE', 'SCHEDULED_ARRIVAL_HOUR',
       'SCHEDULED_ARRIVAL_MINUTE', 'AIRLINE_AA', 'AIRLINE_AS', 'AIRLINE_B6',
       'AIRLINE_DL', 'AIRLINE_EV', 'AIRLINE_F9', 'AIRLINE_HA', 'AIRLINE_MQ',
       'AIRLINE_NK', 'AIRLINE_OO', 'AIRLINE_UA', 'AIRLINE_US', 'AIRLINE_VX',
       'AIRLINE_WN'],
      dtype='object')

In [4]:
a.to_csv("../data/a.csv",index=False)


In [4]:
import pickle
import requests

def load_airport_encoder_from_git(url):
    """
    Charge l'encodeur des aéroports directement depuis l'URL donnée.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return pickle.loads(response.content)
    else:
        raise ValueError(f"Unable to fetch file. HTTP Status Code: {response.status_code}")

# URL de l'encodeur
airport_encoder_url = "https://dagshub.com/deb.gnuito/MLOPS/raw/main/notebooks/airport_encoder.pickle"

try:
    airport_encoder = load_airport_encoder_from_git(airport_encoder_url)
    print("Airport encoder loaded successfully from DAGsHub.")
    print(airport_encoder)
except ValueError as e:
    print(f"Error loading airport encoder: {e}")


Airport encoder loaded successfully from DAGsHub.
LabelEncoder()


In [5]:
airport_encoder2 = pickle.load(open("../notebooks/airport_encoder.pickle", "rb"))
print(airport_encoder2)


LabelEncoder()


In [7]:
import pandas as pd
df_airlines = pd.read_csv("../data/airlines.csv")
df_aiports = pd.read_csv("../data/airports.csv")

In [16]:
df_airlines.head(1)

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.


In [19]:
df_airlines["AIRLINE"].tolist()

['United Air Lines Inc.',
 'American Airlines Inc.',
 'US Airways Inc.',
 'Frontier Airlines Inc.',
 'JetBlue Airways',
 'Skywest Airlines Inc.',
 'Alaska Airlines Inc.',
 'Spirit Air Lines',
 'Southwest Airlines Co.',
 'Delta Air Lines Inc.',
 'Atlantic Southeast Airlines',
 'Hawaiian Airlines Inc.',
 'American Eagle Airlines Inc.',
 'Virgin America']

IATA_CODE=['UA',
 'AA',
 'US',
 'F9',
 'B6',
 'OO',
 'AS',
 'NK',
 'WN',
 'DL',
 'EV',
 'HA',
 'MQ',
 'VX'],

 AIRLINE:['United Air Lines Inc.',
 'American Airlines Inc.',
 'US Airways Inc.',
 'Frontier Airlines Inc.',
 'JetBlue Airways',
 'Skywest Airlines Inc.',
 'Alaska Airlines Inc.',
 'Spirit Air Lines',
 'Southwest Airlines Co.',
 'Delta Air Lines Inc.',
 'Atlantic Southeast Airlines',
 'Hawaiian Airlines Inc.',
 'American Eagle Airlines Inc.',
 'Virgin America']