In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.impute import KNNImputer
import requests
import os
import datetime

In [101]:
import io

class labels:
    """Build per-day "flyable" labels from WeGlide flights at selected airports.

    Intended call order: ``select_airports`` -> ``fetch_flights`` ->
    ``normalize_date`` -> ``is_flyable`` -> ``final_data``.
    """

    # Columns kept from the WeGlide CSV export.
    _FLIGHT_COLUMNS = ['Distance', 'Speed', 'Date', 'ID']
    # Text layout produced by normalize_date, e.g. '2024-09-28_00:00:00'.
    _DATE_FORMAT = '%Y-%m-%d_%H:%M:%S'

    def __init__(self, year:list, airport_file:str):
        """
        Args:
            year (list): Seasons (years) to download and to cover in the final calendar.
            airport_file (str): Path to a CSV file of airports; it must contain an 'Id' column.
        """
        self.year = year
        self.airport_file = airport_file
        self.airport = pd.read_csv(airport_file)
        # Filled by select_airports(); fetch_flights() refuses to run while this is None.
        self.selected_airports = None

    def select_airports(self, airport_ids:list):
        """
        Selects airports from the airport file based on given airport IDs.

        IDs that are not present in the airport file are silently dropped, and
        the result follows the row order of the airport file, not the order of
        ``airport_ids``.

        Args:
            airport_ids (list): List of airport IDs to select.

        Returns:
            list: IDs of the selected airports (also stored in ``self.selected_airports``).
        """
        is_selected = self.airport['Id'].isin(airport_ids)
        self.selected_airports = self.airport.loc[is_selected, 'Id'].tolist()
        return self.selected_airports

    @staticmethod
    def _parse_dates(dates):
        """Parse a date column that may hold datetimes, plain date strings or
        strings already written by normalize_date ('YYYY-MM-DD_HH:MM:SS')."""
        if not pd.api.types.is_datetime64_any_dtype(dates):
            # Undo the '_' separator so already-normalized text parses again.
            dates = dates.map(
                lambda value: value.replace('_', ' ') if isinstance(value, str) else value
            )
        return pd.to_datetime(dates)

    def _fetch_page(self, year, airport_id, skip, limit, timeout):
        """Download one page of flights for one airport and season.

        Returns:
            pandas.DataFrame | None: Parsed CSV page, or None when the response body is empty.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        url = (
            "https://api.weglide.org/v1/flight?"
            f"season_in={year}&"
            f"airport_id_in={airport_id}&"
            "contest=free&"
            "order_by=-scoring_date&"
            "not_scored=false&"
            "story=false&"
            "valid=false&"
            f"skip={skip}&"
            f"limit={limit}&"
            "include_story=true&"
            "include_stats=false&"
            "format=csv"
        )
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, timeout=timeout)
        # Fail here rather than trying to parse an error payload as CSV.
        response.raise_for_status()
        if not response.text.strip():
            # pd.read_csv raises EmptyDataError on an empty body.
            return None
        return pd.read_csv(io.StringIO(response.text))

    def fetch_flights(self, page_size: int = 100, timeout: float = 30.0):
        """
        Fetches the flights of every selected airport for every season in
        ``self.year`` from the WeGlide API.

        Results are requested page by page (``skip``/``limit``) until a short or
        empty page is returned, so airports with more than ``page_size`` flights
        in a season are no longer truncated to the first page.

        Args:
            page_size (int): Rows requested per call. Keep it at or below the
                server's maximum page size (presumably 100 -- TODO confirm),
                otherwise the first capped page looks like the last one.
            timeout (float): Per-request timeout in seconds.

        Returns:
            pandas.DataFrame: Columns 'Distance', 'Speed', 'Date', 'ID', one row
            per unique flight ID. Empty (but with these columns) if nothing was found.

        Raises:
            ValueError: If no airports were selected or ``page_size`` is not positive.
            requests.HTTPError: If a request returns an error status.
        """
        if self.selected_airports is None:
            raise ValueError("No airports selected; call select_airports() before fetch_flights().")
        if page_size < 1:
            raise ValueError("page_size must be a positive integer.")

        # Collect the pages and concatenate once instead of growing a frame in the loop.
        pages = []
        for year in self.year:
            print(year)
            for airport_id in self.selected_airports:
                seen_ids = set()
                skip = 0
                while True:
                    page = self._fetch_page(year, airport_id, skip, page_size, timeout)
                    if page is None or page.empty:
                        break
                    page = page[self._FLIGHT_COLUMNS]
                    new_ids = set(page['ID']) - seen_ids
                    if not new_ids:
                        # The same rows came back again (e.g. 'skip' was ignored):
                        # stop instead of looping forever.
                        break
                    seen_ids |= new_ids
                    pages.append(page)
                    if len(page) < page_size:
                        # Short page: this was the last one.
                        break
                    skip += page_size

        if not pages:
            return pd.DataFrame(columns=self._FLIGHT_COLUMNS)
        flights = pd.concat(pages, ignore_index=True)
        # Pages can overlap if the listing changes between requests; keep each flight once.
        return flights.drop_duplicates(subset='ID').reset_index(drop=True)

    def normalize_date(self, flights):
        """
        Normalize the 'Date' column to YYYY-MM-DD_HH:MM:SS text.

        The column is rewritten in place and the same DataFrame is returned.
        Dates without a time component get 00:00:00. Calling this again on an
        already-normalized frame is a no-op.

        Args:
            flights (pandas.DataFrame): DataFrame with a 'Date' column.
        Returns:
            pandas.DataFrame: The same DataFrame object, with normalized dates.
        """
        flights['Date'] = self._parse_dates(flights['Date']).dt.strftime(self._DATE_FORMAT)
        return flights

    def is_flyable(self, flights, min_distance=50):
        """
        Count, for each calendar day, the flights longer than ``min_distance``.

        A day can be considered flyable when its count is at least 1. Days
        whose flights are all at or below the threshold are returned with a
        count of 0; days without any flight are absent.

        Args:
            flights (pandas.DataFrame): Flights with 'Date' and 'Distance'
                columns. 'Date' may be datetimes or text (normalized or not);
                'Distance' may be numeric or text using ',' as decimal separator.
            min_distance (float): Threshold a flight must exceed, in the units
                of 'Distance' (km according to the original author).
        Returns:
            pandas.DataFrame: Columns 'Date' (datetime.date) and 'Flyable' (int count).
        """
        # Work on a copy so the caller's frame is left untouched.
        flights_copy = flights.copy()
        flights_copy['Date'] = self._parse_dates(flights_copy['Date'])

        distance = flights_copy['Distance']
        if not pd.api.types.is_numeric_dtype(distance):
            # Only string entries need the decimal comma swapped; numeric entries
            # in a mixed column pass through instead of turning into NaN.
            distance = distance.map(
                lambda value: value.replace(',', '.') if isinstance(value, str) else value
            ).astype(float)
        flights_copy['Distance'] = distance

        # Missing distances compare as False and therefore count as not flyable.
        flights_copy['Flyable'] = (flights_copy['Distance'] > min_distance).astype(int)

        flights_per_day = (
            flights_copy
            .groupby(flights_copy['Date'].dt.date)
            .agg({'Flyable': 'sum'})
            .reset_index()
        )
        flights_per_day.columns = ['Date', 'Flyable']

        return flights_per_day

    def final_data(self, flights_per_day):
        """
        Expand the per-day counts to a full calendar.

        The calendar runs from 1 January of ``min(self.year)`` to 31 December of
        ``max(self.year)``. Days present in ``flights_per_day`` keep their count,
        all other days get 0.

        Args:
            flights_per_day (pandas.DataFrame): Columns 'Date' and 'Flyable',
                one row per day (as returned by ``is_flyable``).
        Returns:
            pandas.DataFrame: Columns 'Date' ('YYYY-MM-DD' text) and 'Flyable' (int).
        """
        start_year = min(self.year)
        end_year = max(self.year)
        date_range = pd.date_range(start=f"{start_year}-01-01", end=f"{end_year}-12-31", freq='D')

        all_dates_df = pd.DataFrame(date_range, columns=['Date'])
        all_dates_df['Date'] = all_dates_df['Date'].dt.strftime('%Y-%m-%d')

        # Bring both sides to the same text key before merging.
        flights_per_day_copy = flights_per_day.copy()
        flights_per_day_copy['Date'] = pd.to_datetime(flights_per_day_copy['Date']).dt.strftime('%Y-%m-%d')

        final_df = pd.merge(all_dates_df, flights_per_day_copy, on='Date', how='left')

        # Days without any recorded flight come out of the left join as NaN.
        final_df['Flyable'] = final_df['Flyable'].fillna(0).astype(int)

        return final_df
            
# Build the label generator for the 2018-2024 seasons from the local airport table.
# NOTE: this rebinds the name `labels` from the class to the instance, so the
# class itself is no longer reachable under that name after this line.
labels = labels(
    year=[2024, 2023, 2022, 2021, 2020, 2019, 2018],
    airport_file='/Users/baptistecaillerie/Documents/Soaring AI/data/airports.csv',
)


In [92]:
# Restrict the download to five airports. The displayed list follows the row
# order of the airport file, not the order of the IDs given here.
labels.select_airports(
    airport_ids=[150181, 151558, 151405, 151237, 150993]
)

[150181, 151558, 151405, 150993, 151237]

In [93]:
# Download the flights of every selected airport and season from the WeGlide API
# (each season is printed as it is processed).
flights = labels.fetch_flights()
# normalize_date rewrites flights['Date'] in place, so `flights` itself now holds
# 'YYYY-MM-DD_HH:MM:SS' text; the returned frame is shown as the cell output.
labels.normalize_date(flights)

2024
2023
2022
2021
2020
2019
2018


Unnamed: 0,Distance,Speed,Date,ID
0,10143,4736,2024-09-28_00:00:00,484226
1,7469,4643,2024-09-28_00:00:00,484337
2,14657,6382,2024-09-19_00:00:00,481932
3,5021,2117,2024-09-14_00:00:00,480764
4,587,4058,2024-09-14_00:00:00,480668
...,...,...,...,...
2367,51544,7264,2018-06-29_00:00:00,64020
2368,58699,7935,2018-06-13_00:00:00,64021
2369,42795,8367,2018-05-16_00:00:00,64022
2370,62542,7529,2018-05-13_00:00:00,64023


In [103]:
# Per-day count of flights above the distance threshold.
flights_per_day = labels.is_flyable(flights)

# Expand to a full calendar, then normalize the dates. normalize_date returns the
# frame it was given, so `final` is the calendar frame with rewritten dates.
final = labels.normalize_date(labels.final_data(flights_per_day))

# Persist the labels.
final.to_csv(
    '/Users/baptistecaillerie/Documents/Soaring AI/data/flights.csv',
    index=False,
)