In [1]:
# Importing libraries
import pandas as pd
import requests
import io
import warnings
import time

# Silence warnings for the rest of the session.
# NOTE(review): "ignore" without a category filter hides every warning, including any
# raised by pandas while parsing the CSV files — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Download URL of each yearly dataset, keyed by year.
# The 2024 dataset is left out because it does not have complete information yet.
_dataset_root = 'https://datos.transporte.gob.ar/dataset/21038a1a-c3c7-4494-b76a-3a2a8fbb83b5/resource'

# (year, resource id, file name) of every yearly CSV published under the dataset root
_yearly_resources = (
    ('2019', 'aa8337de-3565-4ecf-9cd9-6f1c61f8f0ed', '2019_informe_ministerio.csv'),
    ('2020', 'd0e75e7d-e416-470f-bedb-ef2a877cbae3', '2020_informe_ministerio.csv'),
    ('2021', '1b4f569a-cab0-4560-993c-5bf96c3e7cf0', '202112_informe_ministerio.csv'),
    ('2022', '11894a35-de36-4579-b084-d1191f551fbe', '202212-informe-ministerio.csv'),
    ('2023', 'e910fead-ade3-40ce-ae8f-cad2017aa007', '202312-informe-ministerio-actualizado-dic.csv'),
)

urls_data = {
    year: f'{_dataset_root}/{resource_id}/download/{file_name}'
    for year, resource_id, file_name in _yearly_resources
}

In [3]:
# Download every yearly CSV and bind it to a module-level DataFrame named
# df_<year> (df_2019 ... df_2023), which the following cells reference by name.
REQUEST_TIMEOUT = 60  # seconds; without a timeout requests.get() can wait forever

for year, url in urls_data.items():
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # Connection failures and timeouts raise instead of returning a status
        # code; report them like any other failed download and try the next year.
        print(f'Error while download the dataset from {year}: {error}')
        continue

    if response.status_code == 200:
        # The dataset of 2021 has ',' as delimiter; the other years use ';'
        delimiter = ',' if year == '2021' else ';'
        # Parse the downloaded bytes straight from memory
        df = pd.read_csv(io.BytesIO(response.content), delimiter=delimiter)
        # Assign the dataset to the variable df_<year>. globals() states the
        # target namespace explicitly instead of relying on vars() resolving
        # to the module namespace.
        globals()[f'df_{year}'] = df
    else:
        print(f'Error while download the dataset from {year}. State Code: {response.status_code}')

In [4]:
# Inspect the column names, non-null counts and dtypes of the 2019 dataset
df_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 580774 entries, 0 to 580773
Data columns (total 12 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   Fecha                               580774 non-null  object
 1   Hora UTC                            580774 non-null  object
 2   Clase de vuelos (todos los vuelos)  580774 non-null  object
 3   Clasificacion Vuelo                 580774 non-null  object
 4   Tipo Movimiento                     580771 non-null  object
 5   Aeropuerto                          580774 non-null  object
 6   Origen/Destino                      580771 non-null  object
 7   Aerolinea Nombre                    398367 non-null  object
 8   Aeronave                            489113 non-null  object
 9   Pasajeros                           580774 non-null  int64 
 10  PAX                                 580774 non-null  int64 
 11  Calidad del dato                    580

In [5]:
# Preview the first 10 rows of the 2020 dataset
df_2020.head(10)

Unnamed: 0,Fecha,Hora UTC,Clase de Vuelo (todos los vuelos),Clasificación Vuelo,Tipo de Movimiento,Aeropuerto,Origen / Destino,Aerolinea Nombre,Aeronave,Pasajeros,PAX,Calidad dato
0,1/1/2020,00:06,Regular,Internacional,Aterrizaje,EZE,LEMD,IBERIA - LINEAS AÉREAS DE ESPAÑA,0,239,239,DEFINITIVO
1,1/1/2020,00:08,Regular,Internacional,Despegue,EZE,SCEL,LAN ARGENTINA S.A. (LATAM AIRLINES),0,152,152,DEFINITIVO
2,1/1/2020,00:10,Regular,Doméstico,Aterrizaje,PAL,BAR,JETSMART AIRLINES S.A.,0,116,58,DEFINITIVO
3,1/1/2020,00:13,Regular,Internacional,Despegue,EZE,KDFW,AMERICAN AIRLINES INC.,0,255,255,DEFINITIVO
4,1/1/2020,00:13,Regular,Doméstico,Aterrizaje,PAL,DOZ,JETSMART AIRLINES S.A.,0,146,73,DEFINITIVO
5,1/1/2020,00:14,Regular,Doméstico,Aterrizaje,EZE,TUC,AUSTRAL LINEAS AEREAS-CIELOS DEL SUR S.A,0,74,37,DEFINITIVO
6,1/1/2020,00:15,Regular,Internacional,Aterrizaje,AER,SUMU,AUSTRAL LINEAS AEREAS-CIELOS DEL SUR S.A,0,20,20,DEFINITIVO
7,1/1/2020,00:18,Regular,Doméstico,Aterrizaje,PAL,TUC,FB LÍNEAS AÉREAS - FLYBONDI,0,194,97,DEFINITIVO
8,1/1/2020,00:20,Regular,Internacional,Despegue,EZE,KATL,DELTA AIRLINES,0,210,210,DEFINITIVO
9,1/1/2020,00:22,Regular,Doméstico,Aterrizaje,PAL,IGU,FB LÍNEAS AÉREAS - FLYBONDI,0,182,91,DEFINITIVO


Feature Engineering

In [6]:
# Collect all yearly datasets in one list so they can be processed together
data = [df_2019, df_2020, df_2021, df_2022, df_2023]

# Check whether the datasets use the same column names, position by position.
# zip() walks the columns of every dataset in parallel, so the number of
# columns is not hard-coded; it stops at the shortest dataset, so a dataset
# with fewer columns cannot raise an IndexError.
for names_at_position in zip(*(dataset.columns for dataset in data)):
    # Distinct names found at this position across all datasets
    columns_unique = set(names_at_position)
    # Report only the positions where the datasets disagree
    if len(columns_unique) > 1:
        print(f'{len(columns_unique)}: {columns_unique}')
    print('*************************************')

2: {'Fecha', 'Fecha UTC'}
*************************************
*************************************
2: {'Clase de Vuelo (todos los vuelos)', 'Clase de vuelos (todos los vuelos)'}
*************************************
2: {'Clasificación Vuelo', 'Clasificacion Vuelo'}
*************************************
2: {'Tipo de Movimiento', 'Tipo Movimiento'}
*************************************
*************************************
2: {'Origen/Destino', 'Origen / Destino'}
*************************************
*************************************
*************************************
*************************************
*************************************
2: {'Calidad dato', 'Calidad del dato'}
*************************************


In [7]:
# Column-name variants mapped to the single name used from here on
column_aliases = {
    'Fecha UTC': 'Fecha',
    'Hora UTC': 'Hora',
    'Clase de Vuelo (todos los vuelos)': 'Clase de Vuelo',
    'Clase de vuelos (todos los vuelos)': 'Clase de Vuelo',
    'Clasificacion Vuelo': 'Clasificación Vuelo',
    'Tipo de Movimiento': 'Tipo Movimiento',
    'Origen / Destino': 'Origen/Destino',
    'Calidad del dato': 'Calidad dato'
}

# Rename in place, so each DataFrame held in `data` is modified directly
for dataset in data:
    dataset.rename(columns=column_aliases, inplace=True)

In [8]:
# Check again whether the datasets use the same column names, position by position.
# zip() walks the columns of every dataset in parallel, so the number of
# columns is not hard-coded; it stops at the shortest dataset, so a dataset
# with fewer columns cannot raise an IndexError.
for names_at_position in zip(*(dataset.columns for dataset in data)):
    # Distinct names found at this position across all datasets
    columns_unique = set(names_at_position)
    # Report only the positions where the datasets disagree
    if len(columns_unique) > 1:
        print(f'{len(columns_unique)}: {columns_unique}')
    print('*************************************')

*************************************
*************************************
*************************************
*************************************
*************************************
*************************************
*************************************
*************************************
*************************************
*************************************
*************************************
*************************************


In [9]:
# Verify that every dataset has the same number of columns
number_columns = [len(frame.columns) for frame in data]

print(number_columns)

[12, 12, 12, 12, 12]


In [10]:
# Merge all the yearly datasets into a single DataFrame 'df';
# ignore_index=True discards the per-year indexes and builds one continuous index.
# NOTE(review): the df.info() output of the merged frame reports 'PAX' as object,
# while the df_2019.info() output shows it as int64 — presumably at least one year
# stores PAX as text; confirm and convert before doing arithmetic on it.
df = pd.concat(data, ignore_index=True)

In [11]:
# Inspect the column names and dtypes of the merged dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2155873 entries, 0 to 2155872
Data columns (total 12 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   Fecha                object
 1   Hora                 object
 2   Clase de Vuelo       object
 3   Clasificación Vuelo  object
 4   Tipo Movimiento      object
 5   Aeropuerto           object
 6   Origen/Destino       object
 7   Aerolinea Nombre     object
 8   Aeronave             object
 9   Pasajeros            int64 
 10  PAX                  object
 11  Calidad dato         object
dtypes: int64(1), object(11)
memory usage: 197.4+ MB


In [12]:
# Delete column 'Aeronave' (currently disabled: the statement below is commented out, so the column is kept)
# del(df['Aeronave'])

In [13]:
# Save the merged dataset to a CSV file as a checkpoint of this stage;
# index=False leaves the row index out of the file.
df.to_csv('Data_FeatureFinish.csv', index=False)