In [13]:
import pandas as pd
import numpy as np


In [14]:
# Path of the aircraft-model CSV
arquivo_modelos = './airplane/airplanedat.csv'

# The model CSV is read with header=None, so its three columns are named explicitly here
df_modelos = pd.read_csv(
    arquivo_modelos,
    names=["Modelo", "Coluna2", "Coluna3"],
    header=None,
)
# The flights CSV is read with pandas' default header handling
df_flights = pd.read_csv('viagens/flights_sample_3m_ratings.csv')

In [15]:
# Extract the manufacturer name from a model string
def extrair_fabricante(modelo, fabricantes_compostos=(
        "McDonnell Douglas",
        "De Havilland",
        "British Aerospace",
        "Fairchild Dornier",
)):
    """Return the manufacturer part of an aircraft model description.

    Double quotes are removed and surrounding whitespace is ignored. If the
    cleaned text starts with one of ``fabricantes_compostos`` (followed by a
    space or the end of the text), that multi-word name is returned;
    otherwise the first whitespace-separated word is returned.

    Parameters
    ----------
    modelo : object
        Model description; converted with ``str()`` before parsing.
    fabricantes_compostos : iterable of str, optional
        Manufacturer names made of more than one word. The defaults are the
        multi-word keys used by the usage table in this notebook, so that the
        extracted value can match those keys in the merge. Pass ``()`` to get
        plain first-word extraction.

    Returns
    -------
    str
        The manufacturer name, or ``""`` when ``modelo`` is missing
        (``None``/NaN), empty, or contains only whitespace/quotes.
    """
    # Missing values would otherwise turn into the literal strings "None"/"nan".
    if modelo is None or (isinstance(modelo, float) and modelo != modelo):
        return ""

    palavras = str(modelo).replace('"', '').split()
    # An empty or blank description has no first word (previously an IndexError).
    if not palavras:
        return ""

    # Collapse runs of whitespace so prefix matching is not defeated by spacing.
    normalizado = " ".join(palavras)
    # Longest names first, so a longer known name wins over a shorter prefix of it.
    for nome in sorted(fabricantes_compostos, key=len, reverse=True):
        if normalizado == nome or normalizado.startswith(nome + " "):
            return nome

    return palavras[0]

# Build the "Fabricante" column by passing every "Modelo" value through extrair_fabricante
df_modelos["Fabricante"] = df_modelos["Modelo"].map(extrair_fabricante)

# Show the first 10 rows
df_modelos.head(10)

Unnamed: 0,Modelo,Coluna2,Coluna3,Fabricante
0,Aerospatiale (Nord) 262,ND2,N262,Aerospatiale
1,Aerospatiale (Sud Aviation) Se.210 Caravelle,CRV,S210,Aerospatiale
2,Aerospatiale SN.601 Corvette,NDC,S601,Aerospatiale
3,Aerospatiale/Alenia ATR 42-300,AT4,AT43,Aerospatiale/Alenia
4,Aerospatiale/Alenia ATR 42-500,AT5,AT45,Aerospatiale/Alenia
5,Aerospatiale/Alenia ATR 42-600,ATR,AT46,Aerospatiale/Alenia
6,Aerospatiale/Alenia ATR 72,AT7,AT72,Aerospatiale/Alenia
7,Airbus A300,AB3,A30B,Airbus
8,Airbus A300-600,AB6,A306,Airbus
9,Airbus A300-600ST Super Transporter / Beluga,ABB,A3ST,Airbus


In [16]:
# Percentage table keyed by manufacturer name.
# Values are either a number, the string "<1", or the string "N/A".
dados_percentagem = {
    # Entries with an explicit numeric value of at least 1
    "Boeing": 35,
    "Airbus": 32,
    "Embraer": 8,
    "Bombardier": 6,
    "McDonnell Douglas": 5,
    "ATR": 3,
    "De Havilland": 2,
    "Fokker": 1,
    "Tupolev": 1,
    # Entries recorded as the string "<1"
    **dict.fromkeys(
        (
            "Ilyushin", "Sukhoi", "COMAC", "Antonov", "Yakovlev",
            "British Aerospace", "BAe", "Avro", "Lockheed", "Fairchild Dornier",
            "Saab", "Harbin", "Partenavia", "Pilatus", "Piper",
            "Cessna", "Beechcraft", "Hawker", "Learjet", "Aerospatiale",
        ),
        "<1",
    ),
    # Entries recorded as numeric zero
    **dict.fromkeys(("Concorde", "NAMC"), 0),
    # Entries recorded as the string "N/A"
    **dict.fromkeys(("Sikorsky", "Bell"), "N/A"),
}

In [17]:
# Two-column frame with one row per entry of dados_percentagem, in dictionary order
df_percentagem = pd.DataFrame(
    {
        "Fabricante": list(dados_percentagem.keys()),
        "Percentagem de Uso Comercial (%)": list(dados_percentagem.values()),
    }
)

In [18]:
# Left-join the percentage table onto the model frame through the shared "Fabricante" column
df_final = df_modelos.merge(df_percentagem, how="left", on="Fabricante")

In [19]:
# Remove the two auxiliary columns. This cell rebinds df_final to its own
# result, so a second execution would look for columns that are already gone;
# errors="ignore" makes the re-run a no-op instead of a KeyError.
df_final = df_final.drop(columns=["Coluna2", "Coluna3"], errors="ignore")


In [20]:
# Sequential identifier starting at 1, one value per row of df_final
df_final["id"] = range(1, df_final.shape[0] + 1)


In [21]:
# Second name for the same DataFrame object as df_final (an alias, not a copy)
df_avioes = df_final


In [22]:
def percentagem_valida(valor):
    """Tell whether ``valor`` is a usable numeric percentage of at least 1.

    The value is converted with ``str()``, any ``%`` characters are removed and
    surrounding whitespace is stripped before parsing.

    Parameters
    ----------
    valor : object
        Raw percentage entry, e.g. ``35``, ``"32%"``, ``"<1"`` or ``"N/A"``.

    Returns
    -------
    bool
        ``True`` when the cleaned text parses as a float that is ``>= 1``.
        ``False`` when it contains ``<`` or ``N/A``, cannot be parsed as a
        number, or parses to a value below 1 (NaN also compares as ``False``).
    """
    valor_str = str(valor).replace('%', '').strip()
    # Markers for "below one percent" and "not available" are never valid.
    if '<' in valor_str or 'N/A' in valor_str:
        return False
    try:
        return float(valor_str) >= 1
    except ValueError:
        # Only a failed numeric parse is treated as "invalid"; unrelated
        # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        return False


In [23]:
# 2. Keep only the aircraft whose percentage passes percentagem_valida
#    (values such as 'N/A' or '<1' are discarded); the result is an independent copy
df_avioes_filtrado = df_avioes.loc[
    df_avioes["Percentagem de Uso Comercial (%)"].map(percentagem_valida)
].copy()
df_avioes_filtrado.head(10)



Unnamed: 0,Modelo,Fabricante,Percentagem de Uso Comercial (%),id
7,Airbus A300,Airbus,32,8
8,Airbus A300-600,Airbus,32,9
9,Airbus A300-600ST Super Transporter / Beluga,Airbus,32,10
10,Airbus A310,Airbus,32,11
11,Airbus A318,Airbus,32,12
12,Airbus A319,Airbus,32,13
13,Airbus A319neo,Airbus,32,14
14,Airbus A320,Airbus,32,15
15,Airbus A320neo,Airbus,32,16
16,Airbus A321,Airbus,32,17


In [24]:
# Fixed seed so that re-running the notebook reproduces the same assignment
# (and therefore the same exported CSV).
SEED = 42
rng = np.random.default_rng(SEED)

# 3. Convert the percentage to a number
df_avioes_filtrado["Peso"] = df_avioes_filtrado["Percentagem de Uso Comercial (%)"].astype(float)

# 4. Normalise the weights so they sum to 1 (probability distribution).
#    The percentage is defined per manufacturer but repeated on every model
#    row, so each manufacturer's share is first divided evenly among its
#    models. Without this, a manufacturer's sampled share would scale with
#    how many models it has in the catalogue instead of following the table.
modelos_por_fabricante = df_avioes_filtrado["Fabricante"].map(
    df_avioes_filtrado["Fabricante"].value_counts()
)
peso_por_modelo = df_avioes_filtrado["Peso"] / modelos_por_fabricante
df_avioes_filtrado["Peso Normalizado"] = peso_por_modelo / peso_por_modelo.sum()

# 5. Draw one model per flight according to those probabilities
avioes_amostrados = rng.choice(
    df_avioes_filtrado["Modelo"].to_numpy(),
    size=len(df_flights),
    p=df_avioes_filtrado["Peso Normalizado"].to_numpy(),
)

# 6. Attach the sampled models to the flights dataframe
df_flights["Modelo Avião"] = avioes_amostrados

# 7. Save the result
df_flights.to_csv("viagens/flights_ratings_modelos.csv", index=False)
df_flights.head(10)

Unnamed: 0,FL_DATE,AIRLINE,AIRLINE_DOT,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,...,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT,Rating,Modelo Avião
0,2019-01-09,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,1562,FLL,"Fort Lauderdale, FL",EWR,"Newark, NJ",...,176.0,153.0,1065.0,,,,,,2.4,Airbus A320
1,2022-11-19,Delta Air Lines Inc.,Delta Air Lines Inc.: DL,DL,19790,1149,MSP,"Minneapolis, MN",SEA,"Seattle, WA",...,236.0,189.0,1399.0,,,,,,7.1,Airbus A300
2,2022-07-22,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,459,DEN,"Denver, CO",MSP,"Minneapolis, MN",...,112.0,87.0,680.0,,,,,,4.2,Airbus A340
3,2023-03-06,Delta Air Lines Inc.,Delta Air Lines Inc.: DL,DL,19790,2295,MSP,"Minneapolis, MN",SFO,"San Francisco, CA",...,285.0,249.0,1589.0,0.0,0.0,24.0,0.0,0.0,3.3,Boeing 767-200
4,2020-02-23,Spirit Air Lines,Spirit Air Lines: NK,NK,20416,407,MCO,"Orlando, FL",DFW,"Dallas/Fort Worth, TX",...,182.0,153.0,985.0,,,,,,0.5,Airbus A330-200
5,2019-07-31,Southwest Airlines Co.,Southwest Airlines Co.: WN,WN,19393,665,DAL,"Dallas, TX",OKC,"Oklahoma City, OK",...,54.0,36.0,181.0,141.0,0.0,0.0,0.0,0.0,4.8,Boeing 747-400
6,2023-06-11,American Airlines Inc.,American Airlines Inc.: AA,AA,19805,2134,DCA,"Washington, DC",BOS,"Boston, MA",...,89.0,58.0,399.0,,,,,,5.5,Boeing 787-10
7,2019-07-08,Republic Airline,Republic Airline: YX,YX,20452,4464,HSV,"Huntsville, AL",DCA,"Washington, DC",...,151.0,88.0,613.0,0.0,0.0,23.0,0.0,0.0,3.3,Boeing 727
8,2023-02-12,Spirit Air Lines,Spirit Air Lines: NK,NK,20416,590,IAH,"Houston, TX",LAX,"Los Angeles, CA",...,219.0,200.0,1379.0,,,,,,2.2,Airbus A330
9,2020-08-22,Alaska Airlines Inc.,Alaska Airlines Inc.: AS,AS,19930,223,SEA,"Seattle, WA",FAI,"Fairbanks, AK",...,220.0,198.0,1533.0,,,,,,1.8,Airbus A330-200
