In [49]:
import findspark
findspark.init()
findspark.find()

from pyspark import SparkContext

from pyspark.sql import SparkSession

from pyspark import SparkConf

import pyspark.sql as pysql

from pyspark import StorageLevel

In [50]:
import pyspark.ml as pyml
import pyspark.sql.functions as pysqlfun
import pyspark.sql.types as pysqltypes

In [51]:
import pathlib

# Introducción

Para la materia de Datos Masivos de la Maestría de Ciencias de Datos realicé este notebook de Jupyter el cual predice si tu vuelo será cancelado o no.

Los datos se obtuvieron de <https://www.kaggle.com/datasets/robikscube/flight-delay-dataset-20182022>; el conjunto de datos consiste en vuelos nacionales en Estados Unidos de 2018 a 2022.

In [52]:
# Spark configuration for this notebook: application name "Tarea", a local
# master string of "local[8]", and explicit executor/driver resource settings.
conf = (
    SparkConf()
    .setAppName("Tarea")
    .setMaster("local[8]")
    .setAll([
        ("spark.executor.cores", "2"),
        ("spark.executor.memory", "4g"),
        ("spark.driver.memory", "4g"),
    ])
)

In [53]:
# Obtain the session for this notebook (an already-active session is reused
# by getOrCreate), applying the SparkConf built above.
spark: SparkSession = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

# Inputs

In [54]:
# Flight to score. Edit these values and re-run the cells below.
distance = 5000.0  # flight distance in miles; must be greater than 0
year = 2024  # year of the flight
# NOTE(review): the introduction says the data covers 2018-2022, so this year
# is presumably outside the range the model was trained on -- confirm.
month = 11  # month of the flight (1 to 12)
day_of_month = 28  # day of the month of the flight
day_of_week = 5  # day of the week of the flight (1 = Sunday ... 7 = Saturday)
# NOTE(review): verify this weekday numbering against the training dataset.
airline = "United Air Lines Inc."  # airline name; see valid values in airlines.csv
tail_number = "N480HA"  # aircraft identification code; see df_tail_number.csv
origin = "LAX"  # origin airport code; see airports_origin.csv and/or df_origin.csv
dest = "JFK"  # destination airport code; see airports_dest.csv and/or df_dest.csv
origin_state_name = "California"  # origin state name; see df_origin_state.csv
dest_state_name = "New York"  # destination state name; see df_dest_state.csv


## Cargar los inputs a un DataFrame

In [55]:
# Single-row DataFrame holding the flight described by the input variables.
# The schema is spelled out explicitly (all fields nullable) so the column
# types do not depend on inference from the Python values.
df_input = spark.createDataFrame(
    data=[
        pysql.Row(
            Distance=distance,
            Year=year,
            Month=month,
            DayofMonth=day_of_month,
            DayOfWeek=day_of_week,
            Airline=airline,
            Tail_Number=tail_number,
            Origin=origin,
            Dest=dest,
            OriginStateName=origin_state_name,
            DestStateName=dest_state_name,
        )
    ],
    schema=(
        pysqltypes.StructType()
        .add("Distance", pysqltypes.FloatType(), True)
        .add("Year", pysqltypes.IntegerType(), True)
        .add("Month", pysqltypes.IntegerType(), True)
        .add("DayofMonth", pysqltypes.IntegerType(), True)
        .add("DayOfWeek", pysqltypes.IntegerType(), True)
        .add("Airline", pysqltypes.StringType(), True)
        .add("Tail_Number", pysqltypes.StringType(), True)
        .add("Origin", pysqltypes.StringType(), True)
        .add("Dest", pysqltypes.StringType(), True)
        .add("OriginStateName", pysqltypes.StringType(), True)
        .add("DestStateName", pysqltypes.StringType(), True)
    ),
)

# Preprocesamiento

In [56]:
def tail_number_column(df):
    """Left-join the tail-number lookup onto ``df`` and clean ``Tail_Number_Grupo``.

    Reads ``Tareas/df_tail_number.csv`` (relative to the current working
    directory, first row as header) and joins it to ``df`` on ``Tail_Number``.
    In the joined frame, ``Tail_Number_Grupo`` values that are null, ``"None"``
    or ``"Otros"`` are replaced with ``0``; other values are kept.

    Assumes the CSV supplies the ``Tail_Number_Grupo`` column -- TODO confirm.

    Args:
        df: Spark DataFrame containing a ``Tail_Number`` column.

    Returns:
        The joined DataFrame with the cleaned ``Tail_Number_Grupo`` column.
    """
    grupo = "Tail_Number_Grupo"

    ruta_csv = (pathlib.Path.cwd() / "Tareas" / "df_tail_number.csv").as_posix()
    tabla_grupos = spark.read.csv(path=ruta_csv, header=True)

    unido = df.join(other=tabla_grupos, on="Tail_Number", how="left")

    # Rows with no usable group: unmatched in the lookup, or explicitly
    # labelled "None" / "Otros".
    sin_grupo = (
        unido[grupo].isNull()
        | (unido[grupo] == "None")
        | (unido[grupo] == "Otros")
    )

    return unido.withColumn(
        grupo, pysqlfun.when(sin_grupo, 0).otherwise(unido[grupo])
    )

In [57]:
def origin_column(df):
    """Left-join the origin-airport lookup onto ``df`` and clean ``Origin_Grupo``.

    Reads ``Tareas/df_origin.csv`` (relative to the current working directory,
    first row as header) and joins it to ``df`` on ``Origin``. In the joined
    frame, ``Origin_Grupo`` values that are null, ``"None"`` or ``"Otros"`` are
    replaced with ``0``; other values are kept.

    Assumes the CSV supplies the ``Origin_Grupo`` column -- TODO confirm.

    Args:
        df: Spark DataFrame containing an ``Origin`` column.

    Returns:
        The joined DataFrame with the cleaned ``Origin_Grupo`` column.
    """
    grupo = "Origin_Grupo"

    ruta_csv = (pathlib.Path.cwd() / "Tareas" / "df_origin.csv").as_posix()
    tabla_grupos = spark.read.csv(path=ruta_csv, header=True)

    unido = df.join(other=tabla_grupos, on="Origin", how="left")

    # Rows with no usable group: unmatched in the lookup, or explicitly
    # labelled "None" / "Otros".
    sin_grupo = (
        unido[grupo].isNull()
        | (unido[grupo] == "None")
        | (unido[grupo] == "Otros")
    )

    return unido.withColumn(
        grupo, pysqlfun.when(sin_grupo, 0).otherwise(unido[grupo])
    )

In [58]:
def dest_column(df):
    """Left-join the destination-airport lookup onto ``df`` and clean ``Dest_Grupo``.

    Reads ``Tareas/df_dest.csv`` (relative to the current working directory,
    first row as header) and joins it to ``df`` on ``Dest``. In the joined
    frame, ``Dest_Grupo`` values that are null, ``"None"`` or ``"Otros"`` are
    replaced with ``0``; other values are kept.

    Assumes the CSV supplies the ``Dest_Grupo`` column -- TODO confirm.

    Args:
        df: Spark DataFrame containing a ``Dest`` column.

    Returns:
        The joined DataFrame with the cleaned ``Dest_Grupo`` column.
    """
    grupo = "Dest_Grupo"

    ruta_csv = (pathlib.Path.cwd() / "Tareas" / "df_dest.csv").as_posix()
    tabla_grupos = spark.read.csv(path=ruta_csv, header=True)

    unido = df.join(other=tabla_grupos, on="Dest", how="left")

    # Rows with no usable group: unmatched in the lookup, or explicitly
    # labelled "None" / "Otros".
    sin_grupo = (
        unido[grupo].isNull()
        | (unido[grupo] == "None")
        | (unido[grupo] == "Otros")
    )

    return unido.withColumn(
        grupo, pysqlfun.when(sin_grupo, 0).otherwise(unido[grupo])
    )

In [59]:
def origin_state_column(df):
    """Left-join the origin-state lookup onto ``df`` and clean ``OriginStateName_Grupo``.

    Reads ``Tareas/df_origin_state.csv`` (relative to the current working
    directory, first row as header) and joins it to ``df`` on
    ``OriginStateName``. In the joined frame, ``OriginStateName_Grupo`` values
    that are null, ``"None"`` or ``"Otros"`` are replaced with ``0``; other
    values are kept.

    Assumes the CSV supplies the ``OriginStateName_Grupo`` column -- TODO confirm.

    Args:
        df: Spark DataFrame containing an ``OriginStateName`` column.

    Returns:
        The joined DataFrame with the cleaned ``OriginStateName_Grupo`` column.
    """
    grupo = "OriginStateName_Grupo"

    ruta_csv = (pathlib.Path.cwd() / "Tareas" / "df_origin_state.csv").as_posix()
    tabla_grupos = spark.read.csv(path=ruta_csv, header=True)

    unido = df.join(other=tabla_grupos, on="OriginStateName", how="left")

    # Rows with no usable group: unmatched in the lookup, or explicitly
    # labelled "None" / "Otros".
    sin_grupo = (
        unido[grupo].isNull()
        | (unido[grupo] == "None")
        | (unido[grupo] == "Otros")
    )

    return unido.withColumn(
        grupo, pysqlfun.when(sin_grupo, 0).otherwise(unido[grupo])
    )

In [60]:
def dest_state_column(df):
    """Left-join the destination-state lookup onto ``df`` and clean ``DestStateName_Grupo``.

    Reads ``Tareas/df_dest_state.csv`` (relative to the current working
    directory, first row as header) and joins it to ``df`` on
    ``DestStateName``. In the joined frame, ``DestStateName_Grupo`` values that
    are null, ``"None"`` or ``"Otros"`` are replaced with ``0``; other values
    are kept.

    Assumes the CSV supplies the ``DestStateName_Grupo`` column -- TODO confirm.

    Args:
        df: Spark DataFrame containing a ``DestStateName`` column.

    Returns:
        The joined DataFrame with the cleaned ``DestStateName_Grupo`` column.
    """
    grupo = "DestStateName_Grupo"

    ruta_csv = (pathlib.Path.cwd() / "Tareas" / "df_dest_state.csv").as_posix()
    tabla_grupos = spark.read.csv(path=ruta_csv, header=True)

    unido = df.join(other=tabla_grupos, on="DestStateName", how="left")

    # Rows with no usable group: unmatched in the lookup, or explicitly
    # labelled "None" / "Otros".
    sin_grupo = (
        unido[grupo].isNull()
        | (unido[grupo] == "None")
        | (unido[grupo] == "Otros")
    )

    return unido.withColumn(
        grupo, pysqlfun.when(sin_grupo, 0).otherwise(unido[grupo])
    )

In [61]:
# Step 1: normalise the categorical inputs. A null or the literal string
# "None" is replaced with "Otros" before the lookup joins run.
df = df_input
for columna in ["Airline", "Tail_Number", "Origin", "Dest", "OriginStateName", "DestStateName"]:
    sin_valor = df[columna].isNull() | (df[columna] == "None")
    df = df.withColumn(columna, pysqlfun.when(sin_valor, "Otros").otherwise(df[columna]))

# Step 2: attach the *_Grupo columns through the lookup helpers defined above.
df = tail_number_column(df)
df = origin_column(df)
df = dest_column(df)
df = origin_state_column(df)
df = dest_state_column(df)

# Step 3: keep only the columns passed on to the indexer.
# "Origin_Grupo" is spelled exactly as origin_column() creates it; the previous
# "origin_Grupo" resolved only because Spark's column lookup is
# case-insensitive by default (spark.sql.caseSensitive=false).
df_seleccion = df.select([
    "Distance",
    "Year", "Month", "DayOfWeek", "DayofMonth", "Airline",
    "Tail_Number_Grupo", "Origin_Grupo", "Dest_Grupo",
    "OriginStateName_Grupo", "DestStateName_Grupo",
])

In [62]:
# Load the persisted StringIndexerModel from <cwd>/Tareas/indexer.
direccion = pathlib.Path.cwd()
path = (direccion / "Tareas" / "indexer").as_posix()
indexer = pyml.feature.StringIndexerModel.load(path)

In [63]:
# Apply the loaded string indexer to the selected columns.
df_indexado = indexer.transform(dataset=df_seleccion)

In [64]:
# Pack the numeric inputs and the indexed categorical columns into a single
# "features" vector column, in the order listed here.
# NOTE(review): presumably this order must match the one used when the model
# was trained -- confirm against the training notebook.
assembler = pyml.feature.VectorAssembler(
    inputCols=(
        ["Distance", "Year", "Month", "DayofMonth", "DayOfWeek"]
        + [
            "Airline_indexado",
            "Tail_Number_indexado",
            "Origin_indexado",
            "Dest_indexado",
            "OriginStateName_indexado",
            "DestStateName_indexado",
        ]
    ),
    outputCol="features",
)

df_with_features = assembler.transform(dataset=df_indexado)

In [65]:
# Load the persisted random-forest classifier from <cwd>/Tareas/RandomForest_model.
direccion = pathlib.Path.cwd()
path = (direccion / "Tareas" / "RandomForest_model").as_posix()
modelo = pyml.classification.RandomForestClassificationModel.load(path)

                                                                                

In [66]:
# Prediction: score the assembled feature row with the loaded model.
prediccion = modelo.transform(dataset=df_with_features)

In [69]:
# Bring only the probability vector and the predicted label to the driver
# as a pandas DataFrame.
df_result = prediccion.select("probability", "prediction").toPandas()

                                                                                

In [78]:
# Probability vectors for every row of the result (only the first is used).
probabilidad = df_result["probability"].to_list()

# Look up the probability of the predicted class for the first row.
# NOTE: after this statement `prediccion` holds that probability, not the
# class label -- the label itself is only used here as the index.
prediccion = probabilidad[0][int(df_result["prediction"].to_list()[0])]

In [80]:
# By this point `prediccion` holds the probability of the predicted class, so
# testing `prediccion == 1` would report "Normal" for practically every flight.
# Read the class label (and its probability) straight from the model output.
clase_predicha = int(df_result["prediction"].iloc[0])
probabilidad_clase = float(df_result["probability"].iloc[0][clase_predicha])

# Label 1 is treated as "cancelled", as in the original comparison.
resultado = "Cancelado" if clase_predicha == 1 else "Normal"

print(f"Proyección estado de tu vuelo: {resultado}, con una probabilidad de {100*probabilidad_clase:.2f}%")

Proyección estado de tu vuelo: Normal, con una probabilidad de 51.64%


In [81]:
# Shut down the Spark session now that the prediction has been reported.
spark.stop()