In [1]:
import findspark
findspark.init()
findspark.find()

from pyspark import SparkContext

from pyspark.sql import SparkSession

from pyspark import SparkConf

import pyspark.sql as pysql

from pyspark import StorageLevel

import pyspark.ml as pyml

In [2]:
import os
import sys

In [3]:
conf = SparkConf().setAppName("Tarea") \
    .setMaster("local[8]") \
    .set("spark.executor.cores","2") \
    .set("spark.executor.memory", "4g") \
    .set("spark.driver.memory","4g")

In [4]:
spark : SparkSession = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/28 01:05:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
columnas = [
    'Airline', # Aerolinea
    'CRSDepTime', #Hora de salida programada
    'DepTime', #Hora de salida actual
    'DepDelay', #Diferencia en minutos entre CRSDepTime y DepTime, tiempos negativos significa salidas tempranas
    'CRSArrTime',
    'ArrTime', #Hora de llegada
    'ArrDelay', #Diferencia de minutos entre la llegada programada y la hora de llegada, tiempos negativos significa llegadas tempranas
    'ActualElapsedTime', #Tiempo de vuelo verdadero
    'CRSElapsedTime', #Tiempo de vuelo programado
    'Distance', #Distancia entre aeropuertos en millas
    'Year', #Año del vuelo
    'Month', #Mes del vuelo
    'DayofMonth', #Dia del mes del vuelo
    'DayOfWeek', #Dia de la semana del vuelo
    'Tail_Number', #Numero de la cola, código para identificar la aeronave
    'Flight_Number_Operating_Airline', #Numero del vuelo
    
    'Origin', #Aeropuerto de origen
    'OriginAirportID', 'OriginAirportSeqID', #codigos del aeropuerto de origen
    'OriginCityName', #Nombre de la ciudad del aeropuerto de origen
    'OriginStateName', #Nombre de estado del aeropuerto de origen
    
    'Dest', #Aeropuerto de destino
    'DestAirportID', 'DestAirportSeqID', #codigos del aeropuerto de destino
    'DestCityName', #Nombre de la ciudad del aeropuerto de destino
    'DestStateName', #Nombre de estado del aeropuerto de destino
    
    'Cancelled', #El vuelo fue cancelado, 1 = Sí
    
    'Diverted', #Si el vuelo fue desviado, 1 = Sí
]

In [6]:
import zipfile
import pathlib

direccion = pathlib.Path.cwd().parent.parent

path_datos_zip = pathlib.Path.joinpath(direccion, 'Datos', 'data.zip')

path_guardar = pathlib.Path.joinpath(direccion, 'Datos')

archivos_datos = [
    pathlib.Path.joinpath(direccion, 'Datos', 'Combined_Flights_2022.csv').as_posix(),
    pathlib.Path.joinpath(direccion, 'Datos', 'Combined_Flights_2021.csv').as_posix(),
    pathlib.Path.joinpath(direccion, 'Datos', 'Combined_Flights_2020.csv').as_posix(),
    pathlib.Path.joinpath(direccion, 'Datos', 'Combined_Flights_2019.csv').as_posix(),
    pathlib.Path.joinpath(direccion, 'Datos', 'Combined_Flights_2018.csv').as_posix()
]

archivos_datos_en_zip  = [
    'Combined_Flights_2022.csv',
    'Combined_Flights_2021.csv',
    'Combined_Flights_2020.csv',
    'Combined_Flights_2019.csv',
    'Combined_Flights_2018.csv'
]

In [7]:
with zipfile.ZipFile(path_datos_zip, mode="r") as archivo_zip:
    for archivo in archivos_datos_en_zip:
        
        if os.path.isfile(path=pathlib.Path.joinpath(path_guardar,archivo).as_posix()):
            continue
        
        archivo_zip.extract(archivo, path_guardar)

In [8]:
df = spark.read.option("header","true").csv(archivos_datos)
df = df.select(columnas)

                                                                                

In [9]:
df = df.persist(StorageLevel.DISK_ONLY)

24/10/28 01:05:45 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [10]:
#schema del dataframe
df = df.withColumn("CRSDepTime", df["CRSDepTime"].cast(pysql.types.FloatType()))
df = df.withColumn("DepTime", df["DepTime"].cast(pysql.types.FloatType()))
df = df.withColumn("DepDelay", df["DepDelay"].cast(pysql.types.FloatType()))
df = df.withColumn("ArrTime", df["ArrTime"].cast(pysql.types.FloatType()))
df = df.withColumn("ArrDelay", df["ArrDelay"].cast(pysql.types.FloatType()))
df = df.withColumn("ActualElapsedTime", df["ActualElapsedTime"].cast(pysql.types.FloatType()))
df = df.withColumn("CRSElapsedTime", df["CRSElapsedTime"].cast(pysql.types.FloatType()))
df = df.withColumn("Distance", df["Distance"].cast(pysql.types.FloatType()))
df = df.withColumn("Year", df["Year"].cast(pysql.types.IntegerType()))
df = df.withColumn("Month", df["Month"].cast(pysql.types.IntegerType()))
df = df.withColumn("DayofMonth", df["DayofMonth"].cast(pysql.types.IntegerType()))
df = df.withColumn("DayOfWeek", df["DayOfWeek"].cast(pysql.types.IntegerType()))

df = df.withColumn("Cancelled", pysql.functions.when(df["Cancelled"] == "False",0).otherwise(1))
df = df.withColumn("Diverted", pysql.functions.when(df["Diverted"] == "False",0).otherwise(1))

In [11]:
df.printSchema()

root
 |-- Airline: string (nullable = true)
 |-- CRSDepTime: float (nullable = true)
 |-- DepTime: float (nullable = true)
 |-- DepDelay: float (nullable = true)
 |-- CRSArrTime: string (nullable = true)
 |-- ArrTime: float (nullable = true)
 |-- ArrDelay: float (nullable = true)
 |-- ActualElapsedTime: float (nullable = true)
 |-- CRSElapsedTime: float (nullable = true)
 |-- Distance: float (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Tail_Number: string (nullable = true)
 |-- Flight_Number_Operating_Airline: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- OriginAirportID: string (nullable = true)
 |-- OriginAirportSeqID: string (nullable = true)
 |-- OriginCityName: string (nullable = true)
 |-- OriginStateName: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- DestAirportID: string (nullable = true)
 |-- DestAir

In [12]:
df.select("Airline").distinct().count() #cantidad de aerolineas

24/10/28 01:05:46 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

28

In [13]:
indexer_airline = pyml.feature.StringIndexer(
    inputCols=["Airline","OriginAirportID","DestAirportID"],
    outputCols=["Airline_indexado","OriginAirportID_indexado","DestAirportID_indexado"])

In [14]:
df_indexado = indexer_airline.fit(df).transform(df)

                                                                                

In [15]:
encoder_airline = pyml.feature.OneHotEncoder(
    inputCols=["Airline_indexado","OriginAirportID_indexado","DestAirportID_indexado"],
    outputCols=["Airline_OneHotEncoding","OriginAirportID_OneHotEncoding","DestAirportID_OneHotEncoding"])

In [16]:
df_encoder = encoder_airline.fit(df_indexado).transform(df_indexado)

In [17]:
df_encoder.show()

+--------------------+----------+-------+--------+----------+-------+--------+-----------------+--------------+--------+----+-----+----------+---------+-----------+-------------------------------+------+---------------+------------------+--------------------+---------------+----+-------------+----------------+-------------------+-------------+---------+--------+----------------+------------------------+----------------------+----------------------+------------------------------+----------------------------+
|             Airline|CRSDepTime|DepTime|DepDelay|CRSArrTime|ArrTime|ArrDelay|ActualElapsedTime|CRSElapsedTime|Distance|Year|Month|DayofMonth|DayOfWeek|Tail_Number|Flight_Number_Operating_Airline|Origin|OriginAirportID|OriginAirportSeqID|      OriginCityName|OriginStateName|Dest|DestAirportID|DestAirportSeqID|       DestCityName|DestStateName|Cancelled|Diverted|Airline_indexado|OriginAirportID_indexado|DestAirportID_indexado|Airline_OneHotEncoding|OriginAirportID_OneHotEncoding|Dest

In [34]:
assembler = pyml.feature.VectorAssembler(
    inputCols=[
        "Distance",
        "Airline_indexado", #"Airline_OneHotEncoding",
        "OriginAirportID_indexado", #"OriginAirportID_OneHotEncoding",
        "DestAirportID_indexado" #,"DestAirportID_OneHotEncoding"
    ],
    outputCol="features"
)

In [35]:
df_with_features = assembler.transform(df_encoder)

In [36]:
df_with_features.select([
    "Distance",
    "Airline_indexado", #"Airline_OneHotEncoding",
    "OriginAirportID_indexado", #"OriginAirportID_OneHotEncoding",
    "DestAirportID_indexado", #,"DestAirportID_OneHotEncoding",
    "features"]).show(10, truncate=False)

                                                                                

+--------+----------------+------------------------+----------------------+----------------------+
|Distance|Airline_indexado|OriginAirportID_indexado|DestAirportID_indexado|features              |
+--------+----------------+------------------------+----------------------+----------------------+
|212.0   |21.0            |161.0                   |2.0                   |[212.0,21.0,161.0,2.0]|
|295.0   |21.0            |165.0                   |7.0                   |[295.0,21.0,165.0,7.0]|
|251.0   |21.0            |171.0                   |2.0                   |[251.0,21.0,171.0,2.0]|
|376.0   |21.0            |7.0                     |167.0                 |[376.0,21.0,7.0,167.0]|
|251.0   |21.0            |171.0                   |2.0                   |[251.0,21.0,171.0,2.0]|
|541.0   |21.0            |2.0                     |77.0                  |[541.0,21.0,2.0,77.0] |
|127.0   |21.0            |7.0                     |223.0                 |[127.0,21.0,7.0,223.0]|
|771.0   |

In [20]:
lr = pyml.classification.LogisticRegression(
    featuresCol="features",
    labelCol="Cancelled"
)

In [38]:
train, test = df_with_features.randomSplit([0.8,0.2])

In [41]:
model = lr.fit(train)

                                                                                

In [44]:
#evaluar modelo
prediction = model.transform(test)


In [45]:
prediction.show(10, truncate = False)

[Stage 152:>                                                        (0 + 1) / 1]

+---------------------------+----------+-------+--------+----------+-------+--------+-----------------+--------------+--------+----+-----+----------+---------+-----------+-------------------------------+------+---------------+------------------+------------------------------+---------------+----+-------------+----------------+--------------+-------------+---------+--------+----------------+------------------------+----------------------+----------------------+------------------------------+----------------------------+-----------------------+----------------------------------------+----------------------------------------+----------+
|Airline                    |CRSDepTime|DepTime|DepDelay|CRSArrTime|ArrTime|ArrDelay|ActualElapsedTime|CRSElapsedTime|Distance|Year|Month|DayofMonth|DayOfWeek|Tail_Number|Flight_Number_Operating_Airline|Origin|OriginAirportID|OriginAirportSeqID|OriginCityName                |OriginStateName|Dest|DestAirportID|DestAirportSeqID|DestCityName  |DestStateName|C

                                                                                

In [46]:
evaluator = pyml.evaluation.BinaryClassificationEvaluator(
    labelCol="Cancelled",
    rawPredictionCol = "prediction",
    metricName="areaUnderROC"
)

In [47]:
AUC = evaluator.evaluate(prediction)

                                                                                

In [49]:
AUC

0.5

In [13]:
spark.stop()