In [1]:
import findspark
findspark.init()
findspark.find()

from pyspark import SparkContext

from pyspark.sql import SparkSession

from pyspark import SparkConf

import pyspark.sql as pysql

from pyspark import StorageLevel

import pyspark.ml as pyml

In [2]:
import os
import sys

In [3]:
conf = SparkConf().setAppName("Tarea") \
    .setMaster("local[8]") \
    .set("spark.executor.cores","2") \
    .set("spark.executor.memory", "4g") \
    .set("spark.driver.memory","4g")

In [4]:
spark : SparkSession = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/31 02:59:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
columnas = [
    'Airline', # Aerolinea
    'CRSDepTime', #Hora de salida programada
    'DepTime', #Hora de salida actual
    'DepDelay', #Diferencia en minutos entre CRSDepTime y DepTime, tiempos negativos significa salidas tempranas
    'CRSArrTime',
    'ArrTime', #Hora de llegada
    'ArrDelay', #Diferencia de minutos entre la llegada programada y la hora de llegada, tiempos negativos significa llegadas tempranas
    'ActualElapsedTime', #Tiempo de vuelo verdadero
    'CRSElapsedTime', #Tiempo de vuelo programado
    'Distance', #Distancia entre aeropuertos en millas
    'Year', #Año del vuelo
    'Month', #Mes del vuelo
    'DayofMonth', #Dia del mes del vuelo
    'DayOfWeek', #Dia de la semana del vuelo
    'Tail_Number', #Numero de la cola, código para identificar la aeronave
    'Flight_Number_Operating_Airline', #Numero del vuelo
    
    'Origin', #Aeropuerto de origen
    'OriginAirportID', 'OriginAirportSeqID', #codigos del aeropuerto de origen
    'OriginCityName', #Nombre de la ciudad del aeropuerto de origen
    'OriginStateName', #Nombre de estado del aeropuerto de origen
    
    'Dest', #Aeropuerto de destino
    'DestAirportID', 'DestAirportSeqID', #codigos del aeropuerto de destino
    'DestCityName', #Nombre de la ciudad del aeropuerto de destino
    'DestStateName', #Nombre de estado del aeropuerto de destino
    
    'Cancelled', #El vuelo fue cancelado, 1 = Sí
    
    'Diverted', #Si el vuelo fue desviado, 1 = Sí
]

In [6]:
import zipfile
import pathlib

direccion = pathlib.Path.cwd().parent.parent

path_datos_zip = pathlib.Path.joinpath(direccion, 'Datos', 'data.zip')

path_guardar = pathlib.Path.joinpath(direccion, 'Datos')

archivos_datos = [
    pathlib.Path.joinpath(direccion, 'Datos', 'Combined_Flights_2022.csv').as_posix(),
    pathlib.Path.joinpath(direccion, 'Datos', 'Combined_Flights_2021.csv').as_posix(),
    pathlib.Path.joinpath(direccion, 'Datos', 'Combined_Flights_2020.csv').as_posix(),
    pathlib.Path.joinpath(direccion, 'Datos', 'Combined_Flights_2019.csv').as_posix(),
    pathlib.Path.joinpath(direccion, 'Datos', 'Combined_Flights_2018.csv').as_posix()
]

archivos_datos_en_zip  = [
    'Combined_Flights_2022.csv',
    'Combined_Flights_2021.csv',
    'Combined_Flights_2020.csv',
    'Combined_Flights_2019.csv',
    'Combined_Flights_2018.csv'
]

In [7]:
with zipfile.ZipFile(path_datos_zip, mode="r") as archivo_zip:
    for archivo in archivos_datos_en_zip:
        
        if os.path.isfile(path=pathlib.Path.joinpath(path_guardar,archivo).as_posix()):
            continue
        
        archivo_zip.extract(archivo, path_guardar)

In [8]:
df = spark.read.csv(archivos_datos, header=True)
df = df.select(columnas)

In [9]:
df = df.persist(StorageLevel.DISK_ONLY)

24/10/31 02:59:12 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [10]:
#schema del dataframe
df = df.withColumn("CRSDepTime", df["CRSDepTime"].cast(pysql.types.FloatType()))
df = df.withColumn("DepTime", df["DepTime"].cast(pysql.types.FloatType()))
df = df.withColumn("DepDelay", df["DepDelay"].cast(pysql.types.FloatType()))
df = df.withColumn("ArrTime", df["ArrTime"].cast(pysql.types.FloatType()))
df = df.withColumn("ArrDelay", df["ArrDelay"].cast(pysql.types.FloatType()))
df = df.withColumn("ActualElapsedTime", df["ActualElapsedTime"].cast(pysql.types.FloatType()))
df = df.withColumn("CRSElapsedTime", df["CRSElapsedTime"].cast(pysql.types.FloatType()))
df = df.withColumn("Distance", df["Distance"].cast(pysql.types.FloatType()))
df = df.withColumn("Year", df["Year"].cast(pysql.types.IntegerType()))
df = df.withColumn("Month", df["Month"].cast(pysql.types.IntegerType()))
df = df.withColumn("DayofMonth", df["DayofMonth"].cast(pysql.types.IntegerType()))
df = df.withColumn("DayOfWeek", df["DayOfWeek"].cast(pysql.types.IntegerType()))


In [11]:
df = df.withColumn("Cancelled", pysql.functions.when(df["Cancelled"] == "False",0).otherwise(1))
df = df.withColumn("Diverted", pysql.functions.when(df["Diverted"] == "False",0).otherwise(1))

In [12]:
df.printSchema()

root
 |-- Airline: string (nullable = true)
 |-- CRSDepTime: float (nullable = true)
 |-- DepTime: float (nullable = true)
 |-- DepDelay: float (nullable = true)
 |-- CRSArrTime: string (nullable = true)
 |-- ArrTime: float (nullable = true)
 |-- ArrDelay: float (nullable = true)
 |-- ActualElapsedTime: float (nullable = true)
 |-- CRSElapsedTime: float (nullable = true)
 |-- Distance: float (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Tail_Number: string (nullable = true)
 |-- Flight_Number_Operating_Airline: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- OriginAirportID: string (nullable = true)
 |-- OriginAirportSeqID: string (nullable = true)
 |-- OriginCityName: string (nullable = true)
 |-- OriginStateName: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- DestAirportID: string (nullable = true)
 |-- DestAir

In [13]:
df.select("Airline").distinct().count() #cantidad de aerolineas

24/10/31 02:59:22 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

28

seleccionar el top 31 de aeropuertos para random forest

In [14]:
df.select("OriginAirportID").distinct().count() #cantidad de aeropuerto de origen

                                                                                

388

In [15]:
df_OriginAirpotId_count = df.groupby("OriginAirportID"). \
    count().sort("count", ascending = False)

In [16]:
df_OriginAirpotId_count.limit(31).select(pysql.functions.sum("count")).show()



+----------+
|sum(count)|
+----------+
|  18717110|
+----------+



                                                                                

In [17]:
df.count()

                                                                                

29193782

In [18]:
18717110 / 29193782

0.6411334441012131

In [19]:
df_OriginAirpotId_count_top31 = df_OriginAirpotId_count.select("OriginAirportID").limit(31)

In [20]:
df_OriginAirpotId_count_top31.distinct().count()

                                                                                

31

In [21]:
df_OriginAirpotId_count_top31 = df_OriginAirpotId_count_top31.withColumn(
    "OriginAirportID_esTop", pysql.functions.lit(1)
)

In [22]:
df_OriginAirpotId_count_top31.show()



+---------------+---------------------+
|OriginAirportID|OriginAirportID_esTop|
+---------------+---------------------+
|          13930|                    1|
|          10397|                    1|
|          11292|                    1|
|          11298|                    1|
|          11057|                    1|
|          12892|                    1|
|          14747|                    1|
|          12266|                    1|
|          14107|                    1|
|          12889|                    1|
|          14771|                    1|
|          11433|                    1|
|          11618|                    1|
|          13204|                    1|
|          13487|                    1|
|          12953|                    1|
|          10721|                    1|
|          14100|                    1|
|          11278|                    1|
|          14869|                    1|
+---------------+---------------------+
only showing top 20 rows



                                                                                

In [23]:
df = df.join(other=df_OriginAirpotId_count_top31,
        on="OriginAirportID",
        how="left") #.show()

In [24]:
df.select("DestAirportID").distinct().count() #cantidad de aeropuerto de destino

                                                                                

388

In [25]:
df_DestAirportID_count = df.groupby("DestAirportID"). \
    count().sort("count", ascending = False)
    
df_DestAirportID_count.limit(31).select(pysql.functions.sum("count")).show()



+----------+
|sum(count)|
+----------+
|  18716733|
+----------+



                                                                                

In [26]:
18716733/ 29193782

0.6411205303923966

In [27]:
df_DestAirportID_count_top31 = df_DestAirportID_count.select("DestAirportID").limit(31)

df_DestAirportID_count_top31 = df_DestAirportID_count_top31.withColumn(
    "DestAirportID_esTop", pysql.functions.lit(1)
)

In [28]:
df = df.join(other=df_DestAirportID_count_top31,
        on="DestAirportID",
        how="left") #.show()

Eliminar aeropuertos afuera del top 31

In [29]:

df = df.withColumn("OriginAirportID",
    pysql.functions.when(df["OriginAirportID_esTop"] == 1, df["OriginAirportID"]).otherwise("other"))

df = df.withColumn("DestAirportID",
    pysql.functions.when(df["DestAirportID_esTop"] == 1, df["DestAirportID"]).otherwise("other"))

In [30]:
df.select("OriginAirportID").distinct().count() #cantidad de aeropuerto de origen

                                                                                

32

In [31]:
indexer_airline = pyml.feature.StringIndexer(
    inputCols=["Airline","OriginAirportID","DestAirportID"],
    outputCols=["Airline_indexado","OriginAirportID_indexado","DestAirportID_indexado"])

In [32]:
df_indexado = indexer_airline.fit(df).transform(df)

                                                                                

In [33]:
encoder_airline = pyml.feature.OneHotEncoder(
    inputCols=["Airline_indexado","OriginAirportID_indexado","DestAirportID_indexado"],
    outputCols=["Airline_OneHotEncoding","OriginAirportID_OneHotEncoding","DestAirportID_OneHotEncoding"])

In [34]:
df_encoder = encoder_airline.fit(df_indexado).transform(df_indexado)

In [35]:
df_encoder.select("Airline_OneHotEncoding").show()

+----------------------+
|Airline_OneHotEncoding|
+----------------------+
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
|       (27,[21],[1.0])|
+----------------------+
only showing top 20 rows



In [36]:
assembler = pyml.feature.VectorAssembler(
    inputCols=[
        "Distance",
        "Airline_indexado", #"Airline_OneHotEncoding",
        "OriginAirportID_indexado", #"OriginAirportID_OneHotEncoding",
        "DestAirportID_indexado", #,"DestAirportID_OneHotEncoding"
        "Year",
        "Month",
        "DayofMonth",
        "DayOfWeek"
    ],
    outputCol="features"
)

In [37]:
df_with_features = assembler.transform(df_encoder)

In [38]:
df_with_features.select([
    "Distance",
    "Airline_indexado", #"Airline_OneHotEncoding",
    "OriginAirportID_indexado", #"OriginAirportID_OneHotEncoding",
    "DestAirportID_indexado", #,"DestAirportID_OneHotEncoding",
        "Year",
        "Month",
        "DayofMonth",
        "DayOfWeek",
    "features"]).show(10, truncate=False)

                                                                                

+--------+----------------+------------------------+----------------------+----+-----+----------+---------+---------------------------------------+
|Distance|Airline_indexado|OriginAirportID_indexado|DestAirportID_indexado|Year|Month|DayofMonth|DayOfWeek|features                               |
+--------+----------------+------------------------+----------------------+----+-----+----------+---------+---------------------------------------+
|212.0   |21.0            |0.0                     |3.0                   |2022|4    |4         |1        |[212.0,21.0,0.0,3.0,2022.0,4.0,4.0,1.0]|
|295.0   |21.0            |0.0                     |8.0                   |2022|4    |4         |1        |[295.0,21.0,0.0,8.0,2022.0,4.0,4.0,1.0]|
|251.0   |21.0            |0.0                     |3.0                   |2022|4    |4         |1        |[251.0,21.0,0.0,3.0,2022.0,4.0,4.0,1.0]|
|376.0   |21.0            |8.0                     |0.0                   |2022|4    |4         |1        |[376.

In [39]:
train, test = df_with_features.randomSplit([0.8,0.2], seed = 42)

In [40]:
train.select("Cancelled").groupby("Cancelled").count().show()

24/10/31 03:05:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/10/31 03:05:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/10/31 03:05:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/10/31 03:05:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/10/31 03:05:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/10/31 03:05:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/10/31 03:05:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/10/31 03:05:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/10/31 03:05:33 WARN RowBasedKeyValueBatch: Calling spill() on

+---------+--------+
|Cancelled|   count|
+---------+--------+
|        1|  621441|
|        0|22730423|
+---------+--------+



                                                                                

In [41]:
train = train.persist(StorageLevel.DISK_ONLY)

In [42]:
test = test.persist(StorageLevel.DISK_ONLY)

In [43]:
train_cancelled_1 = train.filter(train["Cancelled"] == 1)
train_cancelled_0 = train.filter(train["Cancelled"] == 0)

In [44]:
oversampled_train_Cancelled_1 = train_cancelled_1.sample(
    withReplacement = True,
    fraction = 10.0,
    seed = 123
)

balanced_df = train_cancelled_0.union(oversampled_train_Cancelled_1)

balanced_df = balanced_df.persist(StorageLevel.DISK_ONLY)

In [45]:
ml_class = pyml.classification.RandomForestClassifier(
    featuresCol="features",
    labelCol="Cancelled"
)

In [46]:
model = ml_class.fit(balanced_df)

24/10/31 03:14:57 WARN MemoryStore: Not enough space to cache rdd_271_39 in memory! (computed 12.5 MiB so far)
24/10/31 03:14:57 WARN MemoryStore: Not enough space to cache rdd_271_33 in memory! (computed 44.4 MiB so far)
24/10/31 03:14:58 WARN BlockManager: Persisting block rdd_271_33 to disk instead.
24/10/31 03:14:58 WARN BlockManager: Persisting block rdd_271_39 to disk instead.
24/10/31 03:14:58 WARN MemoryStore: Not enough space to cache rdd_271_37 in memory! (computed 19.2 MiB so far)
24/10/31 03:14:58 WARN BlockManager: Persisting block rdd_271_37 to disk instead.
24/10/31 03:14:58 WARN MemoryStore: Not enough space to cache rdd_271_36 in memory! (computed 44.4 MiB so far)
24/10/31 03:14:58 WARN BlockManager: Persisting block rdd_271_36 to disk instead.
24/10/31 03:14:58 WARN MemoryStore: Not enough space to cache rdd_271_38 in memory! (computed 29.6 MiB so far)
24/10/31 03:14:58 WARN BlockManager: Persisting block rdd_271_38 to disk instead.
24/10/31 03:15:01 WARN MemoryStore:

In [47]:
#evaluar modelo
prediction = model.transform(test)


In [48]:
prediction.show(10, truncate = False)



+-------------+---------------+--------------------+----------+-------+--------+----------+-------+--------+-----------------+--------------+--------+----+-----+----------+---------+-----------+-------------------------------+------+------------------+--------------+---------------+----+----------------+------------+-------------+---------+--------+---------------------+-------------------+----------------+------------------------+----------------------+----------------------+------------------------------+----------------------------+----------------------------------------+--------------------------------------+----------------------------------------+----------+
|DestAirportID|OriginAirportID|Airline             |CRSDepTime|DepTime|DepDelay|CRSArrTime|ArrTime|ArrDelay|ActualElapsedTime|CRSElapsedTime|Distance|Year|Month|DayofMonth|DayOfWeek|Tail_Number|Flight_Number_Operating_Airline|Origin|OriginAirportSeqID|OriginCityName|OriginStateName|Dest|DestAirportSeqID|DestCityName|DestStat

                                                                                

In [49]:
evaluator = pyml.evaluation.BinaryClassificationEvaluator(
    labelCol="Cancelled",
    rawPredictionCol = "prediction",
    metricName="areaUnderROC"
)

In [50]:
AUC = evaluator.evaluate(prediction)

                                                                                

In [51]:
AUC

0.5759808712475746

Posible motivo:
- Se necesita un mejor balanceo de clases, cancelados son el ~1% de los datos
- Se ocupa utilizar One Hot Encoding

In [55]:
spark.stop()