In [107]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the Spark session for this notebook (an already-running one is reused)
spark = (
    SparkSession.builder
    .appName("App")
    .getOrCreate()
)

# Set the Spark log level to WARN
spark.sparkContext.setLogLevel("WARN")

In [108]:
# Load the comma-separated file, taking the column names from its first row
df = spark.read.csv("1987.csv", sep=",", header=True)

# Preview the column names and the first 15 rows without truncating cell values
df.show(n=15, truncate=False)

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|1987|10   |14        |3        |741    |730       |912    |849       |PS           |1451     

In [109]:
from pyspark.sql.functions import col, when

# Turn the literal string "NA" into a real null in every column.
# This is done in ONE select(): calling withColumn() once per column inside a loop
# adds a separate projection per call and can produce a very large query plan.
df = df.select(
    [
        when(col(name) == "NA", None).otherwise(col(name)).alias(name)
        for name in df.columns
    ]
)

df.show()

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|1987|   10|        14|        3|    741|       730|    912|       849|           PS|     1451

In [68]:
from pyspark.sql.functions import col

# Number of null values in each column (one aggregate per column, shown as a single row)
df.select(
    *[
        count(when(col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]
).show()

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+-------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance| TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+-------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|   0|    0|         0|        0|  19685|         0|  23500|         0|            0|      

In [52]:
# Print the list of column names of the DataFrame
print(df.columns)

['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']


In [53]:
# Print the DataFrame schema (column names, data types and nullability)
df.printSchema()

root
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- DayofMonth: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: string (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: string (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: string (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: string (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: string (nullable = true)
 |-- CarrierDelay:

# Preprocessing
- eliminate unnecessary variables
- missing and duplicate values
- see correlation
- variable transformation
- variable creation

In [110]:
# Names of the columns to remove from the DataFrame
columns = [
    "ArrTime", "ActualElapsedTime", "AirTime", "TaxiIn", "Diverted",
    "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay",
]

# drop() receives the names as separate positional arguments, hence the unpacking
df = df.drop(*columns)


In [70]:
# Print the DataFrame schema (column names, data types and nullability)
df.printSchema()

root
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- DayofMonth: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: string (nullable = true)
 |-- CRSArrTime: string (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: string (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: string (nullable = true)
 |-- CancellationCode: string (nullable = true)



In [111]:
# Names of the additional columns to remove from the DataFrame
columns = ["Year", "TailNum", "TaxiOut", "Cancelled", "CancellationCode"]

# drop() receives the names as separate positional arguments, hence the unpacking
df = df.drop(*columns)

In [98]:
# Print the DataFrame schema (column names, data types and nullability)
df.printSchema()

root
 |-- Month: string (nullable = true)
 |-- DayofMonth: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: string (nullable = true)
 |-- CRSArrTime: string (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)



## Missing values

In [112]:
# Number of null values in each column (one aggregate per column, shown as a single row)
df.select(
    *[
        count(when(col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]
).show()

+-----+----------+---------+-------+----------+----------+-------------+---------+--------------+--------+--------+------+----+--------+
|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|CRSArrTime|UniqueCarrier|FlightNum|CRSElapsedTime|ArrDelay|DepDelay|Origin|Dest|Distance|
+-----+----------+---------+-------+----------+----------+-------------+---------+--------------+--------+--------+------+----+--------+
|    0|         0|        0|  19685|         0|         0|            0|        0|             0|   23500|   19685|     0|   0|    1015|
+-----+----------+---------+-------+----------+----------+-------------+---------+--------------+--------+--------+------+----+--------+



In [74]:
# Show the distinct values found in the ArrDelay column (show() prints the first 20 rows)
df.select("ArrDelay").distinct().show()

+--------+
|ArrDelay|
+--------+
|      -4|
|     125|
|     -30|
|   -1219|
|       7|
|      51|
|     124|
|     169|
|     205|
|      -1|
|      15|
|      54|
|     155|
|     154|
|     132|
|     317|
|     200|
|     428|
|      11|
|      -6|
+--------+
only showing top 20 rows



In [113]:
# Row count of the DataFrame, used as the denominator below
total_rows = df.count()

# Per-column share of null values: null count divided by total_rows.
# The result is a fraction between 0 and 1; it is not multiplied by 100.
null_percentage = df.select(
    *[
        (count(when(col(name).isNull(), name)) / total_rows).alias(name)
        for name in df.columns
    ]
)

# Display the single-row result
null_percentage.show()

+-----+----------+---------+--------------------+----------+----------+-------------+---------+--------------+--------------------+--------------------+------+----+--------------------+
|Month|DayofMonth|DayOfWeek|             DepTime|CRSDepTime|CRSArrTime|UniqueCarrier|FlightNum|CRSElapsedTime|            ArrDelay|            DepDelay|Origin|Dest|            Distance|
+-----+----------+---------+--------------------+----------+----------+-------------+---------+--------------+--------------------+--------------------+------+----+--------------------+
|  0.0|       0.0|      0.0|0.015005801074227831|       0.0|       0.0|          0.0|      0.0|           0.0|0.017913961150335486|0.015005801074227831|   0.0| 0.0|7.737306624506603E-4|
+-----+----------+---------+--------------------+----------+----------+-------------+---------+--------------+--------------------+--------------------+------+----+--------------------+



In [114]:
# Remove every row that has a null in at least one column
df = df.na.drop(how="any")

# Row count after the removal, to check that rows with missing values are gone
df.count()

1287333

In [115]:
# Row count of the DataFrame, used as the denominator below
total_rows = df.count()

# Per-column share of null values: null count divided by total_rows.
# The result is a fraction between 0 and 1; it is not multiplied by 100.
null_percentage = df.select(
    *[
        (count(when(col(name).isNull(), name)) / total_rows).alias(name)
        for name in df.columns
    ]
)

# Display the single-row result
null_percentage.show()

+-----+----------+---------+-------+----------+----------+-------------+---------+--------------+--------+--------+------+----+--------+
|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|CRSArrTime|UniqueCarrier|FlightNum|CRSElapsedTime|ArrDelay|DepDelay|Origin|Dest|Distance|
+-----+----------+---------+-------+----------+----------+-------------+---------+--------------+--------+--------+------+----+--------+
|  0.0|       0.0|      0.0|    0.0|       0.0|       0.0|          0.0|      0.0|           0.0|     0.0|     0.0|   0.0| 0.0|     0.0|
+-----+----------+---------+-------+----------+----------+-------------+---------+--------------+--------+--------+------+----+--------+



## Duplicates

In [116]:
# Remove fully duplicated rows and report whether any were present.
# The row count is taken once before and once after dropDuplicates(); each count()
# launches a Spark job, so the results are stored instead of being recomputed.
rows_before = df.count()
print(rows_before)

df = df.dropDuplicates()

rows_after = df.count()
print(rows_after)

# Duplicates existed only if deduplication actually removed rows.
# (Testing `df.count() > 0` would merely check that the DataFrame is non-empty.)
if rows_after < rows_before:
    print("There are duplicates in the DataFrame.")
    df.show()
else:
    print("No duplicates found in the DataFrame.")

1287333
1287264
There are duplicates in the DataFrame.
+-----+----------+---------+-------+----------+----------+-------------+---------+--------------+--------+--------+------+----+--------+
|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|CRSArrTime|UniqueCarrier|FlightNum|CRSElapsedTime|ArrDelay|DepDelay|Origin|Dest|Distance|
+-----+----------+---------+-------+----------+----------+-------------+---------+--------------+--------+--------+------+----+--------+
|   10|        10|        6|   1635|      1635|      1732|           PS|     1454|            57|       8|       0|   OAK| BUR|     325|
|   10|        20|        2|   1945|      1945|      2101|           PS|     1484|            76|       1|       0|   SFO| SAN|     447|
|   10|         9|        5|   2057|      2016|      2132|           PS|     1486|            76|      47|      41|   OAK| SAN|     446|
|   10|        13|        2|   2114|      2115|      2223|           PS|     1505|            68|       3|      -1|   SNA| 

## Correlation

## Variable transformation

## Variable creation

# Modeling

# Validation