In [1]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName("CSV Reader")
    .getOrCreate()
)

# Load the CSV: the first line supplies the column names and column types
# are inferred from the data.
csv_file_path = "Airline_Delay_2016-2018.csv"  # Replace with your CSV file path
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_file_path)
)

# Preview the first rows of the DataFrame.
df.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/27 23:10:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/27 23:10:55 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+--------+----------------+-------------------+--------+--------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|
+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+--------+----------------+-------------------+--------+--------+
|2016-01-01|        DL|             1248|   DTW| LAX|        1935|  1935.0|      0.0|    23.0|    1958.0|   2107.0|   13.0|        2144|  2120.0|    -24.0|      0.0|     0.0|           309.0|              285.0|   249.0|  1979.0|
|2016-01-01|        DL|             1251|   ATL| GRR|        2125|  2130.0|     

In [None]:
from pyspark.sql.functions import col, sum as spark_sum, round as spark_round

# =====================
# PART 1: Null values
# =====================
# Builds `nulls_df`, a small summary DataFrame with one row per column of `df`:
# column name, Spark type, number of nulls and percentage of nulls.

# 1. Total number of records.
total_rows = df.count()

# 2. Column name -> Spark type string, as reported by df.dtypes.
data_types = dict(df.dtypes)

# 3. Count nulls per column in a single aggregation: isNull() is cast to 1/0
#    and summed, so each output column holds that column's null count.
missing_exprs = [
    spark_sum(col(c).isNull().cast("int")).alias(c)
    for c in df.columns
]

missing_df = df.select(missing_exprs)
missing_counts = missing_df.collect()[0].asDict()

# 4. Build the rows of the null-summary DataFrame.
nulls_data = []
for col_name in df.columns:
    # sum() over zero rows yields NULL (None here); treat that as zero nulls
    # so an empty DataFrame does not crash the arithmetic below.
    n_missing = missing_counts[col_name] or 0
    # Avoid dividing by zero when the DataFrame has no rows.
    pct_missing = (n_missing / total_rows) * 100 if total_rows else 0.0
    dtype = data_types[col_name]
    # `round` is the Python built-in; Spark's round is imported as `spark_round`.
    nulls_data.append((col_name, dtype, n_missing, round(pct_missing, 2)))

nulls_schema = ["columna", "tipo_de_dato", "valores_nulos", "porcentaje_nulos"]
nulls_df = spark.createDataFrame(nulls_data, schema=nulls_schema)

# Show the full null summary. The row count is already known locally, so
# use len(nulls_data) rather than launching a Spark job with nulls_df.count().
print("==== Análisis de valores nulos ====")
nulls_df.show(n=len(nulls_data), truncate=False)

# ==============================
# PART 2: Statistical analysis
# ==============================
# Builds `stats_df`, one row per numeric column of `df`, holding the
# count / mean / stddev / min / max reported by DataFrame.describe().


def _to_float(value):
    """Convert a describe() cell to float; a missing (None) statistic becomes NaN.

    NaN is used instead of None so every statistic column always contains
    floats and createDataFrame can infer its type.
    """
    return float("nan") if value is None else float(value)


# 1. Select numeric columns. Decimal columns report their dtype as
#    "decimal(p,s)", so they are matched by prefix.
numeric_columns = [
    name
    for name, dtype in df.dtypes
    if dtype in ('tinyint', 'smallint', 'int', 'bigint', 'float', 'double')
    or dtype.startswith('decimal')
]

if numeric_columns:
    # 2. Apply describe() to the numeric columns only.
    describe_df = df.select([col(c) for c in numeric_columns]).describe()

    # 3. Pivot describe_df into a vertical layout (one row per column).
    #    The summary is collected once and reused for every column instead
    #    of querying describe_df again on each loop iteration.
    describe_rows = describe_df.collect()
    metric_names = [row["summary"] for row in describe_rows]

    stats_data = []
    for col_name in numeric_columns:
        values = [row[col_name] for row in describe_rows]
        metric_dict = dict(zip(metric_names, values))
        stats_row = (
            col_name,
            _to_float(metric_dict.get("count", 0)),
            _to_float(metric_dict.get("mean", 0)),
            _to_float(metric_dict.get("stddev", 0)),
            _to_float(metric_dict.get("min", 0)),
            _to_float(metric_dict.get("max", 0))
        )
        stats_data.append(stats_row)

    stats_schema = ["columna", "count", "mean", "stddev", "min", "max"]
    stats_df = spark.createDataFrame(stats_data, schema=stats_schema)

    # Show the statistics table (stats_df, not the null summary from Part 1).
    print("\n==== Análisis estadístico de columnas numéricas ====")
    stats_df.show(n=len(stats_data), truncate=False)
else:
    print("\nNo hay columnas numéricas para análisis estadístico.")



                                                                                

==== Análisis de valores nulos ====


                                                                                

+-------------------+------------+-------------+----------------+
|columna            |tipo_de_dato|valores_nulos|porcentaje_nulos|
+-------------------+------------+-------------+----------------+
|FL_DATE            |date        |0            |0.0             |
|OP_CARRIER         |string      |0            |0.0             |
|OP_CARRIER_FL_NUM  |int         |0            |0.0             |
|ORIGIN             |string      |0            |0.0             |
|DEST               |string      |0            |0.0             |
|CRS_DEP_TIME       |int         |0            |0.0             |
|DEP_TIME           |double      |256081       |1.38            |
|DEP_DELAY          |double      |261033       |1.41            |
|TAXI_OUT           |double      |263393       |1.42            |
|WHEELS_OFF         |double      |263388       |1.42            |
|WHEELS_ON          |double      |271764       |1.47            |
|TAXI_IN            |double      |271764       |1.47            |
|CRS_ARR_T

                                                                                


==== Análisis estadístico de columnas numéricas ====


                                                                                

+-------------------+-----------+---------------------+--------------------+------+------+
|columna            |count      |mean                 |stddev              |min   |max   |
+-------------------+-----------+---------------------+--------------------+------+------+
|OP_CARRIER_FL_NUM  |1.8505725E7|2304.635922451025    |1792.0299765876928  |1.0   |8402.0|
|CRS_DEP_TIME       |1.8505725E7|1330.1731464722404   |490.5252390971099   |1.0   |2359.0|
|DEP_TIME           |1.8249644E7|1333.743058385139    |503.8671257420324   |1.0   |2400.0|
|DEP_DELAY          |1.8244692E7|9.580880017048246    |43.04881783465609   |-234.0|2755.0|
|TAXI_OUT           |1.8242332E7|16.84608404232529    |9.441980050787464   |0.0   |196.0 |
|WHEELS_OFF         |1.8242337E7|1356.3663577753223   |505.54272649540945  |1.0   |2400.0|
|WHEELS_ON          |1.8233961E7|1464.4559408677028   |532.6591688699245   |1.0   |2400.0|
|TAXI_IN            |1.8233961E7|7.527830952364108    |5.90965625624773    |0.0   |414.0 |