In [3]:
# PySpark libraries used for preprocessing, modelling and evaluation
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, isnan, isnull, mean, round
from pyspark.sql.functions import avg, lit, year
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import IndexToString
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.regression import GBTRegressor, LinearRegression, RandomForestRegressor
from pyspark.ml.stat import Correlation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.tuning import CrossValidatorModel

# Project-local helper functions
# from filter_datasets import *
# from process_2008_data import *
from models import *
from models import estimate_kmeans, estimate_lr


In [4]:
# Create the Spark session for this notebook (getOrCreate reuses an existing one).
spark = (
    SparkSession.builder
    .appName("proyecto")
    .getOrCreate()
)


In [None]:
# Lookup files to filter: each entry names the source CSV, the destination CSV
# and the column list handed to filter_columns.
# NOTE(review): filter_columns has no explicit import in this notebook (the
# `from filter_datasets import *` line is commented out) — confirm it is in scope.
file_configs = [
    {"input": source_csv, "output": filtered_csv, "columns": key_columns}
    for source_csv, filtered_csv, key_columns in (
        ("airports.csv", "filtered_airports.csv", ["iata"]),
        ("carriers.csv", "filtered_carriers.csv", ["Code"]),
        ("plane-data.csv", "filtered_plane_data.csv", ["tailnum"]),
    )
]

# Run the filter once per configured file.
for config in file_configs:
    filter_columns(
        config["input"],
        config["output"],
        config["columns"],
    )

In [None]:
# Paths handed to process_2008_data: the raw 2008 flights file, the plane
# lookup file and the destination of the processed output.
# Columns of the raw 2008 file: Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
input_2008_file, input_plane_file, output_file = (
    "2008.csv",
    "plane-data.csv",
    "processed_2008.csv",
)

# NOTE(review): process_2008_data has no explicit import in this notebook (the
# `from process_2008_data import *` line is commented out) — confirm it is in scope.
process_2008_data(input_2008_file, input_plane_file, output_file)

In [None]:
# EDA
# Load the processed dataset, letting Spark infer the column types.
df = spark.read.csv("processed_2008.csv", header=True, inferSchema=True)

# Show the inferred schema of the columns
df.printSchema()

# Show the first records
df.show(5)

# 1. Summary statistics of the numeric columns
numerical_cols = [
    "Month",
    "DayofMonth",
    "DayOfWeek",
    "DepTime",
    "CRSDepTime",
    "CRSArrTime",
    "CRSElapsedTime",
    "ArrDelay",
    "DepDelay",
]
df.select(numerical_cols).describe().show()

# 2. Null / missing values per column
# isnan() only accepts floating-point input: Spark rejects it on date, timestamp
# or boolean columns, so the NaN test is applied to float/double columns only
# and every other column is checked for nulls alone.
float_cols = {name for name, dtype in df.dtypes if dtype in ("float", "double")}
missing_data = df.select(
    [
        count(
            when(
                (isnan(c) | df[c].isNull()) if c in float_cols else df[c].isNull(),
                c,
            )
        ).alias(c)
        for c in df.columns
    ]
)
print("Cantidad de valores nulos por columna:")
missing_data.show()

# 3. Most frequent values of the categorical columns
categorical_cols = ["UniqueCarrier", "TailNum", "Origin", "Dest"]
for col_name in categorical_cols:
    print(f"Distribución de valores únicos para la columna {col_name}:")
    df.groupBy(col_name).count().orderBy("count", ascending=False).show(5)

# 4. Descriptive statistics of the target variable (ArrDelay)
print("Estadísticas descriptivas de la variable objetivo (ArrDelay):")
df.select("ArrDelay").describe().show()

# 5. Basic correlations (optional, numeric columns only)
# VectorAssembler raises on null inputs by default and the nulls have not been
# filtered yet at this point, so rows with a missing value in any numeric
# column are dropped before assembling the vectors.
assembler = VectorAssembler(inputCols=numerical_cols, outputCol="features")
vector_df = assembler.transform(df.select(numerical_cols).na.drop()).select("features")
correlation_matrix = Correlation.corr(vector_df, "features").head()[0]
print("Matriz de correlación:")
print(correlation_matrix)

In [None]:
# Report the number of null values in every column of df.
# All counts come from a single aggregation instead of one Spark job per column.
null_counts = df.select(
    [count(when(df[name].isNull(), name)).alias(name) for name in df.columns]
).first()
# The loop variable is deliberately not called `col`, so it cannot shadow
# pyspark's col() function.
for col_name in df.columns:
    print(col_name, null_counts[col_name])


In [14]:
# Keep only the rows whose ArrDelay (the target) is not null.
df = df.where(df.ArrDelay.isNotNull())

In [None]:
# Report the number of null values in every column of df again, after filtering.
# All counts come from a single aggregation instead of one Spark job per column.
null_counts = df.select(
    [count(when(df[name].isNull(), name)).alias(name) for name in df.columns]
).first()
# The loop variable is deliberately not called `col`, so it cannot shadow
# pyspark's col() function.
for col_name in df.columns:
    print(col_name, null_counts[col_name])

In [None]:
# Reference year used to turn the plane issue date into an age.
current_year = 2008

# Start from the filtered DataFrame `df` and derive PlaneAge from the year part
# of IssueDate. year() yields null for a null date, so PlaneAge is null for
# those rows and gets filled below.
data = df.withColumn("PlaneAge", current_year - year(col("IssueDate")))

# Fill the missing PlaneAge values with the column mean
avg_age = data.select(avg("PlaneAge")).first()[0]
data = data.withColumn(
    "PlaneAge", when(col("PlaneAge").isNull(), avg_age).otherwise(col("PlaneAge"))
)

# Convert the categorical columns to numeric indices (<name>_Index)
categorical_columns = ["UniqueCarrier", "Origin", "Dest"]
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_Index")
    for name in categorical_columns
]
for indexer in indexers:
    data = indexer.fit(data).transform(data)

# Drop the original categorical columns now that their indices exist
data = data.drop(*categorical_columns)

# Assemble the model inputs into a single vector column
feature_columns = [
    "Month",
    "DayofMonth",
    "DayOfWeek",
    "DepTime",
    "CRSDepTime",
    "CRSArrTime",
    "CRSElapsedTime",
    "DepDelay",
    "Cancelled",
    "PlaneAge",
    "UniqueCarrier_Index",
    "Origin_Index",
    "Dest_Index",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features_assembled")
data = assembler.transform(data)

# Rescale the assembled vector with MinMaxScaler into the "features" column.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed in a later cell — confirm this is acceptable.
scaler = MinMaxScaler(inputCol="features_assembled", outputCol="features")
scaler_model = scaler.fit(data)
data = scaler_model.transform(data)

# Keep only the final columns: the scaled feature vector and the target
df = data.select("features", "ArrDelay")

# Show some rows of the preprocessed dataset
df.show(truncate=False)

In [None]:
# Names of the input columns: every column of df except the target.
features = [name for name in df.columns if name != "ArrDelay"]

# Arrival delay, in minutes, above which a flight is labelled as delayed.
DELAY_THRESHOLD_MINUTES = 15

# Cast the target to integer, then turn it into a binary label:
# 1 if the flight arrived more than DELAY_THRESHOLD_MINUTES late, 0 otherwise.
# when(..., 1).otherwise(0) already produces an integer column, so no further
# cast is needed afterwards.
# NOTE: ArrDelay is overwritten in place; re-running this cell on its own would
# map every label to 0, because 0 and 1 are both below the threshold.
df = df.withColumn("ArrDelay", df["ArrDelay"].cast(IntegerType()))
df = df.withColumn(
    "ArrDelay", when(col("ArrDelay") > DELAY_THRESHOLD_MINUTES, 1).otherwise(0)
)


In [None]:
# Preview the first five rows of the preprocessed DataFrame.
print("Primeras filas del dataframe:")
df.show(n=5)

In [23]:
# # eliminamos las columnas que hemos indexado
# df = df.drop(*["UniqueCarrier", "TailNum", "Origin", "Dest"])

In [None]:
# Split df 80/20 into training and test sets; the fixed seed is passed so the
# split can be reproduced.
train_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=42)



In [None]:
# Model 1: linear regression
# NOTE(review): ArrDelay was converted to a 0/1 label in an earlier cell, so
# this regressor is fitted on a binary target — confirm that is intended.
lr = LinearRegression(labelCol="ArrDelay", featuresCol="features")

# Hyper-parameter grid: regularisation strength x elastic-net mixing.
# addGrid registers the values on the builder itself, so the calls can be
# issued one statement at a time before build().
param_grid_lr = ParamGridBuilder()
param_grid_lr.addGrid(lr.regParam, [0.01, 0.1, 0.5])
param_grid_lr.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
param_grid_lr = param_grid_lr.build()

# The helper's result is unpacked as (best model, RMSE).
best_lr_model, rmse_lr = train_and_evaluate(
    lr, param_grid_lr, train_data, test_data
)



In [None]:
# Model 2: random forest
rf = RandomForestRegressor(labelCol="ArrDelay", featuresCol="features")

# Hyper-parameter grid: number of trees x maximum tree depth.
# addGrid registers the values on the builder itself, so the calls can be
# issued one statement at a time before build().
param_grid_rf = ParamGridBuilder()
param_grid_rf.addGrid(rf.numTrees, [50, 100])
param_grid_rf.addGrid(rf.maxDepth, [5, 10])
param_grid_rf = param_grid_rf.build()

# The helper's result is unpacked as (best model, RMSE).
best_rf_model, rmse_rf = train_and_evaluate(
    rf, param_grid_rf, train_data, test_data
)



In [None]:
# Model 3: gradient-boosted trees
gbt = GBTRegressor(labelCol="ArrDelay", featuresCol="features")

# Hyper-parameter grid: boosting iterations x maximum tree depth.
# addGrid registers the values on the builder itself, so the calls can be
# issued one statement at a time before build().
param_grid_gbt = ParamGridBuilder()
param_grid_gbt.addGrid(gbt.maxIter, [10, 50])
param_grid_gbt.addGrid(gbt.maxDepth, [5, 10])
param_grid_gbt = param_grid_gbt.build()

# The helper's result is unpacked as (best model, RMSE).
best_gbt_model, rmse_gbt = train_and_evaluate(gbt, param_grid_gbt, train_data, test_data)



In [None]:
# Compare the three models by the RMSE each one obtained.
results = list(
    zip(
        ("Linear Regression", "Random Forest", "Gradient Boosted Trees"),
        (rmse_lr, rmse_rf, rmse_gbt),
    )
)

# Rank a copy in ascending RMSE order (lowest first); `results` itself keeps
# its original order.
results_sorted = list(results)
results_sorted.sort(key=lambda entry: entry[1])

print("Model Comparison (RMSE):")
for model_name, rmse in results_sorted:
    print(f"{model_name}: {rmse:.3f}")

# The head of the ranking is the model with the lowest RMSE.
best_model_name, best_rmse = results_sorted[0]
print(f"\nBest Model: {best_model_name} with RMSE = {best_rmse:.3f}")

In [None]:
# Linear regression through the estimate_lr helper from models.py.
# Name of the column passed to estimate_lr as the target.
target = "ArrDelay"

# Every column of df except the target and IssueDate is offered as an input.
input_columns_lr = [
    name for name in df.columns if name not in (target, "IssueDate")
]
print(input_columns_lr)
model = estimate_lr(df, input_columns_lr, target)

In [None]:
# K-means through the estimate_kmeans helper.
# k is passed as the third argument (presumably the number of clusters — see models.py).
k = 3

# Every column of df except ArrDelay and IssueDate is offered as an input.
input_columns_kmeans = [
    name for name in df.columns if name not in ("ArrDelay", "IssueDate")
]
kmeans_model = estimate_kmeans(df, input_columns_kmeans, k)