In [None]:
# Import the PySpark libraries used for preprocessing, model training, tuning and evaluation
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, isnan, isnull, mean, round
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.feature import IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# importamos funciones auxiliares 
from filter_datasets import *
from process_2008_data import *
from models import *


In [None]:
# Create the Spark session for this notebook (getOrCreate reuses an active one if present).
spark = (
    SparkSession.builder
    .appName("proyecto")
    .getOrCreate()
)


In [None]:
# One entry per lookup file: the CSV to read, the CSV to write, and the column
# list handed to the project helper filter_columns.
file_configs = [
    dict(input="airports.csv", output="filtered_airports.csv", columns=["iata"]),
    dict(input="carriers.csv", output="filtered_carriers.csv", columns=["Code"]),
    dict(
        input="plane-data.csv",
        output="filtered_plane_data.csv",
        columns=["tailnum"],
    ),
]

# Run filter_columns once for every configured file.
for config in file_configs:
    filter_columns(config["input"], config["output"], config["columns"])

In [None]:
# Paths consumed by process_2008_data: the 2008 flights CSV, the plane-data CSV,
# and the CSV that the helper is asked to write.
input_2008_file, input_plane_file, output_file = (
    "2008.csv",
    "plane-data.csv",
    "processed_2008.csv",
)

# Columns of the original 2008 file, kept for reference:
#   Year, Month, DayofMonth, DayOfWeek, DepTime, CRSDepTime, ArrTime, CRSArrTime,
#   UniqueCarrier, FlightNum, TailNum, ActualElapsedTime, CRSElapsedTime, AirTime,
#   ArrDelay, DepDelay, Origin, Dest, Distance, TaxiIn, TaxiOut, Cancelled,
#   CancellationCode, Diverted, CarrierDelay, WeatherDelay, NASDelay,
#   SecurityDelay, LateAircraftDelay

# Build the processed dataset.
process_2008_data(input_2008_file, input_plane_file, output_file)

In [None]:
# EDA: load the processed dataset, reading the header row and inferring column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("processed_2008.csv")
)

# Column names of the loaded frame. Per the original note these are:
#   Month, DayofMonth, DayOfWeek, DepTime, CRSDepTime, ArrTime, CRSArrTime,
#   UniqueCarrier, TailNum, IssueDate, CRSElapsedTime, ArrDelay, DepDelay,
#   Origin, Dest, Cancelled
cols = df.columns

# Name of the column used as the prediction target.
target = "ArrDelay"

df.printSchema()

In [None]:
# Check for null values: print each column name followed by its number of null rows.
# All counts come from a single aggregation instead of one Spark job per column.
# The loop variable is deliberately not named `col`: that would rebind the `col`
# function imported from pyspark.sql.functions, which later cells call.
null_counts = df.select(
    [count(when(df[name].isNull(), 1)).alias(name) for name in cols]
).first()
for column_name in cols:
    print(column_name, null_counts[column_name])
    

In [None]:
# Prepare the data for model training.

# String-valued columns that are replaced by numeric indices below.
categorical_cols = ["UniqueCarrier", "TailNum", "Origin", "Dest"]

# Feature list: every column except the target, with each categorical column
# swapped for its "<name>_index" counterpart so that only the indexed (numeric)
# version is handed to the models.
features = [
    name + "_index" if name in categorical_cols else name
    for name in df.columns
    if name != "ArrDelay"
]

# Binarise the target: 1 when ArrDelay exceeds 15, 0 otherwise (rows whose
# ArrDelay is null after the integer cast also fall into the 0 class).
# The column is referenced through the DataFrame so this cell does not depend on
# the name `col` still being bound to the pyspark function. The when/otherwise
# expression already yields integers, so no further cast is needed.
arr_delay = df["ArrDelay"].cast(IntegerType())
df = df.withColumn("ArrDelay", when(arr_delay > 15, 1).otherwise(0))

# Fit one StringIndexer per categorical column; each writes "<name>_index".
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
    for column in categorical_cols
]

# Apply all fitted indexers in sequence through a single pipeline.
pipeline = Pipeline(stages=indexers)
df = pipeline.fit(df).transform(df)



In [None]:
# Fit a model with the project helper estimate_lr on the prepared DataFrame,
# using the selected feature columns and the target column name.
# NOTE(review): the original note calls this "linear regression", but the target
# was binarised to 0/1 earlier in the notebook — confirm which model estimate_lr fits.
model = estimate_lr(df, features, target)

In [None]:
# k-means
# NOTE(review): presumably k is the number of clusters expected by estimate_kmeans — confirm.
k = 2
# This reassigns `model`, replacing the value assigned from estimate_lr.
model = estimate_kmeans(df, features, k)