In [0]:
pip install kagglehub

In [0]:
%restart_python
# Restart the notebook's Python process after the install cell above so that
# the freshly installed package can be imported by the cells that follow.

In [0]:
# First we import the libraries
from pyspark.sql.functions import col, when
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
import mlflow
import mlflow.spark
import kagglehub

In [0]:
# Download the Kaggle dataset; `path` holds the location returned by kagglehub.
path = kagglehub.dataset_download("giovamata/airlinedelaycauses")

# Read from the downloaded location (the variable, not the literal string "path").
# header=True takes column names from the first row, which the later cells need
# because they reference columns by name; inferSchema=True gives numeric columns
# numeric types instead of strings.
# NOTE(review): if kagglehub returns a driver-local path, Spark may need it
# prefixed with "file:" on this cluster — confirm in the target environment.
df = spark.read.csv(path, header=True, inferSchema=True)

In [0]:
# Print the column names and types, then preview the first 20 rows
# (20 rows with truncated cell values are the defaults of show()).
df.printSchema()
df.show(n=20, truncate=True)

In [0]:
# Binary label: 1 when the flight arrived more than DELAY_THRESHOLD_MINUTES late,
# 0 when it arrived within the threshold.
# `col` and `when` are already imported in the imports cell at the top.
DELAY_THRESHOLD_MINUTES = 15

arr_delay = col("ArrDelayMinutes")

# No otherwise() on purpose: rows whose arrival delay is null match neither
# branch and therefore get a null label, so they can be dropped later instead
# of being silently counted as "not delayed".
df = df.withColumn(
    "Is_Delayed",
    when(arr_delay > DELAY_THRESHOLD_MINUTES, 1).when(arr_delay.isNotNull(), 0),
)
# NOTE(review): confirm "ArrDelayMinutes" matches the schema printed above.
df.select("ArrDelayMinutes", "Is_Delayed").show(5)

In [0]:
# Keep only the model inputs plus the label, and discard every row that has a
# missing value in any of those columns.
model_columns = ["Origin", "Dest", "Carrier", "Distance", "Is_Delayed"]
features = df.select(*model_columns).dropna()

In [0]:
# Encode each categorical string column as a numeric index column.
# handleInvalid="keep" assigns labels that were not present when the indexer
# was fitted to an extra index instead of raising at transform time, so scoring
# the test split does not fail on a carrier/airport absent from the train split.
indexer1 = StringIndexer(inputCol="Carrier", outputCol="Carrier_Index", handleInvalid="keep")
indexer2 = StringIndexer(inputCol="Origin", outputCol="Origin_Index", handleInvalid="keep")
indexer3 = StringIndexer(inputCol="Dest", outputCol="Dest_Index", handleInvalid="keep")

In [0]:
# Combine the indexed categoricals and the numeric distance into one vector column.
# The input names must match the indexers' outputCol values exactly
# ("Origin_Index" was previously misspelled as "Orgin_Index").
assembler = VectorAssembler(
    inputCols=["Carrier_Index", "Origin_Index", "Dest_Index", "Distance"],
    outputCol="features",
)

In [0]:
# Tree learners treat the indexed string columns as categorical features and
# require maxBins to be at least the number of categories of any such feature.
# The default (32) is too small when a column has more distinct values, so
# derive the bound from the data.
categorical_cols = ["Carrier", "Origin", "Dest"]
max_categories = max(features.select(c).distinct().count() for c in categorical_cols)

rf = RandomForestClassifier(
    labelCol="Is_Delayed",
    featuresCol="features",
    # +1 leaves room for an additional "unseen label" index if the indexers
    # are configured to keep invalid values; never go below the default of 32.
    maxBins=max(32, max_categories + 1),
    # Fixed seed so repeated runs build the same forest.
    seed=42,
)

In [0]:
# Stages run in list order: index the categorical columns, assemble the
# feature vector, then fit the classifier on it.
pipeline_stages = [indexer1, indexer2, indexer3, assembler, rf]
pipeline = Pipeline(stages=pipeline_stages)

In [0]:
# Random 80/20 split into training and test sets, with a fixed seed.
train, test = features.randomSplit(weights=[0.8, 0.2], seed=42)

In [0]:
with mlflow.start_run():
    # Fit all pipeline stages on the training split, then score the test split.
    model = pipeline.fit(train)
    predictions = model.transform(test)

    # Area under the ROC curve (the evaluator's default metric, spelled out here).
    evaluator = BinaryClassificationEvaluator(
        labelCol="Is_Delayed",
        metricName="areaUnderROC",
    )
    auc = evaluator.evaluate(predictions)

    # Record the metric and the fitted pipeline model on the active MLflow run.
    mlflow.log_metric("AUC", auc)
    mlflow.spark.log_model(model, "flight_rf_model")

    print(f"AUC: {auc}")

In [0]:
# Compare the true label with the model output. The classifier writes its
# predicted label to "prediction" (its default predictionCol); "predictions"
# is not a column. show() is needed to actually print rows.
predictions.select(
    "Carrier", "Origin", "Dest", "Distance", "Is_Delayed", "prediction"
).show(5)