Train the model

In [2]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.sql.functions import array, col, lit

In [3]:
dfDelays = spark.sql("select OriginAirportCode, cast(Month as int) Month, cast(DayofMonth as int) DayofMonth, CRSDepHour, cast(DayOfWeek as int) DayOfWeek, Carrier, DestAirportCode, DepDel15, WindSpeed, SeaLevelPressure, HourlyPrecip from flight_delays_with_weather")
cols = dfDelays.columns

In [4]:
display(dfDelays)

OriginAirportCode,Month,DayofMonth,CRSDepHour,DayOfWeek,Carrier,DestAirportCode,DepDel15,WindSpeed,SeaLevelPressure,HourlyPrecip
SAT,5,1,13,3,MQ,ORD,0,6.0,29.8,0.0
SAT,5,2,13,4,MQ,ORD,0,22.0,30.18,0.0
SAT,5,3,13,5,MQ,ORD,1,14.0,30.18,0.0
SAT,5,4,13,6,MQ,ORD,0,6.0,29.93,0.0
SAT,5,5,13,7,MQ,ORD,0,9.0,30.03,0.0
SAT,5,6,13,1,MQ,ORD,0,7.0,30.01,0.0
SAT,5,7,13,2,MQ,ORD,0,10.0,29.97,0.0
SAT,5,8,13,3,MQ,ORD,1,14.0,29.91,0.0
SAT,5,9,13,4,MQ,ORD,0,7.0,29.85,0.0
SAT,5,10,13,5,MQ,ORD,1,11.0,29.87,0.0


In [5]:
%sql
select OriginAirportCode, DestAirportCode, count(DepDel15)
from flight_delays_with_weather where DepDel15 = 1
group by OriginAirportCode, DestAirportCode
ORDER BY count(DepDel15) desc

OriginAirportCode,DestAirportCode,count(DepDel15)
LAX,SFO,3385
SFO,LAX,2690
ATL,LGA,2193
ORD,LGA,2003
ORD,SFO,1925
LAS,SFO,1706
LGA,ATL,1701
SAN,SFO,1686
ORD,EWR,1683
LAX,LAS,1632


Sampling the data

In [7]:
dfDelays.groupBy("DepDel15").count().show()

In [8]:
fractions = {0: .30, 1: 1.0}
trainingSample = dfDelays.sampleBy("DepDel15", fractions, 36)
trainingSample.groupBy("DepDel15").count().show()

Select an algorithm and transform features

In [10]:
categoricalColumns = ["OriginAirportCode", "Carrier", "DestAirportCode"]
stages = [] # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # Use OneHotEncoderEstimator to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoderEstimator(dropLast=False, inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="DepDel15", outputCol="label")
stages += [label_stringIdx]

In [11]:
# Transform all features into a vector using VectorAssembler
numericCols = ["Month", "DayofMonth", "CRSDepHour", "DayOfWeek", "WindSpeed", "SeaLevelPressure", "HourlyPrecip"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

Create and train the Decision Tree model

In [13]:
### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = trainingSample.randomSplit([0.7, 0.3], seed=100)
# We want to have two copies of the training and testing data, since the pipeline runs transformations and we want to run a couple different iterations
trainingData2 = trainingData
testData2 = testData
print(trainingData.count())
print(testData.count())

In [14]:
from pyspark.ml.classification import DecisionTreeClassifier

# Create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3)
stages += [dt]

# Create a Pipeline.
pipeline = Pipeline(stages=stages)
# Run the feature transformations.
#  - fit() computes feature statistics as needed.
#  - transform() actually transforms the features.
pipelineModel = pipeline.fit(trainingData)
trainingData = pipelineModel.transform(trainingData)
# Keep relevant columns
selectedcols = ["label", "features"] + cols
trainingData = trainingData.select(selectedcols)
display(trainingData)

label,features,OriginAirportCode,Month,DayofMonth,CRSDepHour,DayOfWeek,Carrier,DestAirportCode,DepDel15,WindSpeed,SeaLevelPressure,HourlyPrecip
0.0,"List(0, 157, List(45, 70, 86, 150, 151, 152, 153, 154, 155), List(1.0, 1.0, 1.0, 5.0, 1.0, 6.0, 3.0, 7.0, 29.799999237060547))",ABQ,5,1,6,3,OO,PHX,0,7.0,29.8,0.0
0.0,"List(0, 157, List(45, 70, 104, 150, 151, 152, 153, 154, 155), List(1.0, 1.0, 1.0, 5.0, 1.0, 6.0, 3.0, 7.0, 29.799999237060547))",ABQ,5,1,6,3,OO,SLC,0,7.0,29.8,0.0
0.0,"List(0, 157, List(45, 67, 86, 150, 151, 152, 153, 155), List(1.0, 1.0, 1.0, 5.0, 1.0, 8.0, 3.0, 29.799999237060547))",ABQ,5,1,8,3,US,PHX,0,0.0,29.8,0.0
1.0,"List(0, 157, List(45, 70, 84, 150, 151, 152, 153, 154, 155), List(1.0, 1.0, 1.0, 5.0, 1.0, 9.0, 3.0, 5.0, 29.799999237060547))",ABQ,5,1,9,3,OO,DEN,1,5.0,29.8,0.0
0.0,"List(0, 157, List(45, 70, 104, 150, 151, 152, 153, 154, 155), List(1.0, 1.0, 1.0, 5.0, 2.0, 6.0, 4.0, 23.0, 30.350000381469727))",ABQ,5,2,6,4,OO,SLC,0,23.0,30.35,0.0
0.0,"List(0, 157, List(45, 66, 107, 150, 151, 152, 153, 154, 155), List(1.0, 1.0, 1.0, 5.0, 2.0, 8.0, 4.0, 18.0, 30.40999984741211))",ABQ,5,2,8,4,UA,IAD,0,18.0,30.41,0.0
1.0,"List(0, 157, List(45, 70, 91, 150, 151, 152, 153, 154, 155), List(1.0, 1.0, 1.0, 5.0, 2.0, 16.0, 4.0, 3.0, 30.309999465942383))",ABQ,5,2,16,4,OO,IAH,1,3.0,30.31,0.0
0.0,"List(0, 157, List(45, 67, 86, 150, 151, 152, 153, 155), List(1.0, 1.0, 1.0, 5.0, 3.0, 8.0, 5.0, 30.309999465942383))",ABQ,5,3,8,5,US,PHX,0,0.0,30.31,0.0
1.0,"List(0, 157, List(45, 70, 81, 150, 151, 152, 153, 154, 155), List(1.0, 1.0, 1.0, 5.0, 3.0, 12.0, 5.0, 10.0, 30.1299991607666))",ABQ,5,3,12,5,OO,ORD,1,10.0,30.13,0.0
0.0,"List(0, 157, List(45, 70, 86, 150, 151, 152, 153, 154, 155), List(1.0, 1.0, 1.0, 5.0, 3.0, 12.0, 5.0, 10.0, 30.1299991607666))",ABQ,5,3,12,5,OO,PHX,0,10.0,30.13,0.0


In [15]:
# Make predictions on test data using the Transformer.transform() method.
predictions = pipelineModel.transform(testData)

In [16]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluate model
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

In [17]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1, 2, 6, 10])
             .addGrid(dt.maxBins, [20, 40, 80])
             .build())

In [18]:
# Create 3-fold CrossValidator
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)

# Run cross validations (this can take several minutes to execute)
cvModel = cv.fit(trainingData2)

In [19]:
predictions = cvModel.transform(testData2)

In [20]:
evaluator.evaluate(predictions)

In [21]:
selected = predictions.select("label", "prediction", "probability", "OriginAirportCode", "DestAirportCode")
display(selected)

label,prediction,probability,OriginAirportCode,DestAirportCode
0.0,1.0,"List(1, 2, List(), List(0.43960711204899156, 0.5603928879510084))",ABQ,ORD
0.0,0.0,"List(1, 2, List(), List(0.6991391413245519, 0.3008608586754481))",ABQ,PHX
0.0,0.0,"List(1, 2, List(), List(0.6991391413245519, 0.3008608586754481))",ABQ,LAX
0.0,1.0,"List(1, 2, List(), List(0.43960711204899156, 0.5603928879510084))",ABQ,SLC
1.0,1.0,"List(1, 2, List(), List(0.43960711204899156, 0.5603928879510084))",ABQ,IAH
0.0,1.0,"List(1, 2, List(), List(0.43960711204899156, 0.5603928879510084))",ABQ,SFO
0.0,0.0,"List(1, 2, List(), List(0.6991391413245519, 0.3008608586754481))",ABQ,SLC
1.0,1.0,"List(1, 2, List(), List(0.43960711204899156, 0.5603928879510084))",ABQ,SFO
1.0,0.0,"List(1, 2, List(), List(0.6991391413245519, 0.3008608586754481))",ABQ,IAD
0.0,0.0,"List(1, 2, List(), List(0.6991391413245519, 0.3008608586754481))",ABQ,LAX


In [22]:
bestModel = cvModel.bestModel
finalPredictions = bestModel.transform(dfDelays)
evaluator.evaluate(finalPredictions)

Save the model

In [24]:
# Save the best model under /dbfs/flightDelayModel
bestModel.write().overwrite().save("/flightDelayModel")