In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import when

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [5]:
# Create the Spark session used by every later cell, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("FlightDelayPrediction")
    .getOrCreate()
)

In [7]:
# Read the flight CSV: the first row holds the column names, and column types
# are inferred from the data rather than defaulting to strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("PATH_TO_FLIGHT_DATASET")
)
df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- FlightDate: date (nullable = true)
 |-- Marketing_Airline_Network: string (nullable = true)
 |-- Operated_or_Branded_Code_Share_Partners: string (nullable = true)
 |-- DOT_ID_Marketing_Airline: integer (nullable = true)
 |-- IATA_Code_Marketing_Airline: string (nullable = true)
 |-- Flight_Number_Marketing_Airline: integer (nullable = true)
 |-- Originally_Scheduled_Code_Share_Airline: string (nullable = true)
 |-- DOT_ID_Originally_Scheduled_Code_Share_Airline: double (nullable = true)
 |-- IATA_Code_Originally_Scheduled_Code_Share_Airline: string (nullable = true)
 |-- Flight_Num_Originally_Scheduled_Code_Share_Airline: double (nullable = true)
 |-- Operating_Airline : string (nullable = true)
 |-- DOT_ID_Operating_Airline: integer (nullable = true)
 |-- IATA_Code_Operating_Air

In [8]:
# Columns needed for the prediction.
# NOTE: the airline column is named "Operating_Airline " WITH a trailing space
# in the loaded schema, so the space here is intentional; it is renamed below.
requiredColumns = [
    "DayofMonth", "Month", "DayOfWeek", # Flight date details
    "Operating_Airline ", # Airline details
    "Origin", "Dest", # Airport details
    "CRSArrTime", "ArrDel15", "CRSDepTime", "DepDel15", # Time details
]

# Drop rows with a missing value in any required column, keep only those
# columns, and give the awkward source names readable ones.
df = (
    df.dropna(subset=requiredColumns)
    .select(*requiredColumns)
    .withColumnRenamed("Operating_Airline ", "OperatingAirline")
    .withColumnRenamed("CRSDepTime", "ScheduledDepTime")
    .withColumnRenamed("CRSArrTime", "ScheduledArrTime")
)

# Label: 1.0 when the flight is flagged as delayed on arrival or departure,
# otherwise 0.0. ArrDel15 / DepDel15 are double columns, so they are compared
# against the numeric 1.0 instead of the string "1.0" (which only worked via
# an implicit string-to-double cast).
delayed_on_arrival = df["ArrDel15"] == 1.0
delayed_on_departure = df["DepDel15"] == 1.0
df = df.withColumn(
    "IsDelayed",
    when(delayed_on_arrival | delayed_on_departure, 1.0).otherwise(0.0),
)


In [10]:
# Map each categorical string column to a numeric index column named
# "<column>Index". Every fitted indexer is kept in `indexers`.
categoricalCols = ["OperatingAirline", "Origin", "Dest"]
indexers = []
for categoricalCol in categoricalCols:
    indexer = StringIndexer(
        inputCol=categoricalCol, outputCol=categoricalCol + 'Index'
    ).fit(df)
    indexers.append(indexer)
    df = indexer.transform(df)

# Feature columns are exactly the index columns produced above:
# OperatingAirlineIndex, OriginIndex, DestIndex.
# NOTE(review): these indices are fed to the model as plain numbers, which
# imposes an ordering on the categories - confirm this is intended, or
# consider one-hot encoding them.
featureCols = [categoricalCol + 'Index' for categoricalCol in categoricalCols]

# Pack the feature columns into the single vector column the estimator reads.
assembler = VectorAssembler(inputCols=featureCols, outputCol="features2")
df = assembler.transform(df)

df.printSchema()

root
 |-- DayofMonth: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- OperatingAirline: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- ScheduledArrTime: integer (nullable = true)
 |-- ArrDel15: double (nullable = true)
 |-- ScheduledDepTime: integer (nullable = true)
 |-- DepDel15: double (nullable = true)
 |-- IsDelayed: double (nullable = false)
 |-- OperatingAirlineIndex: double (nullable = false)
 |-- OriginIndex: double (nullable = false)
 |-- DestIndex: double (nullable = false)
 |-- features2: vector (nullable = true)



In [11]:
# Keep only what the model needs: the feature vector and the label.
# The label column is created as "IsDelayed"; it is selected by its real name
# and explicitly aliased to "isDelayed" (the name the later cells use) so this
# no longer depends on Spark's case-insensitive column resolution.
model_df = df.select("features2", df["IsDelayed"].alias("isDelayed"))
model_df.show(truncate=False)

+-------------+---------+
|features2    |isDelayed|
+-------------+---------+
|[7.0,6.0,4.0]|1.0      |
|[7.0,6.0,4.0]|0.0      |
|[7.0,6.0,4.0]|0.0      |
|[7.0,6.0,4.0]|0.0      |
|[7.0,6.0,4.0]|1.0      |
|[7.0,6.0,4.0]|0.0      |
|[7.0,6.0,4.0]|0.0      |
|[7.0,6.0,4.0]|0.0      |
|[7.0,6.0,4.0]|1.0      |
|[7.0,6.0,4.0]|1.0      |
|[7.0,6.0,4.0]|1.0      |
|[7.0,6.0,4.0]|1.0      |
|[7.0,6.0,4.0]|1.0      |
|[7.0,6.0,4.0]|1.0      |
|[7.0,6.0,4.0]|1.0      |
|[7.0,6.0,4.0]|1.0      |
|[7.0,6.0,4.0]|1.0      |
|[7.0,6.0,4.0]|0.0      |
|[7.0,6.0,4.0]|0.0      |
|[7.0,6.0,4.0]|1.0      |
+-------------+---------+
only showing top 20 rows



In [12]:
# Split the dataset into training (70%) and testing (30%) sets.
# A fixed seed makes the split - and therefore every metric computed from it -
# reproducible across kernel restarts.
training_df, testing_df = model_df.randomSplit([0.7, 0.3], seed=42)

print(training_df.count())
print(testing_df.count())

5017743
2149301


In [14]:
# Fit a linear regression of the `isDelayed` label on the `features2` vector.
# NOTE(review): `isDelayed` is a 0.0/1.0 flag, so a classifier may suit this
# target better than a regressor - confirm the modelling intent.
lr = LinearRegression(labelCol='isDelayed', featuresCol='features2')
lr_model = lr.fit(training_df)

# Score both splits with the fitted model.
train_predictions, test_predictions = (
    lr_model.transform(split) for split in (training_df, testing_df)
)

# R^2 on each split; similar train and test values suggest no overfitting.
evaluator = RegressionEvaluator(
    metricName="r2", labelCol="isDelayed", predictionCol="prediction")
train_r2, test_r2 = (
    evaluator.evaluate(predictions)
    for predictions in (train_predictions, test_predictions)
)

print("Training R^2 Score:", train_r2)
print("Testing R^2 Score:", test_r2)

Training R^2 Score: 0.00293397229804071
Testing R^2 Score: 0.002859896629911929


In [16]:
# Fit a linear regression on the training split; `regressor` holds the fitted
# model (the unfitted estimator is never bound to a name).
regressor = LinearRegression(
    labelCol='isDelayed',
    featuresCol='features2',
).fit(training_df)


In [17]:
# Evaluate the fitted model on the held-out split and preview the first 30
# rows of features, label and prediction.
pred_results = regressor.evaluate(testing_df)
pred_results.predictions.show(n=30)

+-------------+---------+------------------+
|    features2|isDelayed|        prediction|
+-------------+---------+------------------+
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,1.0]|      0.0|0.2691172897964861|
|[1.0,0.0,

In [18]:
# Test-set summary as a tuple: (MAE, MSE, R^2).
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
    pred_results.r2,
)

(0.36882295429174566, 0.18435445143862492, 0.002859896629911929)