In [1]:
import pyspark
import pandas as pd
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import split, regexp_extract, to_date,when, col, lower

from datetime import datetime
import re

In [2]:
spark = SparkSession.builder.appName('bovo').getOrCreate()
spark

In [3]:

schema = StructType([
    StructField("PredictedDeparture", StringType(), False),
    StructField("Flight_ID", StringType(), False),
    StructField("Destination", StringType(), False),
    StructField("Airline", StringType(), False),
    StructField("Aircraft_ID", StringType(), False),
    StructField("NUll", StringType(), False),
    StructField("TimeOfDeparture", StringType(), False),
    StructField("Airport", StringType(), False),
    StructField("Date", StringType(), False)
    
    
])

In [4]:
df_pyspark=spark.read.option('header','false').csv(r'C:\Users\httyd\Desktop\capstone\airports\Data\2024-03-14\Departures*.csv',schema=schema)

In [5]:
df_pyspark=df_pyspark.na.drop(how="any", thresh=5)
df_pyspark = df_pyspark.drop('NULL')

In [6]:
df_pyspark.show(100)

+------------------+---------+--------------------+--------------------+--------------+---------------+--------------------+--------------------+
|PredictedDeparture|Flight_ID|         Destination|             Airline|   Aircraft_ID|TimeOfDeparture|             Airport|                Date|
+------------------+---------+--------------------+--------------------+--------------+---------------+--------------------+--------------------+
|             15:25|   DL1614|        Tampa (TPA)-|   Delta Air Lines -| B752 (N682DA)| Departed 15:49|Atlanta Hartsfiel...|Wednesday, Mar 13...|
|             15:25|   DL2392|  Gainesville (GNV)-|   Delta Air Lines -| B712 (N929AT)| Departed 15:37|Atlanta Hartsfiel...|Wednesday, Mar 13...|
|             15:25|   WN3236|   Washington (DCA)-|Southwest Airlines -| B737 (N489WN)| Departed 15:42|Atlanta Hartsfiel...|Wednesday, Mar 13...|
|             15:26|   AA4378|   Washington (DCA)-|    American Eagle -| E75S (N115HQ)| Departed 15:35|Atlanta Hartsfiel...|

In [7]:
df_pyspark = df_pyspark.withColumn("Day", split(df_pyspark["Date"], ",").getItem(0)) \
                       .withColumn("NumericalDate", split(df_pyspark["Date"], ",").getItem(1))
df_pyspark = df_pyspark.drop("Date")
# Extract AirportName and AirportCode using regexp_extract
df_pyspark = df_pyspark.withColumn("AirportName", regexp_extract(df_pyspark["Airport"], r'^(.*?)\(', 1)) \
                       .withColumn("Airport_Code", regexp_extract(df_pyspark["Airport"], r'\((.*?)\)', 1))
df_pyspark = df_pyspark.drop("Airport")
# Extract AirportName and AirportCode using regexp_extract
df_pyspark = df_pyspark.withColumn("DestinationName", regexp_extract(df_pyspark["Destination"], r'^(.*?)\(', 1)) \
                       .withColumn("Destination_Code", regexp_extract(df_pyspark["Destination"], r'\((.*?)\)', 1))
df_pyspark = df_pyspark.drop("Destination")
# Extract AirportName and AirportCode using regexp_extract
df_pyspark = df_pyspark.withColumn("Aircraft_Type", regexp_extract(df_pyspark["Aircraft_ID"], r'^(.*?)\(', 1)) \
                       .withColumn("Aircraft_Code", regexp_extract(df_pyspark["Aircraft_ID"], r'\((.*?)\)', 1))
df_pyspark = df_pyspark.drop("Aircraft_ID")

# Split TimeOfDeparture based on whether it contains "Departed", "Estimated", or "Canceled"
df_pyspark = df_pyspark.withColumn("DepartureStatus", 
                                   when(col("TimeOfDeparture").contains("Departed"), "Departed")
                                   .when(col("TimeOfDeparture").contains("Estimated"), "Estimated")
                                   .when(col("TimeOfDeparture").contains("Canceled"), "Canceled")
                                   .otherwise("Unknown"))

# Split TimeOfDeparture into two columns based on the DepartureStatus
df_pyspark = df_pyspark.withColumn("ActualDepartureTime", when(col("DepartureStatus") == "Departed",
                                                               split(col("TimeOfDeparture"), " ")[1])
                                   .otherwise(None))

df_pyspark = df_pyspark.withColumn("EstimatedDepartureTime", when(col("DepartureStatus") == "Estimated",
                                                                  split(col("TimeOfDeparture"), "  ")[1])
                                   .otherwise(None))

df_pyspark = df_pyspark.drop("TimeOfDeparture")

df_pyspark = df_pyspark.withColumn("Airline", split(df_pyspark["Airline"], " -").getItem(0))



In [8]:
df_pyspark = df_pyspark.dropDuplicates()


In [9]:
row_count = df_pyspark.count()
print("Number of rows in DataFrame:", row_count)



Number of rows in DataFrame: 8772


In [10]:
from pyspark.sql.functions import to_date

# Assuming df_pyspark is your DataFrame containing the sample data
df_pyspark = df_pyspark.withColumn("NumericalDate", to_date("NumericalDate", " MMM dd yyyy"))

# Show the transformed DataFrame
df_pyspark.show()


+------------------+---------+--------------------+---------+-------------+--------------------+------------+---------------+----------------+-------------+-------------+---------------+-------------------+----------------------+
|PredictedDeparture|Flight_ID|             Airline|      Day|NumericalDate|         AirportName|Airport_Code|DestinationName|Destination_Code|Aircraft_Type|Aircraft_Code|DepartureStatus|ActualDepartureTime|EstimatedDepartureTime|
+------------------+---------+--------------------+---------+-------------+--------------------+------------+---------------+----------------+-------------+-------------+---------------+-------------------+----------------------+
|             17:02|   DL1629|   Delta Air Lines -|Wednesday|   2024-03-13|Atlanta Hartsfiel...|    ATL/KATL|     Knoxville |             TYS|        A319 |       N319NB|       Departed|              17:09|                  NULL|
|             23:35|     DL27|   Delta Air Lines -|Wednesday|   2024-03-13|Atlan

In [11]:
df_pyspark.show(500)

+------------------+---------+--------------------+---------+-------------+--------------------+------------+--------------------+----------------+-------------+-------------+---------------+-------------------+----------------------+
|PredictedDeparture|Flight_ID|             Airline|      Day|NumericalDate|         AirportName|Airport_Code|     DestinationName|Destination_Code|Aircraft_Type|Aircraft_Code|DepartureStatus|ActualDepartureTime|EstimatedDepartureTime|
+------------------+---------+--------------------+---------+-------------+--------------------+------------+--------------------+----------------+-------------+-------------+---------------+-------------------+----------------------+
|             17:02|   DL1629|   Delta Air Lines -|Wednesday|   2024-03-13|Atlanta Hartsfiel...|    ATL/KATL|          Knoxville |             TYS|        A319 |       N319NB|       Departed|              17:09|                  NULL|
|             23:35|     DL27|   Delta Air Lines -|Wednesday

In [12]:
df_pyspark = df_pyspark.withColumn("Flight_ID", lower(df_pyspark["Flight_ID"]))
df_pyspark = df_pyspark.withColumn("Day", lower(df_pyspark["Day"]))
df_pyspark = df_pyspark.withColumn("Airline", lower(df_pyspark["Airline"]))
df_pyspark = df_pyspark.withColumn("AirportName", lower(df_pyspark["AirportName"]))
df_pyspark = df_pyspark.withColumn("Airport_Code", lower(df_pyspark["Airport_Code"]))
df_pyspark = df_pyspark.withColumn("DestinationName", lower(df_pyspark["DestinationName"]))
df_pyspark = df_pyspark.withColumn("Destination_Code", lower(df_pyspark["Destination_Code"]))
df_pyspark = df_pyspark.withColumn("Aircraft_Type", lower(df_pyspark["Aircraft_Type"]))
df_pyspark = df_pyspark.withColumn("Aircraft_Code", lower(df_pyspark["Aircraft_Code"]))
df_pyspark = df_pyspark.withColumn("DepartureStatus", lower(df_pyspark["DepartureStatus"]))

In [14]:
df_pyspark.show(100)

+------------------+---------+--------------------+---------+-------------+--------------------+------------+-----------------+----------------+-------------+-------------+---------------+-------------------+----------------------+
|PredictedDeparture|Flight_ID|             Airline|      Day|NumericalDate|         AirportName|Airport_Code|  DestinationName|Destination_Code|Aircraft_Type|Aircraft_Code|DepartureStatus|ActualDepartureTime|EstimatedDepartureTime|
+------------------+---------+--------------------+---------+-------------+--------------------+------------+-----------------+----------------+-------------+-------------+---------------+-------------------+----------------------+
|             17:02|   dl1629|     delta air lines|wednesday|   2024-03-13|atlanta hartsfiel...|    atl/katl|       knoxville |             tys|        a319 |       n319nb|       departed|              17:09|                  NULL|
|             23:35|     dl27|     delta air lines|wednesday|   2024-03-