In [1]:
import pyspark
import pandas as pd
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import split, regexp_extract, to_date,when, col, lower

from datetime import datetime
import re

In [2]:
spark = SparkSession.builder.appName('bovo').getOrCreate()
spark

In [3]:

schema = StructType([
    StructField("PredictedArrival", StringType(), False),
    StructField("Flight_ID", StringType(), False),
    StructField("Destination", StringType(), False),
    StructField("Airline", StringType(), False),
    StructField("Aircraft_ID", StringType(), False),
    StructField("NUll", StringType(), False),
    StructField("TimeOfArrival", StringType(), False),
    StructField("Airport", StringType(), False),
    StructField("Date", StringType(), False),
    StructField("TimeDeff", StringType(), False)
    
])

In [4]:
df_pyspark=spark.read.option('header','false').csv(r'C:\Users\httyd\Desktop\capstone\airports\Data\2024-03-21\Arrivals*.csv',schema=schema)

In [5]:
df_pyspark=df_pyspark.na.drop(how="any", thresh=5)
df_pyspark = df_pyspark.drop('NULL')

In [6]:
df_pyspark.show(100)

+----------------+---------+--------------------+--------------------+-------------+-------------+--------------------+--------------------+--------+
|PredictedArrival|Flight_ID|         Destination|             Airline|  Aircraft_ID|TimeOfArrival|             Airport|                Date|TimeDeff|
+----------------+---------+--------------------+--------------------+-------------+-------------+--------------------+--------------------+--------+
|           17:06|   LXJ338| Grand Rapids (GRR)-|           Flexjet -|E545 (N338FX)| Landed 17:08|Akron Canton Airp...|Tuesday, Mar 19 2024| 4:00:00|
|           17:07|   UA5358|      Chicago (ORD)-|    United Express -|CRJ2 (N932EV)| Landed 18:22|Akron Canton Airp...|Tuesday, Mar 19 2024| 4:00:00|
|           17:18|   LXJ439|   Washington (IAD)-|           Flexjet -|E545 (N439FX)|      Unknown|Akron Canton Airp...|Tuesday, Mar 19 2024| 4:00:00|
|           19:48|    MX207|    Las Vegas (LAS)-|    Breeze Airways -|BCS3 (N215BZ)| Landed 19:22|Ak

In [7]:
df_pyspark = df_pyspark.withColumn("Day", split(df_pyspark["Date"], ",").getItem(0)) \
                       .withColumn("NumericalDate", split(df_pyspark["Date"], ",").getItem(1))
df_pyspark = df_pyspark.drop("Date")
# Extract AirportName and AirportCode using regexp_extract
df_pyspark = df_pyspark.withColumn("AirportName", regexp_extract(df_pyspark["Airport"], r'^(.*?)\(', 1)) \
                       .withColumn("Airport_Code", regexp_extract(df_pyspark["Airport"], r'\((.*?)\)', 1))
df_pyspark = df_pyspark.drop("Airport")
# Extract AirportName and AirportCode using regexp_extract
df_pyspark = df_pyspark.withColumn("DestinationName", regexp_extract(df_pyspark["Destination"], r'^(.*?)\(', 1)) \
                       .withColumn("Destination_Code", regexp_extract(df_pyspark["Destination"], r'\((.*?)\)', 1))
df_pyspark = df_pyspark.drop("Destination")
# Extract AirportName and AirportCode using regexp_extract
df_pyspark = df_pyspark.withColumn("Aircraft_Type", regexp_extract(df_pyspark["Aircraft_ID"], r'^(.*?)\(', 1)) \
                       .withColumn("Aircraft_Code", regexp_extract(df_pyspark["Aircraft_ID"], r'\((.*?)\)', 1))
df_pyspark = df_pyspark.drop("Aircraft_ID")

# Split TimeOfDeparture based on whether it contains "Departed", "Estimated", or "Canceled"
df_pyspark = df_pyspark.withColumn("ArrivalStatus", 
                                   when(col("TimeOfArrival").contains("Landed"), "Landed")
                                   .when(col("TimeOfArrival").contains("Canceled"), "Canceled")
                                   .otherwise("Unknown"))

# Split TimeOfDeparture into two columns based on the DepartureStatus
df_pyspark = df_pyspark.withColumn("ActualArrivalTime", when(col("ArrivalStatus") == "Landed",
                                                               split(col("TimeOfArrival"), " ")[1])
                                   .otherwise(None))

df_pyspark = df_pyspark.drop("TimeOfArrival")

df_pyspark = df_pyspark.withColumn("Airline", split(df_pyspark["Airline"], " -").getItem(0))



In [8]:
df_pyspark = df_pyspark.dropDuplicates()
row_count = df_pyspark.count()
print("Number of rows in DataFrame Dep:", row_count)

Number of rows in DataFrame: 173


In [9]:
# Assuming df_pyspark is your DataFrame containing the sample data
df_pyspark = df_pyspark.withColumn("NumericalDate", to_date("NumericalDate", " MMM dd yyyy"))

# Show the transformed DataFrame
df_pyspark.show()

+----------------+---------+--------------------+--------+---------+-------------+--------------------+------------+---------------+----------------+-------------+-------------+-------------+-----------------+
|PredictedArrival|Flight_ID|             Airline|TimeDeff|      Day|NumericalDate|         AirportName|Airport_Code|DestinationName|Destination_Code|Aircraft_Type|Aircraft_Code|ArrivalStatus|ActualArrivalTime|
+----------------+---------+--------------------+--------+---------+-------------+--------------------+------------+---------------+----------------+-------------+-------------+-------------+-----------------+
|           21:21|   EJA767|             NetJets| 4:00:00|Wednesday|   2024-03-20|Akron Canton Airport|    CAK/KCAK|     Morganton |             MRN|        CL35 |       N767QS|       Landed|            20:38|
|           13:18|   AA5218|      American Eagle| 4:00:00|Wednesday|   2024-03-20|Akron Canton Airport|    CAK/KCAK|     Charlotte |             CLT|        CRJ

In [10]:
df_pyspark = df_pyspark.withColumn("Flight_ID", lower(df_pyspark["Flight_ID"]))
df_pyspark = df_pyspark.withColumn("Day", lower(df_pyspark["Day"]))
df_pyspark = df_pyspark.withColumn("Airline", lower(df_pyspark["Airline"]))
df_pyspark = df_pyspark.withColumn("AirportName", lower(df_pyspark["AirportName"]))
df_pyspark = df_pyspark.withColumn("Airport_Code", lower(df_pyspark["Airport_Code"]))
df_pyspark = df_pyspark.withColumn("DestinationName", lower(df_pyspark["DestinationName"]))
df_pyspark = df_pyspark.withColumn("Destination_Code", lower(df_pyspark["Destination_Code"]))
df_pyspark = df_pyspark.withColumn("Aircraft_Type", lower(df_pyspark["Aircraft_Type"]))
df_pyspark = df_pyspark.withColumn("Aircraft_Code", lower(df_pyspark["Aircraft_Code"]))
df_pyspark = df_pyspark.withColumn("ArrivalStatus", lower(df_pyspark["ArrivalStatus"]))

In [11]:
df_pyspark.show(100)

+----------------+---------+--------------------+---------------+---------+-------------+--------------------+------------+--------------------+----------------+-------------+-------------+-------------+-----------------+
|PredictedArrival|Flight_ID|             Airline|       TimeDeff|      Day|NumericalDate|         AirportName|Airport_Code|     DestinationName|Destination_Code|Aircraft_Type|Aircraft_Code|ArrivalStatus|ActualArrivalTime|
+----------------+---------+--------------------+---------------+---------+-------------+--------------------+------------+--------------------+----------------+-------------+-------------+-------------+-----------------+
|           21:21|   eja767|             netjets|        4:00:00|wednesday|   2024-03-20|akron canton airport|    cak/kcak|          morganton |             mrn|        cl35 |       n767qs|       landed|            20:38|
|           13:18|   aa5218|      american eagle|        4:00:00|wednesday|   2024-03-20|akron canton airport|  