## About Data
- **Name** : Flight Status Prediction
- **Source** : Kaggle (https://www.kaggle.com/datasets/robikscube/flight-delay-dataset-20182022)

##### Description
- This dataset contains all flight information, including cancellations and delays by airline, for dates from January 2018 through 2022. For your convenience, you can use the Combined_Flights_XXXX.csv or Combined_Flights_XXXX.parquet files to access the combined data for an entire year.

##### Columns
- The dataset has a total of 61 columns and over 25M records.




### Importing Required Libraries

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType
from pyspark.sql.functions import *

### Creating SparkSession with appName "Data603_FinalProject"

In [None]:
# Build the Spark session for this project, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Data603_FinalProject')
    .getOrCreate()
)

### Importing Data from DBFS (we use the .parquet files because of the size of the data)

In [None]:
# Read every Parquet file matching the glob below into a single DataFrame.
df = spark.read.parquet(
    '/FileStore/tables/*.parquet',
)

### Total Number of Records

In [None]:
# Total number of records in the loaded DataFrame.
df.count()

### DataFrame Overview

In [None]:
# Show the DataFrame contents; `display` is presumably supplied by the notebook
# environment, since this file neither defines nor imports it.
display(df)

### Schema of the DataFrame

In [None]:
# Print the DataFrame's schema.
df.printSchema()

# Data Cleaning

#### Finding Null Values

In [None]:
def findMissing(dataframe=None):
    """Print the null count of every column, followed by an overall summary.

    Args:
        dataframe: DataFrame to inspect. When omitted, the notebook-level
            ``df`` as bound at call time is used, so ``findMissing()`` keeps
            working unchanged.

    The row count and all per-column null counts are gathered in a single
    aggregation instead of one ``count()`` job per column. The reported
    percentage is the share of null cells among all cells (rows x columns),
    so it cannot exceed 100.
    """
    from pyspark.sql import functions as F

    target = df if dataframe is None else dataframe
    columns = target.columns

    # The first aggregate is the row count. Each remaining aggregate counts the
    # rows whose value is null in one column: when() yields null for rows that
    # do not match, and count() ignores nulls.
    stats = target.agg(
        F.count(F.lit(1)),
        *[F.count(F.when(F.col(name).isNull(), 1)) for name in columns],
    ).first()
    row_count, null_counts = stats[0], stats[1:]

    total_count = 0
    for column, null_count in zip(columns, null_counts):
        total_count += null_count
        print(f"Number of null or NA values in column {column}: {null_count}")

    total_cells = row_count * len(columns)
    # An empty DataFrame has no cells; report 0 instead of dividing by zero.
    percentage = total_count * 100 / total_cells if total_cells else 0.0
    print(f'Total Null Values: {total_count}, Null Values Percentage: {percentage}')

#### Count of Null Values by Column

In [None]:
# Report the null counts of the data as loaded, before any cleaning step.
findMissing()

###### Filling null values of DepDelay and DepDelayMinutes with 0 when no delay was recorded for a flight that did depart.
###### Note: Only rows where DepTime is not null are filled here, because a cancelled flight may not have a DepTime at all.

In [None]:
# Rows that have a departure time but no recorded delay are treated as on-time
# departures, so both delay columns are set to 0 for them.
# Both replacements happen in one select so the condition is evaluated against
# the original values. Filling the columns one after the other does not work:
# once the first column has been set to 0, the null checks in the condition no
# longer match and the second column is left untouched.
_no_delay_recorded = (
    col("DepDelay").isNull()
    & col("DepDelayMinutes").isNull()
    & col("DepTime").isNotNull()
)
df = df.select(
    *[
        when(_no_delay_recorded, 0).otherwise(col(name)).alias(name)
        if name in ("DepDelay", "DepDelayMinutes")
        else col(name)
        for name in df.columns
    ]
)

#### Null value count after replacing with 0

In [None]:
# Count the rows that still have a null in each of the two delay columns.
null_count_depdelay_minutes = df.filter(col("DepDelayMinutes").isNull()).count()
null_count_depdelay = df.filter(col("DepDelay").isNull()).count()

print(f"Count of null values in 'DepDelayMinutes': {null_count_depdelay_minutes}")
print(f"Count of null values in 'DepDelay': {null_count_depdelay}")


#### Filling null values where DepTime is null, which means the flight was cancelled

In [None]:
# Rows without a DepTime get 0 in DepDelayMinutes, DepDelay and DepTime.
# The three steps are order-dependent: each condition relies on the value
# written by the step before it.
_dep_time_missing = col("DepTime").isNull()
_delay_missing = col("DepDelay").isNull()
_minutes_is_zero = col("DepDelayMinutes") == 0

df = df.withColumn(
    "DepDelayMinutes",
    when(_delay_missing & col("DepDelayMinutes").isNull() & _dep_time_missing, 0)
    .otherwise(col("DepDelayMinutes")),
)

df = df.withColumn(
    "DepDelay",
    when(_delay_missing & _minutes_is_zero & _dep_time_missing, 0)
    .otherwise(col("DepDelay")),
)

df = df.withColumn(
    "DepTime",
    when((col("DepDelay") == 0) & _minutes_is_zero & _dep_time_missing, 0)
    .otherwise(col("DepTime")),
)


# Count the rows that still have a null in each of the three departure columns.
null_count_depdelay_minutes = df.filter(col("DepDelayMinutes").isNull()).count()
null_count_depdelay = df.filter(col("DepDelay").isNull()).count()
null_count_deptime = df.filter(col("DepTime").isNull()).count()

print(f"Count of null values in 'DepDelayMinutes': {null_count_depdelay_minutes}")
print(f"Count of null values in 'DepDelay': {null_count_depdelay}")
print(f"Count of null values in 'DepTime': {null_count_deptime}")

#### Filling null values of DepDelay where CRSDepTime == DepTime. This means the flight is on time

In [None]:
# A flight whose actual departure time equals its scheduled one is on time, so
# a missing DepDelay is filled with 0. The isNull() guard restricts this to
# missing values; a delay that was actually recorded is never overwritten.
df = df.withColumn(
    "DepDelay",
    when(col("DepDelay").isNull() & (col("CRSDepTime") == col("DepTime")), 0)
    .otherwise(col("DepDelay")),
)

# Feature Engineering

#### Creating Delayed and EarlyDep using DepDelay

In [None]:
# Delayed is 1 only for a positive DepDelay and EarlyDep is 1 only for a
# negative one. A DepDelay of 0 or null yields 0 in both flags.
df = df.withColumn("Delayed", when(col("DepDelay") > 0, 1).otherwise(0))
df = df.withColumn("EarlyDep", when(col("DepDelay") < 0, 1).otherwise(0))

In [None]:
# Show the DataFrame, which now includes the Delayed and EarlyDep columns.
display(df)

#### Encoding Cancelled and Diverted using StringIndexer

In [None]:
# Convert both flag columns to strings before indexing them.
df = (
    df.withColumn("Cancelled", col("Cancelled").cast("string"))
    .withColumn("Diverted", col("Diverted").cast("string"))
)

In [None]:
from pyspark.ml.feature import StringIndexer

# Index the string labels of "Cancelled" into the numeric "Cancelled_Flight".
# Alphabetical ordering pins the label -> index mapping. The default ordering is
# by label frequency, so the codes would depend on which label happens to be
# more common in the data.
# NOTE(review): assumes the labels are "false"/"true" (false -> 0, true -> 1);
# confirm against the source schema.
indexer_C = StringIndexer(
    inputCol="Cancelled",
    outputCol="Cancelled_Flight",
    stringOrderType="alphabetAsc",
)

# Fit on the data and append the indexed column.
df = indexer_C.fit(df).transform(df)

In [None]:
from pyspark.ml.feature import StringIndexer

# Index the string labels of "Diverted" into the numeric "Diverted_Flight".
# Alphabetical ordering pins the label -> index mapping. The default ordering is
# by label frequency, so the codes would depend on which label happens to be
# more common in the data.
# NOTE(review): assumes the labels are "false"/"true" (false -> 0, true -> 1);
# confirm against the source schema.
indexer_D = StringIndexer(
    inputCol="Diverted",
    outputCol="Diverted_Flight",
    stringOrderType="alphabetAsc",
)

# Fit on the data and append the indexed column.
df = indexer_D.fit(df).transform(df)

In [None]:
# Show the DataFrame with the Cancelled_Flight and Diverted_Flight columns added.
display(df)

#### Dropping unwanted Columns

In [None]:
# Remove the two flag columns that were indexed, along with FlightDate.
df = df.drop("Cancelled", "Diverted", "FlightDate")

In [None]:
# Remove both departure-delay columns.
df = df.drop("DepDelayMinutes", "DepDelay")

In [None]:
# Airline, airport and time-block columns removed in a single drop.
_unused_columns = [
    "Operated_or_Branded_Code_Share_Partners",
    "DOT_ID_Marketing_Airline",
    "IATA_Code_Marketing_Airline",
    "Flight_Number_Marketing_Airline",
    "Operating_Airline",
    "DOT_ID_Operating_Airline",
    "IATA_Code_Operating_Airline",
    "Tail_Number",
    "Flight_Number_Operating_Airline",
    "OriginAirportID",
    "OriginAirportSeqID",
    "OriginCityMarketID",
    "OriginStateFips",
    "OriginWac",
    "DestAirportID",
    "DestAirportSeqID",
    "DestCityMarketID",
    "DestStateFips",
    "DestWac",
    "DepTimeBlk",
    "ArrTimeBlk",
]
df = df.drop(*_unused_columns)

In [None]:
# Report the null counts again for the columns that remain after the drops.
findMissing()

#### Creating the 'AirTraffic' column: the total number of flights departing from the same origin in the same hour, day, month, and year.

In [None]:
# DepTime is treated as an hhmm clock value (e.g. 1123 -> hour 11), so the hour
# is the value divided by 100 and rounded down. Slicing the string form instead
# mislabels times before 01:00: 45 (00:45) would give hour "4" rather than "0".
# The hour is stored as a string column.
df = df.withColumn(
    "hour",
    floor(col("DepTime").cast("double") / 100).cast("int").cast("string"),
)

# Store DepTime as a string column.
df = df.withColumn("DepTime", col("DepTime").cast('string'))

In [None]:
# Remove the DepTime column.
df = df.drop("DepTime")

In [None]:
from pyspark.sql.window import Window

# Rows sharing the same origin, hour and calendar day fall into one window;
# AirTraffic is the number of rows in that window.
_traffic_keys = ["Origin", "hour", "DayofMonth", "Month", "Year"]
windowSpec = Window.partitionBy(*_traffic_keys)

df = df.withColumn(
    "AirTraffic",
    count("*").over(windowSpec),
)

In [None]:
# Remove the DivAirportLandings column.
df = df.drop("DivAirportLandings")

#### Displaying Final DataFrame

In [None]:
# Show the DataFrame after all cleaning and feature-engineering steps above.
display(df)

# Exploratory Data Analysis

#### Number of flights per year

In [None]:
# Number of flights per year, collected into a pandas DataFrame
from pyspark.sql.functions import col, isnan, when, count, desc, expr, substring, length, lpad

flights_per_year = (
    df.groupBy("year")
    .agg(count("year").alias("num_flights"))
    .toPandas()
)
flights_per_year

In [None]:
# Bar chart of the yearly flight counts
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(15, 10))
sns.barplot(data=flights_per_year, x='year', y='num_flights')
plt.title("Flights on yearly basis", fontsize=20)

#### Number of flights per month

In [None]:
# Number of flights per month, collected into a pandas DataFrame
flights_per_month = (
    df.groupBy("Month")
    .agg(count("month").alias("num_flights_monthly"))
    .toPandas()
)
flights_per_month

In [None]:
# Bar chart of the monthly flight counts
plt.figure(figsize=(15, 10))
sns.barplot(data=flights_per_month, x='Month', y='num_flights_monthly')
plt.title("Flights on monthly basis", fontsize=20)

#### Number of flights per quarter of a year

In [None]:
# Number of flights per quarter of a year, collected into a pandas DataFrame
flights_per_quarter = (
    df.groupBy("Quarter")
    .agg(count("Quarter").alias("num_flights_quarterly"))
    .toPandas()
)
flights_per_quarter

In [None]:
# Bar chart of the quarterly flight counts
plt.figure(figsize=(15, 10))
sns.barplot(data=flights_per_quarter, x='Quarter', y='num_flights_quarterly')
plt.title("Flights on Quarterly basis", fontsize=20)

#### Number of flights per day of the week

In [None]:
# Row count per DayOfWeek value, largest count first
df_day = (
    df.groupBy('DayOfWeek')
    .count()
    .orderBy(col("count").desc())
    .toPandas()
)
df_day

#### Busiest Day of the Week

In [None]:
# Bar chart of the per-day counts, drawn in a single colour
plt.figure(figsize=(10, 6))
sns.barplot(data=df_day, x='DayOfWeek', y='count', color="#04D8B2")
plt.title("Busiest Day of the Week", fontsize=20)

#### Flight Count Based on Origin and Destination

In [None]:
# Row count per Origin value, largest count first
df_o = (
    df.groupBy('Origin')
    .count()
    .orderBy(col("count").desc())
    .toPandas()
)
df_o

In [None]:
# Row count per Dest value, largest count first
df_d = (
    df.groupBy('Dest')
    .count()
    .orderBy(col("count").desc())
    .toPandas()
)
df_d

#### Number of flights that were cancelled

In [None]:
# Row count per Cancelled_Flight value, largest count first
df_c = (
    df.groupBy('Cancelled_Flight')
    .count()
    .orderBy(col("count").desc())
    .toPandas()
)
df_c


#### Scatter plot between Canceled Flights and Air Traffic

In [None]:
# Convert the two columns of interest to a pandas DataFrame for plotting.
df_pd = df.select('Cancelled_Flight', 'AirTraffic').toPandas()

# Scatter plot: AirTraffic on the x-axis, Cancelled_Flight on the y-axis
plt.figure(figsize=(10, 6))
plt.scatter(x=df_pd['AirTraffic'], y=df_pd['Cancelled_Flight'], alpha=0.5)
plt.xlabel('Air Traffic')
plt.ylabel('Canceled Flights')
plt.title('Canceled Flights vs Air Traffic')
plt.grid(True)
plt.show()

#### Correlation heatmap of the DataFrame's numeric columns to determine the dependencies between features

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import col

# Keep only the columns whose Spark type is one of the listed numeric types.
_numeric_types = {'double', 'bigint', 'int'}
numeric_cols = [name for name, dtype in df.dtypes if dtype in _numeric_types]

# The correlations are computed in pandas on a 10% sample drawn without
# replacement.
numeric = (
    df.select(*numeric_cols)
    .sample(withReplacement=False, fraction=0.1)
    .toPandas()
)

correlationMatrix = numeric.corr()

plt.figure(figsize=(20, 10))
sns.heatmap(correlationMatrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=.5)
plt.show()


#### To predict cancellation we are going to take AirTraffic as target value.