In [3]:
import findspark

# Run findspark's setup before pyspark is imported below (the import order is intentional).
findspark.init()

from pyspark.sql import SparkSession

# Configure the application name first, then obtain the session from the builder.
session_builder = SparkSession.builder.appName("AirlineDataAnalysis")
spark = session_builder.getOrCreate()

print("Spark initialized successfully!")


Spark initialized successfully!


In [13]:
from pyspark.sql.functions import *

In [11]:
# Step 1: Data Cleaning
# Select relevant columns for the analysis
# NOTE(review): `data` is not created in any visible cell (the recorded output of
# this cell is a NameError); it must be loaded by an earlier cell before this runs.

columns_needed = ["origin", "dest", "fl_date"]
# The loop variable is deliberately not named `col`, so it cannot be confused with
# the pyspark `col` function used below.
missing_columns = [name for name in columns_needed if name not in data.columns]
if missing_columns:
    # ValueError is a subclass of Exception, so broad handlers keep working.
    raise ValueError(f"Missing required columns: {missing_columns}")

# Select the relevant columns and drop rows that have a null in any of them
data_cleaned = data.select(*columns_needed).dropna()

# Trim whitespace and standardize case for origin and destination columns
data_cleaned = data_cleaned.withColumn("origin", trim(lower(col("origin")))) \
                           .withColumn("dest", trim(lower(col("dest"))))

# dropna() only removes nulls: a blank or whitespace-only code becomes "" after
# trimming and would otherwise survive into route labels such as "-lax".
data_cleaned = data_cleaned.filter((col("origin") != "") & (col("dest") != ""))

print("Data cleaned successfully!")
data_cleaned.show(5)

NameError: name 'data' is not defined

In [59]:
# Step 2: Create Route Column
# Build an "<origin>-<dest>" label and attach it to every row as `route`
route_label = concat_ws("-", col("origin"), col("dest"))
data_routes = data_cleaned.withColumn("route", route_label)

In [69]:
# Step 3: Analyze Top 5 Routes
# Count the rows (flights) for each route, then sort so the largest count comes first
flights_per_route = data_routes.groupBy("route").agg(
    count("*").alias("flight_count")
)
route_counts = flights_per_route.orderBy(desc("flight_count"))

In [71]:
# Show top 5 routes
# Keep only the first five rows of route_counts and print them as a table
top_n = 5
top_routes = route_counts.limit(top_n)
print("Top 5 Routes by Flight Count:")
top_routes.show()

Top 5 Routes by Flight Count:




+-------+------------+
|  route|flight_count|
+-------+------------+
|sfo-lax|       48585|
|lax-sfo|       47680|
|jfk-lax|       38434|
|lax-jfk|       38424|
|las-lax|       33166|
+-------+------------+



                                                                                

In [73]:
# Step 4: Additional Analysis - Monthly Flight Trends
# Derive `year` and `month` columns from fl_date.
# data_routes is rebound to the widened frame because the following cell reads it by that name.
flight_date = col("fl_date")
data_routes = (
    data_routes
    .withColumn("year", year(flight_date))
    .withColumn("month", month(flight_date))
)

In [75]:
# Count rows per (year, month, route) combination and preview the first ten
monthly_trends = (
    data_routes
    .groupBy("year", "month", "route")
    .agg(count("*").alias("monthly_flight_count"))
)
print("Monthly flight trends:")
monthly_trends.show(10)

Monthly flight trends:




+----+-----+-------+--------------------+
|year|month|  route|monthly_flight_count|
+----+-----+-------+--------------------+
|2015|    1|cvg-ord|                 328|
|2015|    1|dsm-ord|                 303|
|2015|    1|sea-fat|                  64|
|2015|    1|bzn-sfo|                   5|
|2015|    1|hnl-ewr|                  31|
|2015|    1|phl-sfo|                 178|
|2015|    1|sfo-fll|                 136|
|2015|    1|fll-stl|                  67|
|2015|    1|hou-clt|                  31|
|2015|    1|mci-lax|                  63|
+----+-----+-------+--------------------+
only showing top 10 rows



                                                                                

In [79]:
import shutil
import os

# Local paths for saving results
# NOTE(review): these are machine-specific absolute paths; consider making them
# relative to a configurable output directory.
local_top_routes_path = "/Users/apple/Documents/CloudTech/Cloud Assignment/top_routes"
local_monthly_trends_path = "/Users/apple/Documents/CloudTech/Cloud Assignment/monthly_trends"

# Delete existing output directories if they exist, so each run starts clean
for stale_output in (local_top_routes_path, local_monthly_trends_path):
    if os.path.exists(stale_output):
        shutil.rmtree(stale_output)

# Save the top 5 routes and monthly trends to local paths
try:
    top_routes.write.csv(local_top_routes_path, header=True, mode="overwrite")
    monthly_trends.write.csv(local_monthly_trends_path, header=True, mode="overwrite")
except Exception as e:
    print(f"Error saving results: {e}")
    # Re-raise so a failed write stops the notebook instead of being reported
    # only as a printed line while the cell appears to have succeeded.
    raise
else:
    # Reached only when both writes completed without raising.
    print(f"Analysis results saved successfully at:\n{local_top_routes_path}\n{local_monthly_trends_path}")




Analysis results saved successfully at:
/Users/apple/Documents/CloudTech/Cloud Assignment/top_routes
/Users/apple/Documents/CloudTech/Cloud Assignment/monthly_trends


                                                                                