In [1]:
from pyspark.sql import SparkSession

# Build the Spark session for this notebook (getOrCreate reuses one if present).
spark = (
    SparkSession.builder
    .appName("MR-DelayRate-Notebook")
    .getOrCreate()
)

# SparkContext handle, used for the RDD-based steps.
sc = spark.sparkContext

In [2]:
# Adjust the CSV file path to match your own setup
INPUT_CSV = "data/airlines_delay.csv"

# Spark output requires a directory (not a single file path)
OUTPUT_DIR = "outputs/mr_delay_rate_by_airline"

# Read the file as an RDD of text lines and look at the first few
rdd_raw = sc.textFile(INPUT_CSV)
rdd_raw.take(5)

In [None]:
import csv
from io import StringIO

def parse_line(line: str):
    """Turn one CSV data line into an ``(airline, (delayed, 1))`` pair.

    Expected columns, in order:
    Flight, Time, Length, Airline, AirportFrom, AirportTo, DayOfWeek, Class

    Args:
        line: One raw text line of the CSV file.

    Returns:
        ``(airline, (delayed, 1))`` where ``airline`` is the whitespace-stripped
        Airline field and ``delayed`` is the Class flag (0 or 1). Returns
        ``None`` when the line is empty, is not parseable as CSV, does not
        have exactly 8 fields, or its Class field is not the integer 0 or 1
        (the header line falls into the last case).
    """
    try:
        # csv.reader handles quoted fields that contain commas.
        row = next(csv.reader(StringIO(line)))
    except (TypeError, csv.Error, StopIteration):
        # Non-text input, malformed CSV, or an empty line (no row produced).
        return None

    if len(row) != 8:
        return None

    try:
        delayed = int(row[7].strip())  # Class: 0/1
    except ValueError:
        return None

    # Class is a binary flag; any other integer would distort the summed
    # delayed counts and the resulting delay rate.
    if delayed not in (0, 1):
        return None

    return (row[3].strip(), (delayed, 1))

# Take the first line as the header and drop every line equal to it.
header = rdd_raw.first()
rdd = rdd_raw.filter(lambda raw_line: raw_line != header)

# Parse each remaining line; lines that parse_line rejects come back as None
# and are filtered out.
parsed = rdd.map(parse_line)
kv = parsed.filter(lambda pair: pair is not None)

kv.take(5)

In [None]:
def _merge_counts(left, right):
    """Add two (delayed, total) partial counts element-wise."""
    return (left[0] + right[0], left[1] + right[1])


def _with_rate(item):
    """Flatten (airline, (delayed, total)) into (airline, delayed, total, rate)."""
    airline, counts = item
    delayed, total = counts[0], counts[1]
    # A zero total yields a rate of 0.0 instead of dividing by zero.
    rate = (delayed / total) if total else 0.0
    return (airline, delayed, total, rate)


# Reduce step: (Airline, (sum_delayed, sum_total))
agg = kv.reduceByKey(_merge_counts)

# Final records: (Airline, delayed, total, rate)
result_rdd = agg.map(_with_rate)

result_rdd.take(10)

In [None]:
from pyspark.sql import Row

def _as_row(record):
    """Convert an (airline, delayed, total, rate) tuple into a typed Row."""
    return Row(
        Airline=record[0],
        delayed_count=int(record[1]),
        total_count=int(record[2]),
        delay_rate=float(record[3]),
    )


df = result_rdd.map(_as_row).toDF()

# Show the 10 airlines with the highest delay rate, without truncating values.
ranked = df.orderBy("delay_rate", ascending=False)
ranked.show(10, truncate=False)

In [None]:
def _column_sum(frame, column):
    """Return the sum of `column` over `frame` as a single collected value."""
    return frame.agg({column: "sum"}).collect()[0][0]


# Compare the number of non-header input lines with the totals that came out
# of the aggregation.
total_rows = rdd.count()
total_from_mr = _column_sum(df, "total_count")
delayed_from_mr = _column_sum(df, "delayed_count")

for label, value in (
    ("Rows (header excluded):", total_rows),
    ("Sum(total_count) from MR:", total_from_mr),
    ("Sum(delayed_count) from MR:", delayed_from_mr),
):
    print(label, value)

In [None]:
import os
import shutil

def _to_tsv(record):
    """Format an (airline, delayed, total, rate) tuple as one tab-separated line."""
    airline, delayed, total, rate = record
    return f"{airline}\t{delayed}\t{total}\t{rate}"


# Remove any existing output directory before writing
# (presumably so a re-run does not collide with earlier output).
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)

# Tab-separated text output, coalesced to a single partition before saving.
tsv_lines = result_rdd.map(_to_tsv)
tsv_lines.coalesce(1).saveAsTextFile(OUTPUT_DIR)

print("Saved to:", OUTPUT_DIR)