In [14]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook via getOrCreate().
spark = (
    SparkSession.builder
    .appName("MR-DelayRate-Notebook")
    .getOrCreate()
)

# SparkContext handle used by the RDD-based cells below.
sc = spark.sparkContext

In [15]:
# Adjust the CSV file path to match your environment
INPUT_CSV = "data/airlines_delay.csv"

# Spark output requires a directory (not a single file path)
OUTPUT_DIR = "outputs/mr_delay_rate_by_airline"

# Take a look at the first few lines
rdd_raw = sc.textFile(INPUT_CSV)
rdd_raw.take(5)

['Flight,Time,Length,Airline,AirportFrom,AirportTo,DayOfWeek,Class',
 '2313.0,1296.0,141.0,DL,ATL,HOU,1,0',
 '6948.0,360.0,146.0,OO,COS,ORD,4,0',
 '1247.0,1170.0,143.0,B6,BOS,CLT,3,0',
 '31.0,1410.0,344.0,US,OGG,PHX,6,0']

In [16]:
import csv
from io import StringIO

def parse_line(line: str):
    """Map one CSV line to an ``(airline, (delayed, 1))`` key/value pair.

    Expected columns, in order:
    Flight, Time, Length, Airline, AirportFrom, AirportTo, DayOfWeek, Class

    Args:
        line: One raw text line of the input CSV.

    Returns:
        ``(airline, (delayed, 1))`` where ``delayed`` is the Class flag (0 or 1)
        and the trailing ``1`` counts the flight itself. Returns ``None`` when
        the line is unusable: it is empty, does not have exactly 8 columns, or
        its Class column is not the integer 0 or 1.
    """
    try:
        row = next(csv.reader(StringIO(line)))
        if len(row) != 8:
            return None

        airline = row[3].strip()
        delayed = int(row[7].strip())  # Class: 0/1
    except Exception:
        # Best-effort parsing: a malformed record is skipped rather than
        # failing the whole job; the caller filters out the None values.
        return None

    # The flag is summed downstream to obtain the delayed count, so any value
    # other than 0/1 would silently distort the delay rate.
    if delayed not in (0, 1):
        return None

    return (airline, (delayed, 1))

# The first line of the file is the header; keep only lines that differ from it.
header = rdd_raw.first()
rdd = rdd_raw.filter(lambda text: text != header)

# Parse every remaining line, then drop the ones parse_line rejected (None).
parsed = rdd.map(parse_line)
kv = parsed.filter(lambda pair: pair is not None)

kv.take(5)

[('DL', (0, 1)),
 ('OO', (0, 1)),
 ('B6', (0, 1)),
 ('US', (0, 1)),
 ('FL', (0, 1))]

In [17]:
def _merge_counts(left, right):
    """Add two (delayed, total) pairs component by component."""
    return (left[0] + right[0], left[1] + right[1])


def _airline_summary(entry):
    """Flatten (airline, (delayed, total)) into (airline, delayed, total, rate)."""
    airline, (delayed, total) = entry
    # A zero total yields a rate of 0.0 instead of dividing by zero.
    rate = (delayed / total) if total else 0.0
    return (airline, delayed, total, rate)


# Reduce step -> (Airline, (sum_delayed, sum_total))
agg = kv.reduceByKey(_merge_counts)

# Final shape -> (Airline, delayed, total, rate)
result_rdd = agg.map(_airline_summary)

result_rdd.take(10)

                                                                                

[('DL', 27452, 60940, 0.450475877912701),
 ('B6', 8459, 18112, 0.46703842756183744),
 ('CO', 11957, 21118, 0.5661994507055592),
 ('YV', 3334, 13725, 0.2429143897996357),
 ('OH', 3502, 12630, 0.2772763262074426),
 ('UA', 8946, 27619, 0.32390745501285345),
 ('F9', 2899, 6456, 0.44903965303593557),
 ('OO', 22760, 50254, 0.4528992716997652),
 ('US', 11591, 34500, 0.33597101449275363),
 ('FL', 6275, 20827, 0.30129159264416383)]

In [18]:
from pyspark.sql import Row

def _airline_row(rec):
    """Build a Row from an (airline, delayed, total, rate) tuple."""
    airline, delayed, total, rate = rec
    return Row(
        Airline=airline,
        delayed_count=int(delayed),
        total_count=int(total),
        delay_rate=float(rate),
    )


df = result_rdd.map(_airline_row).toDF()

# Show the 10 airlines with the highest delay rate.
airlines_by_rate = df.orderBy("delay_rate", ascending=False)
airlines_by_rate.show(10, truncate=False)

+-------+-------------+-----------+-------------------+
|Airline|delayed_count|total_count|delay_rate         |
+-------+-------------+-----------+-------------------+
|WN     |65657        |94097      |0.6977586958138942 |
|CO     |11957        |21118      |0.5661994507055592 |
|B6     |8459         |18112      |0.46703842756183744|
|OO     |22760        |50254      |0.4528992716997652 |
|DL     |27452        |60940      |0.450475877912701  |
|F9     |2899         |6456       |0.44903965303593557|
|EV     |11255        |27983      |0.4022084837222599 |
|9E     |8226         |20686      |0.39766025331141835|
|AA     |17736        |45656      |0.38847029963203084|
|XE     |11795        |31126      |0.37894364839683864|
+-------+-------------+-----------+-------------------+
only showing top 10 rows



In [19]:
# Sanity check: the per-airline totals should add up to the number of input
# rows (header excluded) if no line was dropped during parsing.
total_rows = rdd.count()

# Compute both sums in a single aggregation job instead of running one Spark
# job per column. With the dict form of agg(), result columns are named
# "<func>(<column>)", so they are read back by name.
mr_sums = df.agg({"total_count": "sum", "delayed_count": "sum"}).collect()[0]
total_from_mr = mr_sums["sum(total_count)"]
delayed_from_mr = mr_sums["sum(delayed_count)"]

print("Rows (header excluded):", total_rows)
print("Sum(total_count) from MR:", total_from_mr)
print("Sum(delayed_count) from MR:", delayed_from_mr)

                                                                                

Rows (header excluded): 539382
Sum(total_count) from MR: 539382
Sum(delayed_count) from MR: 240264


In [20]:
import os
import shutil

# Remove the output directory left by a previous run, if there is one.
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)


def _airline_tsv_line(rec):
    """Render an (airline, delayed, total, rate) tuple as one tab-separated line."""
    airline, delayed, total, rate = rec
    return f"{airline}\t{delayed}\t{total}\t{rate}"


# Tab-separated text output, coalesced to a single partition before saving.
airline_tsv = result_rdd.map(_airline_tsv_line)
airline_tsv.coalesce(1).saveAsTextFile(OUTPUT_DIR)

print("Saved to:", OUTPUT_DIR)

Saved to: outputs/mr_delay_rate_by_airline


## MapReduce 2

In [21]:
import csv
from io import StringIO

def parse_line_route(line: str):
    """Map one CSV line to a ``((airport_from, airport_to), (delayed, 1))`` pair.

    Expected columns, in order:
    Flight, Time, Length, Airline, AirportFrom, AirportTo, DayOfWeek, Class

    Args:
        line: One raw text line of the input CSV.

    Returns:
        ``((airport_from, airport_to), (delayed, 1))`` where the key is the
        route tuple, ``delayed`` is the Class flag (0 or 1) and the trailing
        ``1`` counts the flight itself. Returns ``None`` when the line is
        unusable: it is empty, does not have exactly 8 columns, or its Class
        column is not the integer 0 or 1.
    """
    try:
        row = next(csv.reader(StringIO(line)))
        if len(row) != 8:
            return None

        airport_from = row[4].strip()
        airport_to = row[5].strip()
        delayed = int(row[7].strip())  # Class: 0/1
    except Exception:
        # Best-effort parsing: a malformed record is skipped rather than
        # failing the whole job; the caller filters out the None values.
        return None

    # The flag is summed downstream to obtain the delayed count, so any value
    # other than 0/1 would silently distort the delay rate.
    if delayed not in (0, 1):
        return None

    # Key = route tuple, Value = (delayed, 1)
    return ((airport_from, airport_to), (delayed, 1))

# rdd_raw, header and rdd were already created earlier (in the airline section).
# To run this cell on its own, recreate them first:
# rdd_raw = sc.textFile(INPUT_CSV); header = rdd_raw.first(); rdd = rdd_raw.filter(lambda x: x != header)

# Parse every line, then drop the ones parse_line_route rejected (None).
parsed_routes = rdd.map(parse_line_route)
kv_route = parsed_routes.filter(lambda pair: pair is not None)
kv_route.take(5)


[(('ATL', 'HOU'), (0, 1)),
 (('COS', 'ORD'), (0, 1)),
 (('BOS', 'CLT'), (0, 1)),
 (('OGG', 'PHX'), (0, 1)),
 (('BMI', 'ATL'), (0, 1))]

In [22]:
def _merge_route_counts(left, right):
    """Add two (delayed, total) pairs component by component."""
    return (left[0] + right[0], left[1] + right[1])


def _route_summary(entry):
    """Flatten ((from, to), (delayed, total)) into (from, to, delayed, total, rate)."""
    (airport_from, airport_to), (delayed, total) = entry
    # A zero total yields a rate of 0.0 instead of dividing by zero.
    rate = (delayed / total) if total else 0.0
    return (airport_from, airport_to, delayed, total, rate)


# Reduce step -> ((from, to), (sum_delayed, sum_total))
agg_route = kv_route.reduceByKey(_merge_route_counts)

# Final shape -> (AirportFrom, AirportTo, delayed_count, total_count, delay_rate)
result_route_rdd = agg_route.map(_route_summary)

result_route_rdd.take(10)

                                                                                

[('ATL', 'HOU', 149, 358, 0.41620111731843573),
 ('BOS', 'CLT', 129, 353, 0.3654390934844193),
 ('OGG', 'PHX', 4, 47, 0.0851063829787234),
 ('MSY', 'BHM', 25, 56, 0.44642857142857145),
 ('DFW', 'MEM', 70, 205, 0.34146341463414637),
 ('ATL', 'PBI', 185, 390, 0.47435897435897434),
 ('CRW', 'MCO', 7, 13, 0.5384615384615384),
 ('LGB', 'SFO', 54, 112, 0.48214285714285715),
 ('CLT', 'MEM', 86, 206, 0.4174757281553398),
 ('IAH', 'CLT', 151, 400, 0.3775)]

In [23]:
from pyspark.sql import Row

def _route_row(rec):
    """Build a Row from a (from, to, delayed, total, rate) tuple."""
    origin, destination, delayed, total, rate = rec
    return Row(
        AirportFrom=origin,
        AirportTo=destination,
        delayed_count=int(delayed),
        total_count=int(total),
        delay_rate=float(rate),
    )


df_route = result_route_rdd.map(_route_row).toDF()

# Optional minimum-flight filter so that routes with very few flights do not
# distort the top lists.
MIN_FLIGHTS = 50  # tune as needed, e.g. 20 or 100

has_enough_flights = df_route["total_count"] >= MIN_FLIGHTS
df_route_filtered = df_route.filter(has_enough_flights)

# Show the 20 routes with the highest delay rate.
routes_by_rate = df_route_filtered.orderBy("delay_rate", ascending=False)
routes_by_rate.show(20, truncate=False)

+-----------+---------+-------------+-----------+------------------+
|AirportFrom|AirportTo|delayed_count|total_count|delay_rate        |
+-----------+---------+-------------+-----------+------------------+
|LAS        |PIT      |58           |61         |0.9508196721311475|
|MDW        |SLC      |58           |62         |0.9354838709677419|
|LAS        |TPA      |58           |62         |0.9354838709677419|
|MDW        |SEA      |56           |62         |0.9032258064516129|
|STL        |TPA      |63           |70         |0.9               |
|LAX        |HOU      |105          |118        |0.8898305084745762|
|MDW        |ABQ      |55           |62         |0.8870967741935484|
|PHL        |STL      |54           |61         |0.8852459016393442|
|BNA        |BHM      |53           |60         |0.8833333333333333|
|LAS        |BNA      |74           |84         |0.8809523809523809|
|DAL        |BHM      |77           |88         |0.875             |
|MDW        |RDU      |84         

In [24]:
# Show the 20 filtered routes with the most flights.
routes_by_volume = df_route_filtered.orderBy("total_count", ascending=False)
routes_by_volume.show(20, truncate=False)

+-----------+---------+-------------+-----------+-------------------+
|AirportFrom|AirportTo|delayed_count|total_count|delay_rate         |
+-----------+---------+-------------+-----------+-------------------+
|LAX        |SFO      |598          |1079       |0.5542168674698795 |
|SFO        |LAX      |572          |1077       |0.531104921077066  |
|OGG        |HNL      |220          |982        |0.2240325865580448 |
|HNL        |OGG      |268          |951        |0.28180862250262884|
|LAX        |SAN      |384          |935        |0.4106951871657754 |
|SAN        |LAX      |351          |935        |0.3754010695187166 |
|LAX        |LAS      |482          |928        |0.5193965517241379 |
|LAS        |LAX      |447          |928        |0.4816810344827586 |
|LGA        |ATL      |344          |916        |0.37554585152838427|
|ATL        |LGA      |423          |915        |0.46229508196721314|
|LGA        |ORD      |329          |874        |0.37643020594965676|
|ORD        |LGA    

In [25]:
# Sanity check: the per-route totals (unfiltered df_route) should add up to
# the number of input rows (header excluded) if no line was dropped during
# parsing.
total_rows = rdd.count()

# Compute both sums in a single aggregation job instead of running one Spark
# job per column. With the dict form of agg(), result columns are named
# "<func>(<column>)", so they are read back by name.
route_sums = df_route.agg({"total_count": "sum", "delayed_count": "sum"}).collect()[0]
total_from_mr_route = route_sums["sum(total_count)"]
delayed_from_mr_route = route_sums["sum(delayed_count)"]

print("Rows (header excluded):", total_rows)
print("Sum(total_count) from Route MR:", total_from_mr_route)
print("Sum(delayed_count) from Route MR:", delayed_from_mr_route)

Rows (header excluded): 539382
Sum(total_count) from Route MR: 539382
Sum(delayed_count) from Route MR: 240264


In [None]:
import os
import shutil

OUTPUT_DIR_ROUTE = "outputs/mr_delay_rate_by_route"

# Remove the output directory left by a previous run, if there is one.
if os.path.exists(OUTPUT_DIR_ROUTE):
    shutil.rmtree(OUTPUT_DIR_ROUTE)


def _route_tsv_line(row):
    """Render one route Row as a tab-separated line."""
    origin = row['AirportFrom']
    destination = row['AirportTo']
    delayed = row['delayed_count']
    total = row['total_count']
    rate = row['delay_rate']
    return f"{origin}\t{destination}\t{delayed}\t{total}\t{rate}"


# Tab-separated text output of the filtered routes, coalesced to a single
# partition before saving.
route_tsv = df_route_filtered.rdd.map(_route_tsv_line)
route_tsv.coalesce(1).saveAsTextFile(OUTPUT_DIR_ROUTE)

print("Saved to:", OUTPUT_DIR_ROUTE)