In [None]:
from pyspark.sql import functions as F

# Path of the raw dataset, relative to the notebook's working directory.
INPUT_CSV = "data/airlines_delay.csv"

# Load the CSV: the first row is the header and column types are inferred by Spark.
# NOTE(review): `spark` is not created in this notebook -- presumably the
# SparkSession injected by the runtime; confirm before running on a fresh kernel.
df = spark.read.csv(INPUT_CSV, header=True, inferSchema=True)

# Sanity check: inferred schema followed by the first five rows, untruncated.
df.printSchema()
df.show(n=5, truncate=False)


In [None]:
# Label balance: how many rows fall under each Class value.
df.groupBy("Class").count().orderBy("Class").show()

# Any nulls? One output column per input column, holding that column's null count:
# when() yields a non-null value only for null cells, and count() counts non-nulls.
null_counts = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(null_counts).show(truncate=False)

In [None]:
# 80/20 train/test split with a fixed seed.
splits = df.randomSplit([0.8, 0.2], seed=42)
train_df = splits[0]
test_df = splits[1]

# Report the size of each part (each count() triggers a Spark job).
for tag, part in (("train:", train_df), ("test :", test_df)):
    print(tag, part.count())


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression

# Target column and the feature columns fed to the model.
label_col = "Class"

cat_cols = ["Airline", "AirportFrom", "AirportTo"]
num_cols = ["Time", "Length", "DayOfWeek"]  # do not add ID-like columns (e.g. a flight id) here: they would only add noise

# Derived column names, built once and shared by the stages below.
idx_cols = [f"{c}_idx" for c in cat_cols]
ohe_cols = [f"{c}_ohe" for c in cat_cols]

# Index + one-hot encode the categoricals. handleInvalid="keep" makes both stages
# keep values they did not see during fit instead of raising at transform time.
indexers = [
    StringIndexer(inputCol=src, outputCol=dst, handleInvalid="keep")
    for src, dst in zip(cat_cols, idx_cols)
]
encoder = OneHotEncoder(inputCols=idx_cols, outputCols=ohe_cols, handleInvalid="keep")

# Numeric columns first, then the one-hot vectors, all packed into "features".
assembler = VectorAssembler(inputCols=num_cols + ohe_cols, outputCol="features")

# Unregularised logistic regression (regParam=0.0), capped at 50 iterations.
lr = LogisticRegression(featuresCol="features", labelCol=label_col, maxIter=50, regParam=0.0)

lr_stages = [*indexers, encoder, assembler, lr]
pipeline_lr = Pipeline(stages=lr_stages)

# Fit on the training split only, then score the held-out split.
model_lr = pipeline_lr.fit(train_df)
pred_lr = model_lr.transform(test_df)

preview_cols = ["features", "Class", "probability", "prediction"]
pred_lr.select(*preview_cols).show(5, truncate=False)


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Evaluators shared by every classifier in this notebook (later cells reuse them).
multi_kwargs = dict(labelCol=label_col, predictionCol="prediction")
acc_eval = MulticlassClassificationEvaluator(metricName="accuracy", **multi_kwargs)
f1_eval = MulticlassClassificationEvaluator(metricName="f1", **multi_kwargs)
auc_eval = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol=label_col,
    rawPredictionCol="rawPrediction",
)

# Score the logistic-regression predictions: accuracy, then F1, then ROC AUC.
acc, f1, auc = (ev.evaluate(pred_lr) for ev in (acc_eval, f1_eval, auc_eval))

print("LogReg -> accuracy:", acc)
print("LogReg -> f1      :", f1)
print("LogReg -> AUC     :", auc)

# Confusion matrix: one row per true Class, one column per predicted value.
# The pivot values are listed explicitly, so both columns exist even if a
# prediction value never occurs; such empty cells are shown as 0.
by_true_class = pred_lr.groupBy("Class")
cm = by_true_class.pivot("prediction", [0.0, 1.0]).count().na.fill(0)
cm.show()


In [None]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import matplotlib.pyplot as plt

# ROC curve of the logistic-regression model on the held-out test set.
#
# The points come from the fitted model's own evaluation summary rather than
# from mllib's BinaryClassificationMetrics, for two reasons:
#   * `probability` is an ML Vector column, which Column.getItem() cannot index,
#     so P(Class=1) cannot be pulled out with getItem(1);
#   * the PySpark BinaryClassificationMetrics wrapper only exposes
#     areaUnderROC / areaUnderPR -- it has no roc() method.
lr_model = model_lr.stages[-1]  # last pipeline stage is the fitted LogisticRegressionModel

# evaluate() re-scores the rows, so pass only the inputs it needs; the
# prediction columns already present in pred_lr would clash with its outputs.
test_summary = lr_model.evaluate(pred_lr.select("features", label_col))

# summary.roc is a DataFrame with FPR / TPR columns; sort on the driver so the
# line is drawn left to right regardless of the order rows are collected in.
roc = sorted(
    (float(r["FPR"]), float(r["TPR"])) for r in test_summary.roc.collect()
)  # [(FPR, TPR), ...]

fpr = [p[0] for p in roc]
tpr = [p[1] for p in roc]

plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve (Logistic Regression)")
plt.grid(True)
plt.show()


In [None]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees on the same features: 50 boosting iterations, depth-5 trees.
gbt = GBTClassifier(labelCol=label_col, featuresCol="features", maxIter=50, maxDepth=5)

# Same preprocessing stages as the logistic-regression pipeline, with the
# classifier swapped; the whole pipeline is fitted again on the training split.
gbt_stages = [*indexers, encoder, assembler, gbt]
pipeline_gbt = Pipeline(stages=gbt_stages)
model_gbt = pipeline_gbt.fit(train_df)
pred_gbt = model_gbt.transform(test_df)

# Reuse the evaluators defined for logistic regression so the numbers are comparable.
acc_gbt, f1_gbt, auc_gbt = (
    ev.evaluate(pred_gbt) for ev in (acc_eval, f1_eval, auc_eval)
)

gbt_report = (
    ("GBT -> accuracy:", acc_gbt),
    ("GBT -> f1      :", f1_gbt),
    ("GBT -> AUC     :", auc_gbt),
)
for caption, value in gbt_report:
    print(caption, value)


### ML #2 — Route Clustering (KMeans, Unsupervised)

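Bu ikinci ML uygulamasında rotaları (kalkış–varış havalimanı çiftleri) “gecikme profiline” göre kümeliyoruz: her rota için uçuş sayısı, gecikme oranı, ortalama `Length` ve ortalama `Time` değerleri hesaplanır ve KMeans bu özellikler üzerinde çalıştırılır.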

In [None]:
# Per-route statistics; a route is an (AirportFrom, AirportTo) pair.
route_stats = [
    F.count("*").alias("flight_count"),
    F.avg("Class").alias("delay_rate"),   # mean of the Class column per route
    F.avg("Length").alias("avg_length"),
    F.avg("Time").alias("avg_time"),
]

route_df = (
    df.groupBy("AirportFrom", "AirportTo")
      .agg(*route_stats)
      # drop routes with too few samples (try 20 or 100 if you prefer)
      .where(F.col("flight_count") >= 50)
)

route_df.show(5, truncate=False)
print("Routes:", route_df.count())


In [None]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Pack the four per-route statistics into one vector column.
route_assembler = VectorAssembler(
    inputCols=["flight_count", "delay_rate", "avg_length", "avg_time"],
    outputCol="route_features"
)
route_assembled = route_assembler.transform(route_df)

# Scale every feature to unit standard deviation (withMean=False: no centering).
scaler = StandardScaler(inputCol="route_features", outputCol="route_features_scaled", withStd=True, withMean=False)
scaler_model = scaler.fit(route_assembled)

# Cache the scaled routes: without it every fit / transform / evaluate below
# (nine values of k, plus the final model in the next cell) would re-run the
# whole lineage -- CSV read, groupBy aggregation and scaling -- from scratch.
route_vec = scaler_model.transform(route_assembled).cache()

evaluator = ClusteringEvaluator(featuresCol="route_features_scaled", metricName="silhouette", distanceMeasure="squaredEuclidean")

# Sweep k = 2..10 and record the silhouette score of each clustering.
ks = list(range(2, 11))
sil_scores = []

for k in ks:
    km = KMeans(k=k, seed=42, featuresCol="route_features_scaled")
    fitted = km.fit(route_vec)
    assigned = fitted.transform(route_vec)
    score = evaluator.evaluate(assigned)
    sil_scores.append(score)
    print("k:", k, "silhouette:", score)

# Silhouette as a function of k, used to pick the number of clusters.
import matplotlib.pyplot as plt
plt.figure()
plt.plot(ks, sil_scores, marker="o")
plt.xlabel("k")
plt.ylabel("silhouette")
plt.title("KMeans Silhouette vs k (Routes)")
plt.grid(True)
plt.show()


In [None]:
# Pick the k with the highest silhouette score (the first one, on ties).
best_score = max(sil_scores)
best_k = ks[sil_scores.index(best_score)]
print("Best k:", best_k)

# Fit the final model with that k and attach a cluster id ("prediction") to every route.
kmeans = KMeans(k=best_k, seed=42, featuresCol="route_features_scaled")
kmeans_model = kmeans.fit(route_vec)
route_clustered = kmeans_model.transform(route_vec)

# The 20 routes with the highest delay rate, together with their cluster id.
display_cols = [
    "AirportFrom", "AirportTo",
    "flight_count", "delay_rate", "avg_length", "avg_time",
    "prediction",
]
(route_clustered
    .select(*display_cols)
    .orderBy(F.col("delay_rate").desc())
    .show(20, truncate=False))

# Per-cluster summary (a good table to include in the report).
summary_aggs = [
    F.count("*").alias("num_routes"),
    F.avg("delay_rate").alias("avg_delay_rate"),
    F.avg("flight_count").alias("avg_flight_count"),
    F.avg("avg_length").alias("avg_length"),
    F.avg("avg_time").alias("avg_time"),
]
cluster_summary = (
    route_clustered
    .groupBy("prediction")
    .agg(*summary_aggs)
    .orderBy("prediction")
)

cluster_summary.show(truncate=False)
