In [3]:
import os
os.environ["PYSPARK_PYTHON"] = "D:\\python39venv\\Scripts\\python.exe"
os.environ["PYSPARK_DRIVER_PYTHON"] = "D:\\python39venv\\Scripts\\python.exe"
os.environ["HADOOP_HOME"] = "D:\\hadoop-3.3.6"

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
import shutil

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("FlightDelayModelTraining") \
    .master("local[*]") \
    .config("spark.hadoop.fs.defaultFS", "file:///") \
    .config("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

print("Spark version:", spark.version)
print("Hadoop home:", os.environ.get("HADOOP_HOME"))

# Load synthetic flight data
try:
    df = spark.read.json("D:/flight_data_sample.json")
    print("Data loaded successfully, rows:", df.count())
except Exception as e:
    print("Error loading data:", str(e))
    spark.stop()
    raise e

# Preprocess data
df = df.withColumn("is_delayed", when(col("delay_minutes") > 15, 1).otherwise(0))

# Define features
feature_cols = ["temperature", "wind_speed", "precipitation", "delay_minutes"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Define classifier
rf = RandomForestClassifier(
    labelCol="is_delayed",
    featuresCol="features",
    numTrees=100,
    maxDepth=10,
    seed=42
)

# Create pipeline
pipeline = Pipeline(stages=[assembler, rf])

# Split data
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Train model
try:
    model = pipeline.fit(train_df)
    print("Model trained successfully")
except Exception as e:
    print("Error training model:", str(e))
    spark.stop()
    raise e

# Evaluate on test set
predictions = model.transform(test_df)
correct = predictions.filter(col("is_delayed") == col("prediction")).count()
total = predictions.count()
accuracy = correct / total if total > 0 else 0
print(f"Test accuracy: {accuracy:.2%} ({correct}/{total} correct)")

# Save model
model_path = "D:/flight_delay_model"
try:
    # Remove existing model directory if it exists
    if os.path.exists(model_path):
        shutil.rmtree(model_path)
    model.save(model_path)
    print(f"Model saved to {model_path}")
except Exception as e:
    print("Error saving model:", str(e))
    spark.stop()
    raise e

# Clean up
spark.stop()
print("Spark session stopped")

Spark version: 3.5.5
Hadoop home: D:\hadoop-3.3.6
Data loaded successfully, rows: 1000
Model trained successfully
Test accuracy: 100.00% (162/162 correct)
Model saved to D:/flight_delay_model
Spark session stopped
