In [9]:
from pyspark.sql import SparkSession


# Local Spark session for this notebook; the Spark web UI is pinned to port 4050.
# Disabled tuning knob, kept for reference:
#   .config("spark.sql.shuffle.partitions", 20)
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

# Handle to the underlying SparkContext (only needed for RDD-level work).
sc = spark.sparkContext

trainFileName = "./train.csv"
testFileName = "./test.csv"

# Disabled RDD-based preview of a raw file, kept for reference:
#   deerfootRDD = (sc.textFile(fileName, 8))
#   print('\n'.join(deerfootRDD.zipWithIndex().map(lambda x: str(x[1]) + ': ' + str(x[0])).take(15)))

# Read both CSV files, treating the first line as the header and letting
# Spark infer each column's type from the data.
trainDF = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(trainFileName)
)
testDF = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(testFileName)
)

# Print the first row of each frame as a quick sanity check of the load.
print(trainDF.head())
print(testDF.head())




Row(MONTH=7, DAY_OF_WEEK=7, DEP_DEL15=0, DEP_TIME_BLK='1500-1559', DISTANCE_GROUP=3, SEGMENT_NUMBER=3, CONCURRENT_FLIGHTS=26, NUMBER_OF_SEATS=160, CARRIER_NAME='American Airlines Inc.', AIRPORT_FLIGHTS_MONTH=19534, AIRLINE_FLIGHTS_MONTH=79247, AIRLINE_AIRPORT_FLIGHTS_MONTH=7972, AVG_MONTHLY_PASS_AIRPORT=2006675, AVG_MONTHLY_PASS_AIRLINE=11744595, FLT_ATTENDANTS_PER_PASS=9.82082928995461e-05, GROUND_SERV_PER_PASS=0.00017728721959309724, PLANE_AGE=6, DEPARTING_AIRPORT='Douglas Municipal', LATITUDE=35.219, LONGITUDE=-80.936, PREVIOUS_AIRPORT="Chicago O'Hare International", PRCP=0.0, SNOW=0.0, SNWD=0.0, TMAX=95.0, AWND=4.25, CARRIER_HISTORICAL=0.2377091471954006, DEP_AIRPORT_HIST=0.2732357375614027, DAY_HISTORICAL=0.22253848170480506, DEP_BLOCK_HIST=0.25547875927174646)
Row(MONTH=4, DAY_OF_WEEK=6, DEP_DEL15=0, DEP_TIME_BLK='1000-1059', DISTANCE_GROUP=3, SEGMENT_NUMBER=3, CONCURRENT_FLIGHTS=5, NUMBER_OF_SEATS=110, CARRIER_NAME='Delta Air Lines Inc.', AIRPORT_FLIGHTS_MONTH=6837, AIRLINE_FLIG

                                                                                

In [10]:

from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# Column configuration for the model.
# `target_column` is the raw column to predict; `label_column` is the name of
# the indexed label column that the classifier and evaluator read.
target_column = "DEP_DEL15"
label_column = "label"

# Numeric columns passed to the feature vector as-is. The order of this list
# is the order in which they are appended to the assembled vector, so keep it
# stable when comparing runs.
numeric_features = [
    "MONTH", "DAY_OF_WEEK",
    "DISTANCE_GROUP", "SEGMENT_NUMBER",
    "CONCURRENT_FLIGHTS", "NUMBER_OF_SEATS",
    "AIRPORT_FLIGHTS_MONTH", "AIRLINE_FLIGHTS_MONTH", "AIRLINE_AIRPORT_FLIGHTS_MONTH",
    "AVG_MONTHLY_PASS_AIRPORT", "AVG_MONTHLY_PASS_AIRLINE",
    "FLT_ATTENDANTS_PER_PASS", "GROUND_SERV_PER_PASS",
    "PLANE_AGE",
    "LATITUDE", "LONGITUDE",
    "PRCP", "SNOW", "SNWD", "TMAX", "AWND",
    "CARRIER_HISTORICAL", "DEP_AIRPORT_HIST", "DAY_HISTORICAL", "DEP_BLOCK_HIST",
]

# String-valued columns; each one is indexed and one-hot encoded before assembly.
string_features = [
    "DEP_TIME_BLK",
    "CARRIER_NAME",
    "DEPARTING_AIRPORT",
    "PREVIOUS_AIRPORT",
]

def one_hot_name(for_string_feature):
    """Return the one-hot output column name for a string feature.

    The result is the given column name with "_one_hot" appended.
    """
    suffix = "_one_hot"
    return for_string_feature + suffix
def indexer_name(for_string_feature):
    """Return the indexed output column name for a string feature.

    The result is the given column name with "_index" appended.
    """
    suffix = "_index"
    return for_string_feature + suffix

# Fixed seed so the forest's random sampling is repeatable and the reported
# metric can be compared between runs of this cell.
RANDOM_SEED = 42

# --- Categorical features: string -> index -> one-hot vector ---------------
# handleInvalid="keep" puts values not seen while fitting into an extra index
# instead of failing when the test set is transformed.
string_feature_indexers = [
    StringIndexer(
        inputCol=string_feature,
        outputCol=indexer_name(string_feature),
        handleInvalid="keep",
    )
    for string_feature in string_features
]
one_hot_encoders = [
    OneHotEncoder(
        inputCol=indexer_name(string_feature),
        outputCol=one_hot_name(string_feature),
    )
    for string_feature in string_features
]
one_hot_column_names = [one_hot_name(string_feature) for string_feature in string_features]

# --- Label -----------------------------------------------------------------
# Index the target column into the label column read by the classifier.
# With StringIndexer's default ordering the most frequent target value
# becomes label 0.0.
label_indexer = StringIndexer(inputCol=target_column, outputCol=label_column)

# --- Feature vector: one-hot columns first, then the raw numeric columns ----
assembler = VectorAssembler(
    inputCols=one_hot_column_names + numeric_features,
    outputCol="features",
)

# --- Model -----------------------------------------------------------------
classifier = RandomForestClassifier(
    featuresCol="features",
    labelCol=label_column,
    seed=RANDOM_SEED,
)

# Stage order matters: indexers feed the encoders, whose outputs (plus the
# label) must exist before assembly and training.
pipeline = Pipeline(
    stages=string_feature_indexers
    + one_hot_encoders
    + [label_indexer, assembler, classifier]
)

# Fit every stage on the training data only, then score the held-out test data.
model = pipeline.fit(trainDF)
predictions = model.transform(testDF)

# NOTE(review): accuracy is the only metric reported here. If the target
# classes are imbalanced, accuracy can look good for a model that mostly
# predicts the majority class — consider also reporting "f1" or an AUC metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol=label_column, predictionCol="prediction", metricName="accuracy"
)

accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.2f}")


24/12/02 14:01:34 WARN MemoryStore: Not enough space to cache rdd_217_0 in memory! (computed 347.9 MiB so far)
24/12/02 14:01:34 WARN BlockManager: Persisting block rdd_217_0 to disk instead.
24/12/02 14:01:39 WARN MemoryStore: Not enough space to cache rdd_217_0 in memory! (computed 347.9 MiB so far)
24/12/02 14:01:42 WARN MemoryStore: Not enough space to cache rdd_217_1 in memory! (computed 347.9 MiB so far)
24/12/02 14:01:42 WARN BlockManager: Persisting block rdd_217_1 to disk instead.
24/12/02 14:01:47 WARN MemoryStore: Not enough space to cache rdd_217_1 in memory! (computed 347.9 MiB so far)
24/12/02 14:01:50 WARN MemoryStore: Not enough space to cache rdd_217_2 in memory! (computed 347.9 MiB so far)
24/12/02 14:01:50 WARN BlockManager: Persisting block rdd_217_2 to disk instead.
24/12/02 14:01:55 WARN MemoryStore: Not enough space to cache rdd_217_2 in memory! (computed 347.9 MiB so far)
24/12/02 14:01:58 WARN MemoryStore: Not enough space to cache rdd_217_3 in memory! (compute

Test Accuracy: 0.81


                                                                                