<img src="https://nycwheelchairtransportation.com/wp-content/uploads/2020/07/plane_clipart_delay-removebg-preview.png" style="display: block; margin: auto; width: 25%;" height="200" title= "Flight Delays">

# 1.Data Acquisition

## 1.1 Libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnull, when, count, col
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.storagelevel import StorageLevel
from pyspark.ml.classification import LogisticRegression, GBTClassifier, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from prettytable import PrettyTable

import plotly.graph_objs as go
import matplotlib.pyplot as plt
import pandas as pd

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

## 1.2 Loading Data

In [2]:
# Create a SparkSession with appropriate settings
spark = SparkSession.builder \
        .appName("AirlineDelays") \
        .config('spark.master', 'local[*]') \
        .config("spark.default.parallelism", "16") \
        .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")
        
# read the CSV file into a PySpark DataFrame with explicit schema
schema = StructType([
    StructField('YEAR', IntegerType(), True),
    StructField('MONTH', IntegerType(), True),
    StructField('DAY', IntegerType(), True),
    StructField('DAY_OF_WEEK', IntegerType(), True),
    StructField('AIRLINE', StringType(), True),
    StructField('FLIGHT_NUMBER', IntegerType(), True),
    StructField('TAIL_NUMBER', StringType(), True),
    StructField('ORIGIN_AIRPORT', StringType(), True),
    StructField('DESTINATION_AIRPORT', StringType(), True),
    StructField('SCHEDULED_DEPARTURE', IntegerType(), True),
    StructField('DEPARTURE_TIME', IntegerType(), True),
    StructField('DEPARTURE_DELAY', IntegerType(), True),
    StructField('TAXI_OUT', IntegerType(), True),
    StructField('WHEELS_OFF', IntegerType(), True),
    StructField('SCHEDULED_TIME', IntegerType(), True),
    StructField('ELAPSED_TIME', IntegerType(), True),
    StructField('AIR_TIME', IntegerType(), True),
    StructField('DISTANCE', IntegerType(), True),
    StructField('WHEELS_ON', IntegerType(), True),
    StructField('TAXI_IN', IntegerType(), True),
    StructField('SCHEDULED_ARRIVAL', IntegerType(), True),
    StructField('ARRIVAL_TIME', IntegerType(), True),
    StructField('ARRIVAL_DELAY', IntegerType(), True),
    StructField('DIVERTED', IntegerType(), True),
    StructField('CANCELLED', IntegerType(), True),
    StructField('CANCELLATION_REASON', StringType(), True),
    StructField('AIR_SYSTEM_DELAY', IntegerType(), True),
    StructField('SECURITY_DELAY', IntegerType(), True),
    StructField('AIRLINE_DELAY', IntegerType(), True),
    StructField('LATE_AIRCRAFT_DELAY', IntegerType(), True),
    StructField('WEATHER_DELAY', IntegerType(), True)
])

# Read the CSV file into a PySpark DataFrame
df = spark.read \
        .format("csv") \
        .option("header", "true") \
        .schema(schema) \
        .option("numPartitions", "16") \
        .load("flights.csv")

# Shuffle the DataFrame
df = df.orderBy(F.rand())

# persist for faster access
df= df.persist(StorageLevel.MEMORY_AND_DISK)

# show a sample of 10 rows
df.show(10)

+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+
|YEAR|MONTH|DAY|DAY_OF_WEEK|AIRLINE|FLIGHT_NUMBER|TAIL_NUMBER|ORIGIN_AIRPORT|DESTINATION_AIRPORT|SCHEDULED_DEPARTURE|DEPARTURE_TIME|DEPARTURE_DELAY|TAXI_OUT|WHEELS_OFF|SCHEDULED_TIME|ELAPSED_TIME|AIR_TIME|DISTANCE|WHEELS_ON|TAXI_IN|SCHEDULED_ARRIVAL|ARRIVAL_TIME|ARRIVAL_DELAY|DIVERTED|CANCELLED|CANCELLATION_REASON|AIR_SYSTEM_DELAY|SECURITY_DELAY|AIRLINE_DELAY|LATE_AIRCRAFT_DELAY|WEATHER_DELAY|
+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+-

# 2.Data Pre-processing

## 2.1 Data Exploration and Analysis

In [3]:
# Group by airline and calculate the number of delayed and non-delayed flights
airline_stats = df.groupBy('AIRLINE') \
                  .agg(F.sum(F.when(F.col('DEPARTURE_DELAY') > 0, 1).otherwise(0)).alias('Delayed'),
                       F.sum(F.when(F.col('DEPARTURE_DELAY') <= 0, 1).otherwise(0)).alias('Non_Delayed'))

# Convert to a Pandas DataFrame
airline_stats_pd = airline_stats.toPandas()

# Define colors for delayed flights and on-time flights
colors = ['limegreen', 'orangered'] 

# Create a stacked bar chart using plotly.graph_objs
data = [go.Bar(x=airline_stats_pd['AIRLINE'], y=airline_stats_pd['Non_Delayed'],
                name='Non-Delayed Flights', marker=dict(color=colors[0])),
        go.Bar(x=airline_stats_pd['AIRLINE'], y=airline_stats_pd['Delayed'],
                name='Delayed Flights', marker=dict(color=colors[1]))]

layout = go.Layout(title='Delayed and Non-Delayed Flights per Airline',
                   xaxis=dict(title='Airline'),
                   yaxis=dict(title='Number of Flights'),
                   barmode='stack',
                   width=750,
                   height= 500,
                   legend=dict(x=0.25, y=1.1, orientation='h')
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [4]:
# Calculate the number of delayed and non-delayed flights
delayed_flights = df.filter(F.col('DEPARTURE_DELAY') > 0).count()
non_delayed_flights = df.filter(F.col('DEPARTURE_DELAY') <= 0).count()

# Define the labels, sizes, and colors for the pie chart
labels = ['Delayed Flights', 'Non-Delayed Flights']
sizes = [delayed_flights, non_delayed_flights]
colors = ['#FF5733', '#8BC34A']

# Create the 3D pie chart
fig = go.Figure(data=[go.Pie(labels=labels, values=sizes, pull=[0.1, 0], marker=dict(colors=colors),
                            textposition= 'outside', textinfo='label+percent', insidetextorientation='radial')])
fig.update_layout(title={'text': 'Percentage of Delayed and Non-Delayed Flights', 'x':0.5, 'y':0.95},
                  width=750, height=500, showlegend=True, legend=dict(x=0.15, y=1.1, orientation='h'))

fig.show()

In [5]:
# Count the number of null values in each column
null_counts = [df.filter(F.col(c).isNull()).count() for c in df.columns]

# Create a bar chart using plotly.graph_objs
fig = go.Figure(
    go.Bar(x=df.columns, y=null_counts, marker_color='dimgray')
)

# Add x-axis and y-axis labels
fig.update_layout(
    title='Null Value Counts for Each Attribute',
    xaxis_title='Attribute',
    yaxis_title='Number of Null Values',
    width=750,
    height= 500
)

# Rotate x-axis tick labels by 90 degrees
fig.update_layout(xaxis_tickangle=-90)

fig.show()

In [6]:
# Group by origin airport and month, and count the number of flights
airport_counts = df.groupBy('ORIGIN_AIRPORT', 'MONTH') \
                      .count() \
                      .orderBy('MONTH', 'ORIGIN_AIRPORT')

# Convert to a Pandas DataFrame
airport_counts_pd = airport_counts.toPandas()

# Create a scatter plot using plotly.graph_objs
data = go.Scatter(
        x=airport_counts_pd['ORIGIN_AIRPORT'],
        y=airport_counts_pd['MONTH'],
        mode='markers',
        marker=dict(
            size=(airport_counts_pd['count']/30)**0.5,
            sizemode='diameter',
            color=airport_counts_pd['count'],
            colorscale='Viridis',
            opacity=0.8,
            showscale=True,
        )
)

# Add a title and labels to the plot
layout = go.Layout(
    title='Number of Flights by Origin Airport and Month',
    xaxis=dict(title='Origin Airport'),
    yaxis=dict(title='Month', tickvals=list(range(airport_counts_pd['MONTH'].min(),
                                             airport_counts_pd['MONTH'].max()+1))),
    width=750,
    height=500
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [7]:
# Group flights by month and count number of flights
flights_per_month = df.groupBy('MONTH') \
                      .count() \
                      .orderBy('MONTH')

# Group delayed flights by month and count number of delayed flights
delayed_flights_per_month = df.filter(col('DEPARTURE_DELAY') > 0) \
                               .groupBy('MONTH') \
                               .count() \
                               .orderBy('MONTH')

# Convert to Pandas dataframes for visualization
flights_per_month_pd = flights_per_month.toPandas()
delayed_flights_per_month_pd = delayed_flights_per_month.toPandas()

# Create line plots
data = [
    go.Scatter(
        x=flights_per_month_pd['MONTH'],
        y=flights_per_month_pd['count'],
        mode='lines',
        name='Flights',
        line=dict(color='blue')
    ),
    go.Scatter(
        x=delayed_flights_per_month_pd['MONTH'],
        y=delayed_flights_per_month_pd['count'],
        mode='lines',
        name='Delayed Flights',
        line=dict(color='orangered')
    )
]

# Add a title and labels to the plot
layout = go.Layout(
    title='Number of Flights and Delayed Flights by Month',
    xaxis=dict(title='Month', dtick=1, range=[flights_per_month_pd['MONTH'].min()-0.5,
                                               flights_per_month_pd['MONTH'].max()+0.5]),
    yaxis=dict(title='Number of Flights'),
    width=750,
    height=500,
    legend=dict(x=0.31, y=1.1, orientation='h')
)

fig = go.Figure(data=data, layout=layout)
fig.show()

## 2.2 Data Cleaning

- Selection of Significant Attributes

In [8]:
# Select a subset of columns from the `df` DataFrame
df_agg = df.select('MONTH','DAY','DAY_OF_WEEK','AIRLINE','ORIGIN_AIRPORT',
                                      'SCHEDULED_DEPARTURE','SCHEDULED_TIME',
                                      'DISTANCE','SCHEDULED_ARRIVAL','DEPARTURE_DELAY')

# Remove any rows with missing values from `df_agg`
df_agg = df_agg.dropna(how="any")

In [9]:
# Unpersist to free up resources
df.unpersist();

In [10]:
# Persist `df_agg` in memory and on disk for faster access
df_agg = df_agg.persist(StorageLevel.MEMORY_AND_DISK)

- Duplicates

In [11]:
# Check for duplicates
df_duplicates = df_agg.dropDuplicates()

# Count the number of duplicates
num_duplicates = df_agg.count() - df_duplicates.count()

# Print the number of duplicates
print("Number of duplicates:", num_duplicates)

Number of duplicates: 38


## 2.3 Data Preparation

- Create a new column 'Delay' which gives 0 value for early or exact arrivals and value 1 for late arrivals

In [12]:
df_agg = df_agg.withColumn('DELAY', when(df_agg['DEPARTURE_DELAY'] <= 0, 0).otherwise(1))
df_agg.show(10)

+-----+---+-----------+-------+--------------+-------------------+--------------+--------+-----------------+---------------+-----+
|MONTH|DAY|DAY_OF_WEEK|AIRLINE|ORIGIN_AIRPORT|SCHEDULED_DEPARTURE|SCHEDULED_TIME|DISTANCE|SCHEDULED_ARRIVAL|DEPARTURE_DELAY|DELAY|
+-----+---+-----------+-------+--------------+-------------------+--------------+--------+-----------------+---------------+-----+
|    6| 13|          6|     OO|           ORD|               1556|           149|     802|             1825|              0|    0|
|    7|  9|          4|     DL|           ATL|               2311|            55|     151|             2306|             -2|    0|
|    6| 16|          2|     VX|           LAX|               1950|            75|     337|             2105|             74|    1|
|   10| 29|          4|     OO|         13930|               2050|            48|      84|             2238|             13|    1|
|    7| 14|          2|     DL|           PDX|                 45|           191|  

- Data Balance

In [13]:
# Check the percentage of delayed and not delayed flights
isNot_delay = df_agg.filter(col('DELAY') == 0).count()
total = df_agg.count()
isNot_delay_perc = float(isNot_delay)/total
delay_perc = 1 - isNot_delay_perc
print("Not Delayed Flights Percentage: {:.2%}".format(isNot_delay_perc))
print("Delayed Flights Percentage: {:.2%}".format(delay_perc))

Not Delayed Flights Percentage: 62.92%
Delayed Flights Percentage: 37.08%


- Under-sampling

In [14]:
total = df_agg.count()
isNot_delay = df_agg.filter(col('DELAY') == 0).count()
delay = total - isNot_delay

# undersample isNot_delay rows to balance the number of delay rows
isNot_delay_df = df_agg.filter(col('DELAY') == 0)
isNot_delay_sample = isNot_delay_df.sample(fraction=float(delay)/isNot_delay, seed=5)

isNot_delay_sample.show(10)

+-----+---+-----------+-------+--------------+-------------------+--------------+--------+-----------------+---------------+-----+
|MONTH|DAY|DAY_OF_WEEK|AIRLINE|ORIGIN_AIRPORT|SCHEDULED_DEPARTURE|SCHEDULED_TIME|DISTANCE|SCHEDULED_ARRIVAL|DEPARTURE_DELAY|DELAY|
+-----+---+-----------+-------+--------------+-------------------+--------------+--------+-----------------+---------------+-----+
|    6| 13|          6|     OO|           ORD|               1556|           149|     802|             1825|              0|    0|
|    7|  9|          4|     DL|           ATL|               2311|            55|     151|             2306|             -2|    0|
|    9| 30|          3|     AA|           MCO|                520|            94|     468|              654|             -7|    0|
|    1| 23|          5|     MQ|           GSO|               1028|            92|     461|             1200|             -4|    0|
|    3|  5|          4|     DL|           MSP|               2155|            56|  

- Balanced Sample

In [15]:
# create balanced data of delay and isNotdelay
delay_df = df_agg.filter(col('DELAY') == 1)
B_df_agg = isNot_delay_sample.union(delay_df)

In [16]:
# Remove `df_agg` from memory and disk to free up resources
df_agg.unpersist();

- Data Balance after Under-sampling

In [17]:
# Check the percentage of delayed and not delayed flights in the new balanced data frame
isNot_delay_new = B_df_agg.filter(col('DELAY') == 0).count()
total_new = B_df_agg.count()
isNot_delay_perc_new= float(isNot_delay_new)/total_new
delay_perc_new = 1 - isNot_delay_perc_new
print("Not Delayed Flights Percentage: {:.2%}".format(isNot_delay_perc_new))
print("Delayed Flights Percentage: {:.2%}".format(delay_perc_new))

Not Delayed Flights Percentage: 50.00%
Delayed Flights Percentage: 50.00%


In [18]:
# persist for faster access
B_df_agg.persist(StorageLevel.MEMORY_AND_DISK);

In [19]:
# Count the month of October
B_df_agg.filter(col("MONTH") == 10).count()

344922

- Drop incorrect IATA codes values

In [20]:
B_df_agg = B_df_agg.filter(col("MONTH") != 10)

- Convert ORIGIN_AIRPORT and AIRLINE into unique integers

In [21]:
# Define StringIndexer for AIRLINE column
airline_indexer = StringIndexer(inputCol="AIRLINE", outputCol="AIRLINE_ID")
# Fit StringIndexer to data
airline_indexer_model = airline_indexer.fit(B_df_agg)
# Transform data using StringIndexer
B_df_agg = airline_indexer_model.transform(B_df_agg)

# Define StringIndexer for ORIGIN_AIRPORT column
airport_indexer = StringIndexer(inputCol="ORIGIN_AIRPORT", outputCol="ORIGIN_AIRPORT_ID")
# Fit StringIndexer to data
airport_indexer_model = airport_indexer.fit(B_df_agg)
# Transform data using StringIndexer
B_df_agg = airport_indexer_model.transform(B_df_agg)

# Cast the output columns to integer data type
B_df_agg = B_df_agg.withColumn("AIRLINE_ID", B_df_agg["AIRLINE_ID"].cast("integer"))
B_df_agg = B_df_agg.withColumn("ORIGIN_AIRPORT_ID", B_df_agg["ORIGIN_AIRPORT_ID"].cast("integer"))

# Replace original columns with new columns
B_df_agg = B_df_agg \
    .withColumn("AIRLINE", B_df_agg["AIRLINE_ID"].cast("integer")) \
    .withColumn("ORIGIN_AIRPORT", B_df_agg["ORIGIN_AIRPORT_ID"].cast("integer")) \
    .drop("AIRLINE_ID", "ORIGIN_AIRPORT_ID")

B_df_agg.show(10)

+-----+---+-----------+-------+--------------+-------------------+--------------+--------+-----------------+---------------+-----+
|MONTH|DAY|DAY_OF_WEEK|AIRLINE|ORIGIN_AIRPORT|SCHEDULED_DEPARTURE|SCHEDULED_TIME|DISTANCE|SCHEDULED_ARRIVAL|DEPARTURE_DELAY|DELAY|
+-----+---+-----------+-------+--------------+-------------------+--------------+--------+-----------------+---------------+-----+
|    6| 13|          6|      3|             1|               1556|           149|     802|             1825|              0|    0|
|    7|  9|          4|      1|             0|               2311|            55|     151|             2306|             -2|    0|
|    9| 30|          3|      2|             9|                520|            94|     468|              654|             -7|    0|
|    1| 23|          5|      6|            97|               1028|            92|     461|             1200|             -4|    0|
|    3|  5|          4|      1|            11|               2155|            56|  

# 3.Feature Engineering

- One Hot Encoding

In [22]:
# Define a list of column names to select from the `B_df_agg` DataFrame
selected_columns = ['MONTH','DAY','DAY_OF_WEEK','AIRLINE','ORIGIN_AIRPORT']

# Create a list of OneHotEncoder transformers, one for each column in `selected_columns`
encoders = [OneHotEncoder(inputCol=column, outputCol=column+"_vec") for column in selected_columns]

# Create a Pipeline with the encoders as stages
pipeline_SC = Pipeline(stages=encoders)

# Transform the `B_df_agg` DataFrame by fitting the pipeline to it and applying the transformations
B_df_agg2 = pipeline_SC.fit(B_df_agg).transform(B_df_agg)

# Show the first 5 rows of the transformed DataFrame
B_df_agg2.show(5)

+-----+---+-----------+-------+--------------+-------------------+--------------+--------+-----------------+---------------+-----+--------------+---------------+---------------+--------------+------------------+
|MONTH|DAY|DAY_OF_WEEK|AIRLINE|ORIGIN_AIRPORT|SCHEDULED_DEPARTURE|SCHEDULED_TIME|DISTANCE|SCHEDULED_ARRIVAL|DEPARTURE_DELAY|DELAY|     MONTH_vec|        DAY_vec|DAY_OF_WEEK_vec|   AIRLINE_vec|ORIGIN_AIRPORT_vec|
+-----+---+-----------+-------+--------------+-------------------+--------------+--------+-----------------+---------------+-----+--------------+---------------+---------------+--------------+------------------+
|    6| 13|          6|      3|             1|               1556|           149|     802|             1825|              0|    0|(12,[6],[1.0])|(31,[13],[1.0])|  (7,[6],[1.0])|(13,[3],[1.0])|   (321,[1],[1.0])|
|    7|  9|          4|      1|             0|               2311|            55|     151|             2306|             -2|    0|(12,[7],[1.0])| (31,[9

In [23]:
# Free up memory and disk
B_df_agg.unpersist();

- Vector Assembling

In [24]:
# Create a VectorAssembler with the input columns to include in the features vector and the output column name
vector_assembler = VectorAssembler(inputCols=['MONTH_vec', 'DAY_vec', 'DAY_OF_WEEK_vec','AIRLINE_vec','SCHEDULED_DEPARTURE',
                                        'ORIGIN_AIRPORT_vec', 'SCHEDULED_TIME', 'DISTANCE', 'SCHEDULED_ARRIVAL'], outputCol="FEATURES")

# Transform the `B_df_agg2` DataFrame by applying the VectorAssembler to create a new column "FEATURES"
converted = vector_assembler.transform(B_df_agg2)

# Select only the "DELAY" and "FEATURES" columns from the transformed DataFrame
converted = converted.select(['DELAY', 'FEATURES'])

# Show the first 10 rows of the transformed DataFrame
converted.show(10)

+-----+--------------------+
|DELAY|            FEATURES|
+-----+--------------------+
|    0|(388,[6,25,49,53,...|
|    0|(388,[7,21,47,51,...|
|    0|(388,[9,42,46,52,...|
|    0|(388,[1,35,48,56,...|
|    0|(388,[3,17,47,51,...|
|    0|(388,[8,31,46,63,...|
|    0|(388,[6,29,46,59,...|
|    0|(388,[7,19,45,54,...|
|    0|(388,[1,21,48,54,...|
|    0|(388,[5,26,47,51,...|
+-----+--------------------+
only showing top 10 rows



- Split data into training and testing

In [25]:
# Split data into training and testing sets
train_data, test_data = converted.randomSplit([0.8, 0.2], seed=42)

In [26]:
# Persist the `train_data` DataFrame in memory and on disk with serialized values for faster access
train_data.persist(StorageLevel.MEMORY_AND_DISK_DESER)

# Persist the `test_data` DataFrame in memory and on disk with serialized values for faster access
test_data.persist(StorageLevel.MEMORY_AND_DISK_DESER)

DataFrame[DELAY: int, FEATURES: vector]

# 4.Models

## 4.1 Logistic Regression Model

- Create Model

In [27]:
# Create logistic regression model
LR = LogisticRegression(featuresCol='FEATURES', labelCol='DELAY')

# Define the pipeline
LR_pipeline = Pipeline(stages=[LR])

# Fit the model on training data
LR_model = LR_pipeline.fit(train_data)

- Predict

In [28]:
# Make predictions on test data
LR_predictions = LR_model.transform(test_data)

## 4.2 Gradient-Boosted Tree

- Create Model

In [29]:
# Create a Gradient-Boosted Tree classifier with the label column set to "DELAY" and the features column set to "FEATURES"
GBT = GBTClassifier(labelCol="DELAY", featuresCol="FEATURES")

# Define the pipeline
GBT_pipeline = Pipeline(stages=[GBT])

# Train the GBT model on the `train_data` DataFrame
GBT_model = GBT_pipeline.fit(train_data)

- Predict

In [30]:
# Use the trained GBT model to make predictions on the `test_data` DataFrame
GBT_predictions = GBT_model.transform(test_data)

## 4.3 SVM

- Create Model

In [31]:
# Define the SVM model with a linear kernel
SVM = LinearSVC(maxIter=10, regParam=0.1, featuresCol="FEATURES", labelCol="DELAY")

# Define the pipeline
SVM_pipeline = Pipeline(stages=[SVM])

# Fit the model to the training data
SVM_model = SVM_pipeline.fit(train_data)

- Predict

In [32]:
# Make predictions on the test data
SVM_predictions = SVM_model.transform(test_data)

# 5.Model Evaluation

- Accuracy Checking

In [36]:
# Define the evaluators
binary_evaluator = BinaryClassificationEvaluator(labelCol="DELAY", rawPredictionCol= "rawPrediction")

# Evaluate the models and store the metrics in a dictionary
metrics = {}
for model, predictions in [("Logistic Regression", LR_predictions),
                           ("Gradient Boosted Trees", GBT_predictions),
                           ("Support Vector Machines", SVM_predictions)]:
    metrics[model] = {}
    metrics[model]["Accuracy"] = binary_evaluator.evaluate(predictions)
    metrics[model]["ROC AUC"] = binary_evaluator.setMetricName("areaUnderROC").evaluate(predictions)
    metrics[model]["PR AUC"] = binary_evaluator.setMetricName("areaUnderPR").evaluate(predictions)

from prettytable import PrettyTable

# Print the metrics in a table
table = PrettyTable(['Model', 'Accuracy', 'ROC AUC', 'PR AUC'])
for model, metric in metrics.items():
    row = [model]
    for m in metric.values():
        row.append('{:.2%}'.format(m))
    table.add_row(row)

print(table)

+-------------------------+----------+---------+--------+
|          Model          | Accuracy | ROC AUC | PR AUC |
+-------------------------+----------+---------+--------+
|   Logistic Regression   |  66.72%  |  66.72% | 66.16% |
|  Gradient Boosted Trees |  67.27%  |  67.70% | 67.27% |
| Support Vector Machines |  65.80%  |  66.54% | 65.80% |
+-------------------------+----------+---------+--------+


In [34]:
# Unpersist to free up memory and disk
train_data.unpersist();

# Unpersist to free up memory and disk
test_data.unpersist();