# Get Data

In [None]:
!wget -O airports.csv https://assets.datacamp.com/production/repositories/1237/datasets/6e5c4ac2a4799338ba7e13d54ce1fa918da644ba/airports.csv
!wget -O flights.csv https://assets.datacamp.com/production/repositories/1237/datasets/fa47bb54e83abd422831cbd4f441bd30fd18bd15/flights_small.csv
!wget -O planes.csv https://assets.datacamp.com/production/repositories/1237/datasets/231480a2696c55fde829ce76d936596123f12c0c/planes.csv

In [None]:
%pip install pyspark[sql]
%pip install pyspark[pandas_on_spark]

In [74]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
import pyspark.ml.evaluation as evals
import pyspark.ml.tuning as tune
import numpy as np

# Configure Spark Context

In [None]:
conf = pyspark \
  .SparkConf() \
  .setMaster("local") \
  .setAppName("DataCamp Locally") \
  .setAll([("spark.driver.memory","40g"),("spark.executor.memory","50g"),("spark.ui.port", "4040"),("spark.ui.bindAddress", "127.0.0.1")])
sc = SparkContext(conf=conf)
display(sc)

In [42]:
sc.stop()

# Configure Spark Session

In [43]:
spark = SparkSession.builder \
    .appName("DataCamp Locally") \
    .master("local") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.ui.port", "4040") \
    .config("spark.ui.bindAddress", "127.0.0.1") \
    .getOrCreate()

# Read Data CSV

In [44]:
airports = spark.read.csv("/content/airports.csv", header=True, inferSchema=True)
flights = spark.read.csv("/content/flights.csv", header=True, inferSchema=True)
planes = spark.read.csv("/content/planes.csv", header=True, inferSchema=True)

# Read CSV into table

In [45]:
# Create a temporary view
airports.createOrReplaceTempView("airports")
flights.createOrReplaceTempView("flights")
planes.createOrReplaceTempView("planes")

# Now you can run SQL queries against the "airports" table
airport_results = spark.sql("SELECT * FROM airports")
flights_results = spark.sql("SELECT * FROM airports")
planes_results = spark.sql("SELECT * FROM airports")

# Show Results of Query
airport_results.show(5)
flights_results.show(5)
planes_results.show(5)

+---+--------------------+----------+-----------+----+---+---+
|faa|                name|       lat|        lon| alt| tz|dst|
+---+--------------------+----------+-----------+----+---+---+
|04G|   Lansdowne Airport|41.1304722|-80.6195833|1044| -5|  A|
|06A|Moton Field Munic...|32.4605722|-85.6800278| 264| -5|  A|
|06C| Schaumburg Regional|41.9893408|-88.1012428| 801| -6|  A|
|06N|     Randall Airport| 41.431912|-74.3915611| 523| -5|  A|
|09J|Jekyll Island Air...|31.0744722|-81.4277778|  11| -4|  A|
+---+--------------------+----------+-----------+----+---+---+
only showing top 5 rows

+---+--------------------+----------+-----------+----+---+---+
|faa|                name|       lat|        lon| alt| tz|dst|
+---+--------------------+----------+-----------+----+---+---+
|04G|   Lansdowne Airport|41.1304722|-80.6195833|1044| -5|  A|
|06A|Moton Field Munic...|32.4605722|-85.6800278| 264| -5|  A|
|06C| Schaumburg Regional|41.9893408|-88.1012428| 801| -6|  A|
|06N|     Randall Airport| 41.

# To Pandas

In [46]:
airport_results_pd = airport_results.toPandas()
flights_results_pd = flights_results.toPandas()
planes_results_pd = planes_results.toPandas()

display(airport_results_pd)
display(flights_results_pd)
display(planes_results_pd)

Unnamed: 0,faa,name,lat,lon,alt,tz,dst
0,04G,Lansdowne Airport,41.130472,-80.619583,1044,-5,A
1,06A,Moton Field Municipal Airport,32.460572,-85.680028,264,-5,A
2,06C,Schaumburg Regional,41.989341,-88.101243,801,-6,A
3,06N,Randall Airport,41.431912,-74.391561,523,-5,A
4,09J,Jekyll Island Airport,31.074472,-81.427778,11,-4,A
...,...,...,...,...,...,...,...
1392,ZUN,Black Rock,35.083228,-108.791778,6454,-7,A
1393,ZVE,New Haven Rail Station,41.298669,-72.925992,7,-5,A
1394,ZWI,Wilmington Amtrak Station,39.736667,-75.551667,0,-5,A
1395,ZWU,Washington Union Station,38.897460,-77.006430,76,-5,A


Unnamed: 0,faa,name,lat,lon,alt,tz,dst
0,04G,Lansdowne Airport,41.130472,-80.619583,1044,-5,A
1,06A,Moton Field Municipal Airport,32.460572,-85.680028,264,-5,A
2,06C,Schaumburg Regional,41.989341,-88.101243,801,-6,A
3,06N,Randall Airport,41.431912,-74.391561,523,-5,A
4,09J,Jekyll Island Airport,31.074472,-81.427778,11,-4,A
...,...,...,...,...,...,...,...
1392,ZUN,Black Rock,35.083228,-108.791778,6454,-7,A
1393,ZVE,New Haven Rail Station,41.298669,-72.925992,7,-5,A
1394,ZWI,Wilmington Amtrak Station,39.736667,-75.551667,0,-5,A
1395,ZWU,Washington Union Station,38.897460,-77.006430,76,-5,A


Unnamed: 0,faa,name,lat,lon,alt,tz,dst
0,04G,Lansdowne Airport,41.130472,-80.619583,1044,-5,A
1,06A,Moton Field Municipal Airport,32.460572,-85.680028,264,-5,A
2,06C,Schaumburg Regional,41.989341,-88.101243,801,-6,A
3,06N,Randall Airport,41.431912,-74.391561,523,-5,A
4,09J,Jekyll Island Airport,31.074472,-81.427778,11,-4,A
...,...,...,...,...,...,...,...
1392,ZUN,Black Rock,35.083228,-108.791778,6454,-7,A
1393,ZVE,New Haven Rail Station,41.298669,-72.925992,7,-5,A
1394,ZWI,Wilmington Amtrak Station,39.736667,-75.551667,0,-5,A
1395,ZWU,Washington Union Station,38.897460,-77.006430,76,-5,A


# Create Columns

In [47]:
flights = spark.table("flights")
flights = flights.withColumn("duration_hrs", flights.air_time/60)
flights.show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|      duration_hrs|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|               2.2|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|               6.0|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|              1.85|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|1.3833333333333333|
|2014|    3|  9|     754|  

# Filtering

In [48]:
flights_filter1 = flights.filter("distance > 1000")
flights_filter2 = flights.filter(flights.distance > 1000)

flights_filter1.show()
flights_filter2.show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|      duration_hrs|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------------+
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|               6.0|
|2014|    4| 19|    1236|       -4|    1508|       -7|     AS| N309AS|   490|   SEA| SAN|     135|    1050|  12|    36|              2.25|
|2014|   11| 19|    1812|       -3|    2352|       -4|     AS| N564AS|    26|   SEA| ORD|     198|    1721|  18|    12|               3.3|
|2014|    8|  3|    1120|        0|    1415|        2|     AS| N305AS|   656|   SEA| PHX|     154|    1107|  11|    20| 2.566666666666667|
|2014|   11| 12|    2346|  

# Select & Filter

In [49]:
temp = flights.select(flights.origin, flights.dest, flights.carrier)

filter1 = flights.origin == "SEA"
filter2 = flights.dest == "PDX"

temp_filtered = temp.filter(filter1).filter(filter2)
temp_filtered.show()

+------+----+-------+
|origin|dest|carrier|
+------+----+-------+
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     AS|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     AS|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
|   SEA| PDX|     OO|
+------+----+-------+
only showing top 20 rows



In [50]:
avg_speed = (flights.distance/(flights.air_time/60)).alias("avg_speed")
speed1 = flights.select("origin", "dest", "tailnum", avg_speed)
speed1.show()
# OR
speed2 = flights.selectExpr("origin", "dest", "tailnum", "distance/(air_time/60) as avg_speed")
speed2.show()

+------+----+-------+------------------+
|origin|dest|tailnum|         avg_speed|
+------+----+-------+------------------+
|   SEA| LAX| N846VA| 433.6363636363636|
|   SEA| HNL| N559AS| 446.1666666666667|
|   SEA| SFO| N847VA|367.02702702702703|
|   PDX| SJC| N360SW| 411.3253012048193|
|   SEA| BUR| N612AS| 442.6771653543307|
|   PDX| DEN| N646SW|491.40495867768595|
|   PDX| OAK| N422WN|             362.0|
|   SEA| SFO| N361VA| 415.7142857142857|
|   SEA| SAN| N309AS| 466.6666666666667|
|   SEA| ORD| N564AS| 521.5151515151515|
|   SEA| LAX| N323AS| 440.3076923076923|
|   SEA| PHX| N305AS|431.29870129870125|
|   SEA| LAS| N433AS| 409.6062992125984|
|   SEA| ANC| N765AS|474.75409836065575|
|   SEA| SFO| N713AS| 315.8139534883721|
|   PDX| SFO| N27205| 366.6666666666667|
|   SEA| SMF| N626AS|477.63157894736844|
|   SEA| MDW| N8634A|481.38888888888886|
|   SEA| BOS| N597AS| 516.4137931034483|
|   PDX| BUR| N215AG| 441.6216216216216|
+------+----+-------+------------------+
only showing top

# Groupby

In [51]:
flights.filter(flights.origin == "PDX").groupBy().min("distance").show()

+-------------+
|min(distance)|
+-------------+
|          106|
+-------------+



In [52]:
flights.withColumn("duration_hrs", flights.air_time/60).groupBy().sum("duration_hrs").show()

+------------------+
| sum(duration_hrs)|
+------------------+
|25289.600000000126|
+------------------+



# Joining / Get Model Data

In [58]:
planes = planes.withColumnRenamed("year", "plane_year")
model_data = flights.join(planes, on="tailnum", how="leftouter")
model_data.show()

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+------------------+----------+--------------------+--------------+-----------+-------+-----+-----+---------+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|      duration_hrs|plane_year|                type|  manufacturer|      model|engines|seats|speed|   engine|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+------------------+----------+--------------------+--------------+-----------+-------+-----+-----+---------+
| N846VA|2014|   12|  8|     658|       -7|     935|       -5|     VX|  1780|   SEA| LAX|     132|     954|   6|    58|               2.2|      2011|Fixed wing multi ...|        AIRBUS|   A320-214|      2|  182|   NA|Turbo-fan|
| N559AS|2014|    1| 22|    1040|        5|    1505|        5|     AS|   851|   SEA| HNL

# String to Int

In [59]:
model_data = model_data.withColumn("arr_delay", model_data.arr_delay.cast("integer"))
model_data = model_data.withColumn("air_time", model_data.air_time.cast("integer"))
model_data = model_data.withColumn("month", model_data.month.cast("integer"))
model_data = model_data.withColumn("plane_year", model_data.plane_year.cast("integer"))

In [60]:
model_data = model_data.withColumn("plane_age", model_data.year - model_data.plane_year)

In [61]:
model_data = model_data.withColumn("is_late", model_data.arr_delay > 0)
model_data = model_data.withColumn("label", model_data.is_late.cast("integer"))
model_data = model_data.filter("arr_delay is not NULL and dep_delay is not NULL and air_time is not NULL and plane_year is not NULL")

# StringIndexer / OneHotEncoder

In [63]:
carr_indexer = StringIndexer(inputCol="carrier", outputCol="carrier_index")
dest_indexer = StringIndexer(inputCol="dest", outputCol="dest_index")

carr_encoder = OneHotEncoder(inputCol="carrier_index", outputCol="carrier_fact")
dest_encoder = OneHotEncoder(inputCol="dest_index", outputCol="dest_fact")

# Assemble Vector

In [64]:
vec_assembler = VectorAssembler(inputCols=["month", "air_time", "carrier_fact", "dest_fact", "plane_age"], outputCol="features")

# Pipeline

In [65]:
flights_pipe = Pipeline(stages=[dest_indexer, dest_encoder, carr_indexer, carr_encoder, vec_assembler])

# Fit / Transform

In [66]:
piped_data = flights_pipe.fit(model_data).transform(model_data)

# Train Test Split

In [67]:
training, test = piped_data.randomSplit([.6, .4])

# Logistics Regression

In [81]:
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Create a BinaryClassificationEvaluator
evaluator = evals.BinaryClassificationEvaluator(metricName="areaUnderROC")

# Create the parameter grid
grid = tune.ParamGridBuilder()

# Add the hyperparameter
grid = grid.addGrid(lr.regParam, np.arange(0, .1, .01))
grid = grid.addGrid(lr.elasticNetParam, [0, 1])

# Build the grid
grid = grid.build()

# Create the CrossValidator
cv = tune.CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)

# Call lr.fit()
best_lr = lr.fit(training)

# Print best_lr
print(best_lr)

# Use the model to predict the test set
test_results = best_lr.transform(test)

# Evaluate the predictions
print(evaluator.evaluate(test_results))

LogisticRegressionModel: uid=LogisticRegression_27ff46010057, numClasses=2, numFeatures=81
0.6840053489098065


In [82]:
spark.stop()