In [0]:
from pyspark import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType
from pyspark.sql import SQLContext, SparkSession
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from pyspark.sql import types
import pyspark.sql.functions as F
# Seed for any randomized step.
# NOTE(review): SEED is not referenced in the cells visible here — confirm it is used downstream.
SEED = 7

import warnings
# Suppress all Python warnings raised after this point in the session.
warnings.filterwarnings('ignore')

In [0]:
# Look up the user running this notebook from the Databricks notebook context tags.
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
username = notebook_context.tags().apply('user')

# Per-user scratch area on DBFS.
private_path = f"dbfs:/user/{username}"

# Shared project directory that every input and output in this notebook lives under.
project_path = "dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject"




In [0]:
# Which flight split to process; interpolated into the input and output parquet names below.
dataset = "test"

# Number of look-back hours before departure to join and pivot.
# WARNING: when changing this, check that the hourN columns named in the
# weather-augmentation query and in the join's drop() list cover range(2, 2 + hours).
hours = 4


## Load weather and flight data

In [0]:
# Directory names of the prepared parquet datasets under project_path.
weather_ready = "weather_all.parquet"
flights_ready = f"{dataset}_flights.parquet"

# Weather observations: load, show the schema, and expose to SQL as "weather2".
weather = (
    spark.read
    .option("header", "true")
    .parquet(f"{project_path}/{weather_ready}/*.parquet")
)
weather.printSchema()
weather.createOrReplaceTempView("weather2")

# Flights for the selected split: same steps, exposed to SQL as "flights2".
flights = (
    spark.read
    .option("header", "true")
    .parquet(f"{project_path}/{flights_ready}/*.parquet")
)
flights.printSchema()
flights.createOrReplaceTempView("flights2")

# Row counts for both inputs (each count() triggers a Spark job).
print(f"weather: {weather.count()} rows")
print(f"flights: {flights.count()} rows")


## Augment with time block data

In [0]:
# Add one timestamp column per look-back hour so the flights join can match on equality.
#   hour{i} = (observation time truncated to the hour) + (i + 1) hours,
#   for i in range(2, 2 + hours).
# With hours = 4 this yields hour2..hour5 at offsets +3..+6, i.e. exactly the
# columns this query used to hard-code. Generating them from `hours` keeps the
# query in step with the join loop, which reads weather3.hour{i} for the same range.
# Other cells that name hourN columns explicitly must cover the same range.
hour_exprs = [
    f"date_trunc('HOUR', date) + interval {i + 1} hour as hour{i}"
    for i in range(2, 2 + hours)
]
# Join the whole select list so that hours == 0 does not leave a dangling comma.
select_list = ",\n    ".join(["weather2.*"] + hour_exprs)

weather = spark.sql(f"""
  select
    {select_list}
  from
    weather2
""").cache()
# Expose the augmented weather table to SQL as "weather3" and show its schema.
weather.createOrReplaceTempView("weather3")
weather.printSchema()

## Join flights and weather

In [0]:
# Build one flights x weather join per look-back hour i in range(2, 2 + hours).
# Each weather row is matched to flights whose dep_hour_utc equals the row's
# hour{i} column and whose origin OR destination is the weather station's airport.
# Two marker columns are added for the later pivot:
#   weather_is_origin - true when the weather row belongs to the origin airport
#   hour_diff         - the look-back index i of this join
# Results are cached, registered as temp views join{i}, and collected in `joins`.

# Helper columns to strip from the join output. They are discovered from
# weather3 itself rather than hard-coded as hour2..hour5, so no hourN column is
# left behind (or wrongly assumed) when `hours` or the weather query changes.
weather_hour_cols = [
    c for c in spark.table("weather3").columns
    if c.startswith("hour") and c[4:].isdigit()
]

joins = []
for i in range(2, 2+hours):
  join_str = f"(weather3.hour{i} == flights2.dep_hour_utc)"
  data = spark.sql(f"""
    select
        flights2.*,
        weather3.*,
        (weather3.airport == flights2.origin) as weather_is_origin,
        {i} as hour_diff
    from
        weather3,
        flights2
    where
        (weather3.airport == flights2.origin or weather3.airport == flights2.destination)
        and ({join_str})
    """) \
    .drop("airport", "date", *weather_hour_cols) \
    .cache()
  data.createOrReplaceTempView(f"join{i}")
  # data.count()  # optional: uncomment to run the join immediately
  joins.append(data)

In [0]:
# Sanity check: the join loop appends one DataFrame per look-back hour, so this should equal `hours`.
len(joins)

In [0]:
# Inspect the column layout of the first per-hour join.
joins[0].printSchema()

In [0]:
# Stack the per-hour joins into one long DataFrame, printing the index of each
# join as it is appended. union() matches columns by position.
data = joins[0]
for i, hourly_join in enumerate(joins[1:], start=1):
    print(i)
    data = data.union(hourly_join)
    
    

In [0]:
# Show the schema of the combined (unioned) join result.
data.printSchema()

In [0]:
# Toggle for every write in this notebook; set to False for a dry run.
save = True
if save:
    # Persist the long-format join, replacing any previous output for this split.
    join_output = f"{project_path}/{dataset}_join.parquet"
    data.write.format("parquet").mode("overwrite").save(join_output)

In [0]:
# Re-read the saved join from its parquet part files so later cells work from the stored copy.
join_glob = f"{project_path}/{dataset}_join.parquet/*.parquet"
data = spark.read.option("header", "true").parquet(join_glob)
data.count()

In [0]:
# Reload the raw weather table, re-register it as "weather2", and preview ten rows.
weather = (
    spark.read
    .option("header", "true")
    .parquet(f"{project_path}/{weather_ready}/*.parquet")
)
weather.printSchema()
weather.createOrReplaceTempView("weather2")
weather_preview = spark.sql("select * from weather2 limit 10")
display(weather_preview)

airport,date,wind_angle,wind_speed,ceiling,visibility,temperature,dewpoint,pressure
DTA,2017-01-01T06:55:00.000+0000,180,2.6,30,0,,,9999.9
DTA,2017-01-01T07:15:00.000+0000,170,2.1,30,0,-15.0,-16.0,9999.9
DTA,2017-01-01T07:35:00.000+0000,999,0.0,30,0,-15.0,-16.0,9999.9
DTA,2017-01-01T07:55:00.000+0000,999,0.0,30,0,,,9999.9
DTA,2017-01-01T08:15:00.000+0000,999,0.0,30,0,-15.0,-17.0,9999.9
DTA,2017-01-01T08:35:00.000+0000,120,2.1,30,0,-15.0,-16.0,9999.9
DTA,2017-01-01T08:55:00.000+0000,70,2.1,30,0,,,9999.9
DTA,2017-01-01T09:15:00.000+0000,70,2.1,30,402,-14.0,-15.0,9999.9
DTA,2017-01-01T09:35:00.000+0000,999,0.0,30,402,-14.0,-15.0,9999.9
DTA,2017-01-01T09:55:00.000+0000,70,2.1,30,0,,,9999.9


In [0]:
# Expose the long-format join to SQL under the view name "join" (read by the pivot query).
data.createOrReplaceTempView("join")

In [0]:
# display(spark.sql("""
#   select 
#     * 
#   from 
#     join 
#   where 
#     origin=='ATL' 
#     and crs_dep_time_utc < '2015-01-02'
#   limit
#     10
# """))

In [0]:
# Build the IN-list for the pivot: one "(hour_diff, weather_is_origin) alias" entry
# per look-back hour, origin entries first and destination entries after.
pivot_hours = range(2, 2 + hours)
tlist = [f"({h},true) origin_{h}" for h in pivot_hours]
flist = [f"({h},false) destination_{h}" for h in pivot_hours]
pivot_str = ",\n".join([*tlist, *flist])
print(pivot_str)

In [0]:
# Pivot the long-format "join" view on (hour_diff, weather_is_origin).
# For every pair listed in pivot_str (aliases origin_<h> / destination_<h>), each
# weather measure is aggregated: avg for wind_angle, max for wind_speed, and min
# for ceiling, visibility, temperature, dewpoint and pressure.
# Presumably the output columns are named <alias>_<measure>, e.g. origin_2_wind_angle,
# with one row per distinct combination of the remaining columns — confirm with printSchema().
# NOTE(review): the weather preview in this notebook shows values such as
# wind_angle 999 and pressure 9999.9 that look like missing-value sentinels; if
# so they would distort avg()/min() here — confirm they are cleaned upstream.
pivot = spark.sql(f"""
    select 
        *
    from
        join
    pivot (
        avg(wind_angle) as wind_angle,
        max(wind_speed) as wind_speed,     
        min(ceiling) as ceiling,
        min(visibility) as visibility, 
        min(temperature) as temperature,
        min(dewpoint) as dewpoint,
        min(pressure) as pressure
        for (hour_diff,weather_is_origin) in (
            {pivot_str}
        )
    )
""").cache()
# Optional: uncomment to run the pivot immediately.
# pivot.count()

In [0]:
# Persist the pivoted feature table for this split, replacing any previous output.
if save:
  pivot_output = f"{project_path}/{dataset}.parquet"
  pivot.write.format("parquet").mode("overwrite").save(pivot_output)

In [0]:
# For the training split only, also write the subset of rows whose
# crs_dep_time_utc is before 2015-04-01 to train_3m.parquet.
if save and dataset == "train":
    train_3m = pivot.filter("crs_dep_time_utc < '2015-04-01'")
    train_3m.write.format("parquet").mode("overwrite").save(f"{project_path}/train_3m.parquet")


In [0]:
# Re-read the saved pivot table, register it as the "pivot" view, and count its rows.
pivot_glob = f"{project_path}/{dataset}.parquet/*.parquet"
pivot = spark.read.option("header", "true").parquet(pivot_glob)
pivot.createOrReplaceTempView("pivot")
pivot.count()

In [0]:
# Spot check (training split only): display up to 10 pivoted rows with origin OGG
# and crs_dep_time_utc strictly between '2015-02-01' and '2015-02-02'.
if dataset == "train":
  display(spark.sql("""
  select 
    * 
  from 
    pivot 
  where 
    origin=='OGG' 
    and crs_dep_time_utc > '2015-02-01'
    and crs_dep_time_utc < '2015-02-02'
  limit
    10
"""))

In [0]:
# List the contents of the shared project directory on DBFS.
display(dbutils.fs.ls(project_path))

path,name,size
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/Chitra_feature_data/,Chitra_feature_data/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airline_singleday.parquet/,airline_singleday.parquet/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airlines_3m_features_ext.parquet/,airlines_3m_features_ext.parquet/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airlines_3m_full_features.parquet/,airlines_3m_full_features.parquet/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airlines_weather_data/,airlines_weather_data/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport-timezones.csv,airport-timezones.csv,439779
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport_edges/,airport_edges/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport_edges_1_year/,airport_edges_1_year/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport_edges_3_month/,airport_edges_3_month/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport_edges_4_year/,airport_edges_4_year/,0


In [0]:
# Final marker for this run: name the processed split and show the pivoted schema.
print(f"done with {dataset}")
pivot.printSchema()