# Model for predicting flight delays 

This notebook is a single entry point for running all of the desired models on a given dataset.

## 1. Initial Setup

#### Library

In [0]:
from pyspark import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType
from pyspark.sql import SQLContext, SparkSession

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from pyspark.sql import types
SEED = 7

import warnings
warnings.filterwarnings('ignore')

from IPython.display import Image

#### Spark Setting

In [0]:
# Enable for pretty viewing of tables
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

#### Project Directory

In [0]:
project_path = "dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/"
display(dbutils.fs.ls(project_path))

path,name,size
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/Chitra_feature_data/,Chitra_feature_data/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airline_singleday.parquet/,airline_singleday.parquet/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airlines_3m_features_ext.parquet/,airlines_3m_features_ext.parquet/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airlines_3m_full_features.parquet/,airlines_3m_full_features.parquet/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airlines_weather_data/,airlines_weather_data/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport-timezones.csv,airport-timezones.csv,439779
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport_edges/,airport_edges/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport_edges_1_year/,airport_edges_1_year/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport_edges_3_month/,airport_edges_3_month/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport_edges_4_year/,airport_edges_4_year/,0


#### Variables

In [0]:
# Load training, validation and test data directories
# Run configuration for the whole notebook. The *_dir values are parquet
# directory names that loadData() appends to project_path.
train_dir = "train.parquet"
dev_dir = None
test_dir = None
# mode == 'sample' makes the split cell carve train/dev/test out of train_dir by
# date; any other value loads dev_dir and test_dir as separate datasets.
mode = 'full'
# Name of the binary label column created by preProcessData().
outcome = 'is_delayed'
# Feature-set selector passed to createStages2(): 'basic' uses the reduced
# feature set, any other value adds equipment, avg_* and weather features.
run = 'full'
# NOTE(review): presumably toggles the SMOTE oversampling section; its use is
# not visible in this part of the notebook - confirm.
SMOTE = False

# Location used to save/load the fitted feature pipeline.
save_path = project_path + "Chitra_feature_data"
loadPipelineFromFile = False
savePipeline = False

# NOTE(review): mode is already 'full' above and this branch only sets it to
# 'full' again, so 'sample' is reachable only by editing this cell by hand. If
# train_dir is changed, dev_dir/test_dir stay None while mode stays 'full', and
# the split cell would then call loadData(None, ...). Confirm the intended default.
if train_dir == 'train.parquet':
  mode = 'full'
  dev_dir = "dev.parquet"
  test_dir = "test.parquet"

print(f'Project Path: {project_path} mode:{mode}')

## 2. Data

### 2.1 Load the Data

In [0]:
# Load the training data
def loadData(dirName, viewName):
  """Read every parquet file under project_path + dirName and expose it as a temp view.

  Args:
    dirName: directory name, relative to the notebook-level project_path.
    viewName: name under which the DataFrame is registered as a temp view
      (an existing view of that name is replaced).

  Returns:
    The loaded Spark DataFrame.
  """
  parquet_glob = f'{project_path}{dirName}/*.parquet'
  reader = spark.read.option("header", "true")
  frame = reader.parquet(parquet_glob)
  frame.createOrReplaceTempView(viewName)
  return frame

train_data = loadData(train_dir, 'train_data')

In [0]:
display(train_data)

origin,destination,year,month,day_of_month,day_of_week,crs_dep_time_utc,naive_crs_arr_time_utc,tail_num,op_carrier,distance,dep_blk,dep_del15,nas_delay,carrier_delay,weather_delay,security_delay,late_aircraft_delay,arr_delay,crs_arr_time_utc,arr_hour_utc,dep_hour_utc,hour,del_tail_num,equipment_delayed,dest_pagerank,src_pagerank,weight,avg_trips_per_day,avg_tp_per_day,avg_flights_per_day,avg_car_per_day,avg_car_flights_per_day,origin_2_wind_angle,origin_2_wind_speed,origin_2_ceiling,origin_2_visibility,origin_2_temperature,origin_2_dewpoint,origin_2_pressure,origin_3_wind_angle,origin_3_wind_speed,origin_3_ceiling,origin_3_visibility,origin_3_temperature,origin_3_dewpoint,origin_3_pressure,origin_4_wind_angle,origin_4_wind_speed,origin_4_ceiling,origin_4_visibility,origin_4_temperature,origin_4_dewpoint,origin_4_pressure,origin_5_wind_angle,origin_5_wind_speed,origin_5_ceiling,origin_5_visibility,origin_5_temperature,origin_5_dewpoint,origin_5_pressure,destination_2_wind_angle,destination_2_wind_speed,destination_2_ceiling,destination_2_visibility,destination_2_temperature,destination_2_dewpoint,destination_2_pressure,destination_3_wind_angle,destination_3_wind_speed,destination_3_ceiling,destination_3_visibility,destination_3_temperature,destination_3_dewpoint,destination_3_pressure,destination_4_wind_angle,destination_4_wind_speed,destination_4_ceiling,destination_4_visibility,destination_4_temperature,destination_4_dewpoint,destination_4_pressure,destination_5_wind_angle,destination_5_wind_speed,destination_5_ceiling,destination_5_visibility,destination_5_temperature,destination_5_dewpoint,destination_5_pressure
ABE,ATL,2015,1,30,5,2015-01-30T11:35:00.000+0000,2015-01-30T14:04:00.000+0000,N603AT,DL,692.0,6,0.0,,,,,,-11.0,2015-01-30T14:04:00.000+0000,2015-01-30T14:00:00.000+0000,2015-01-30T11:00:00.000+0000,,,False,9.031150214803477,0.5947237603125133,0.0909090909090909,1.0,8,2.64,2,1.03,751.75,6.7,792.0,11265.0,-20.6,-24.4,1011.9,20.0,6.2,884.0,11265.0,-20.0,-24.4,1012.2,469.6666666666667,7.7,945.0,8047.0,-20.0,-24.4,1013.0,751.75,5.7,396.0,9656.0,-20.0,-24.4,1013.6,330.0,10.3,732.0,16093.0,6.1,-0.6,1021.9,335.0,11.8,792.0,16093.0,8.3,2.2,1021.1,320.0,9.3,975.0,16093.0,10.6,3.9,1020.1,280.0,4.1,1067.0,16093.0,12.8,3.3,1019.1
ABE,ATL,2015,3,31,2,2015-03-31T16:02:00.000+0000,2015-03-31T18:10:00.000+0000,N916EV,EV,692.0,12,0.0,,,,,,-18.0,2015-03-31T18:10:00.000+0000,2015-03-31T18:00:00.000+0000,2015-03-31T16:00:00.000+0000,,,False,9.031150214803477,0.5947237603125133,0.0909090909090909,1.02,8,2.64,4,1.7,275.0,2.6,2134.0,16093.0,-12.8,-14.4,1005.2,629.5,3.1,22000.0,16093.0,-13.3,-14.4,1005.2,624.5,3.6,22000.0,16093.0,-12.8,-13.9,1004.9,275.0,6.2,22000.0,16093.0,-11.7,-13.3,1004.7,250.0,3.1,488.0,16093.0,1.0,0.0,1003.2,474.75,1.5,427.0,16093.0,1.0,0.0,1003.1,220.0,1.5,22000.0,16093.0,9.4,4.4,1018.3,230.0,1.5,22000.0,16093.0,8.9,4.4,1017.8
ABE,ATL,2015,4,24,5,2015-04-24T10:40:00.000+0000,2015-04-24T12:47:00.000+0000,N906AT,DL,692.0,6,0.0,,,,,,4.0,2015-04-24T12:47:00.000+0000,2015-04-24T12:00:00.000+0000,2015-04-24T10:00:00.000+0000,,,False,9.031150214803477,0.5947237603125133,0.0909090909090909,1.0,8,2.64,2,1.03,195.0,7.2,1280.0,16093.0,2.2,-6.7,1000.9,155.0,5.1,884.0,11265.0,2.8,-6.1,1000.9,210.0,6.2,1189.0,16093.0,3.3,-6.7,1001.0,200.0,4.1,1311.0,16093.0,3.9,-6.7,1000.8,320.0,3.6,22000.0,16093.0,11.1,1.7,1016.0,320.0,4.1,22000.0,16093.0,12.2,1.7,1015.8,320.0,4.1,22000.0,16093.0,12.8,3.3,1016.0,320.0,4.1,22000.0,16093.0,13.3,5.0,1016.2
ABE,ATL,2015,5,7,4,2015-05-07T16:02:00.000+0000,2015-05-07T18:08:00.000+0000,N884AS,EV,692.0,12,0.0,,,,,,-5.0,2015-05-07T18:08:00.000+0000,2015-05-07T18:00:00.000+0000,2015-05-07T16:00:00.000+0000,,,False,9.031150214803477,0.5947237603125133,0.0909090909090909,1.0,8,2.64,4,1.7,614.5,1.5,1524.0,16093.0,6.1,3.3,1006.8,579.5,1.5,1402.0,16093.0,4.4,2.8,1007.4,559.5,3.1,1402.0,16093.0,5.0,3.3,1007.6,95.0,2.6,1341.0,16093.0,5.0,3.9,1007.9,350.0,2.6,22000.0,16093.0,24.4,12.2,1020.4,350.0,2.6,22000.0,16093.0,22.8,11.7,1020.4,20.0,1.5,22000.0,16093.0,18.9,12.8,1020.2,30.0,2.1,22000.0,16093.0,18.3,11.1,1019.3
ABE,ATL,2015,6,12,5,2015-06-12T16:02:00.000+0000,2015-06-12T18:10:00.000+0000,N844AS,EV,692.0,12,0.0,,,,,,-10.0,2015-06-12T18:10:00.000+0000,2015-06-12T18:00:00.000+0000,2015-06-12T16:00:00.000+0000,,,False,9.031150214803477,0.5947237603125133,0.0909090909090909,1.0,8,2.64,4,1.7,163.33333333333334,6.2,274.0,11265.0,6.1,3.9,1014.6,120.0,5.1,305.0,11265.0,6.7,4.4,1014.9,163.33333333333334,4.6,427.0,8047.0,7.0,5.0,1014.7,416.3333333333333,4.6,457.0,3219.0,6.7,5.0,1014.8,486.3333333333333,3.1,396.0,16093.0,24.0,20.0,1019.5,230.0,4.1,335.0,16093.0,22.8,19.0,1019.5,230.0,3.6,244.0,16093.0,21.7,19.4,1018.9,220.0,2.1,274.0,16093.0,21.0,19.0,1018.1
ABE,ATL,2015,8,21,5,2015-08-21T16:00:00.000+0000,2015-08-21T18:08:00.000+0000,N852AS,EV,692.0,12,0.0,,,,,,-10.0,2015-08-21T18:08:00.000+0000,2015-08-21T18:00:00.000+0000,2015-08-21T16:00:00.000+0000,,,False,9.031150214803477,0.5947237603125133,0.0909090909090909,1.0,8,2.64,4,1.7,160.0,4.6,1981.0,16093.0,9.4,8.9,1017.9,160.0,3.1,1524.0,16093.0,9.4,8.3,1017.7,320.0,3.1,2134.0,16093.0,10.0,9.4,1017.1,300.0,3.1,1128.0,16093.0,11.1,9.4,1016.3,330.0,1.5,5486.0,16093.0,27.2,21.7,1017.9,320.0,1.5,6096.0,16093.0,24.4,21.7,1017.5,320.0,1.5,4267.0,14484.0,23.3,21.7,1017.2,310.0,2.6,4267.0,14484.0,23.3,21.7,1016.5
ABE,ATL,2015,8,26,3,2015-08-26T16:00:00.000+0000,2015-08-26T18:08:00.000+0000,N861AS,EV,692.0,12,0.0,,,,,,1.0,2015-08-26T18:08:00.000+0000,2015-08-26T18:00:00.000+0000,2015-08-26T16:00:00.000+0000,,,False,9.031150214803477,0.5947237603125133,0.0909090909090909,1.04,8,2.64,4,1.7,609.5,11.8,213.0,16093.0,12.2,11.1,1013.1,237.5,12.4,244.0,3219.0,12.2,11.1,1013.1,599.5,11.8,213.0,6437.0,12.2,11.1,1013.0,213.33333333333331,13.4,152.0,4828.0,12.2,11.1,1012.9,275.0,6.2,152.0,9656.0,9.0,8.0,1010.9,320.0,3.6,22000.0,16093.0,18.3,13.3,1014.9,260.0,8.2,122.0,6437.0,9.0,8.0,1012.7,320.0,2.6,22000.0,16093.0,15.6,12.2,1013.7
ABE,ATL,2016,3,5,6,2016-03-05T11:40:00.000+0000,2016-03-05T13:55:00.000+0000,N990DL,DL,692.0,6,0.0,,,,,,3.0,2016-03-05T13:55:00.000+0000,2016-03-05T13:00:00.000+0000,2016-03-05T11:00:00.000+0000,,,False,9.031150214803477,0.5947237603125133,0.0909090909090909,1.0,8,2.64,2,1.03,160.0,7.7,1097.0,16093.0,-8.9,-11.1,993.8,340.0,8.2,1158.0,16093.0,-8.9,-11.7,994.2,335.0,8.8,1097.0,16093.0,-9.4,-11.7,994.6,345.0,7.7,22000.0,16093.0,-8.9,-11.7,995.2,190.0,5.1,792.0,16093.0,-9.0,-13.0,993.2,190.0,4.6,853.0,16093.0,-9.0,-13.0,993.2,320.0,3.1,22000.0,16093.0,3.3,0.6,1020.9,300.0,2.1,22000.0,16093.0,4.4,1.1,1020.8
ABE,ATL,2016,8,30,2,2016-08-30T17:11:00.000+0000,2016-08-30T19:24:00.000+0000,N391CA,EV,692.0,13,0.0,,,,,,-34.0,2016-08-30T19:24:00.000+0000,2016-08-30T19:00:00.000+0000,2016-08-30T17:00:00.000+0000,,,False,9.031150214803477,0.5947237603125133,0.0909090909090909,1.05,8,2.64,4,1.7,679.5,3.6,22000.0,16093.0,8.9,8.3,1018.9,679.5,4.1,22000.0,16093.0,9.4,8.3,1018.9,674.5,3.6,22000.0,16093.0,10.0,8.9,1019.0,669.5,4.1,22000.0,16093.0,10.6,8.9,1019.1,70.0,6.2,22000.0,16093.0,27.8,20.0,1018.8,70.0,4.1,22000.0,16093.0,26.7,21.1,1018.6,70.0,4.1,22000.0,16093.0,25.0,20.6,1018.5,70.0,4.6,22000.0,16093.0,23.3,20.0,1018.2
ABE,ATL,2016,9,1,4,2016-09-01T21:50:00.000+0000,2016-09-02T00:07:00.000+0000,N753EV,EV,692.0,17,1.0,14.0,0.0,0.0,0.0,40.0,54.0,2016-09-02T00:07:00.000+0000,2016-09-02T00:00:00.000+0000,2016-09-01T21:00:00.000+0000,,,False,9.031150214803477,0.5947237603125133,0.0909090909090909,1.05,8,2.64,4,1.7,664.5,4.1,2286.0,16093.0,13.3,11.1,1011.0,779.3333333333334,2.1,30.0,402.0,8.3,8.3,1011.2,493.375,2.1,30.0,402.0,8.0,8.0,1011.3,545.6,2.1,732.0,1207.0,8.3,8.3,1011.8,999.0,2.1,7620.0,16093.0,32.8,18.9,1010.7,180.0,3.6,7620.0,16093.0,32.8,19.4,1011.5,140.0,3.1,22000.0,16093.0,31.7,20.6,1012.0,999.0,0.0,22000.0,16093.0,29.4,21.7,1012.7


In [0]:
train_data.count()

### 2.2 Preprocess and Split Data

We use a time-based split so that we have training data, validation data for comparing models, and held-out test data.

In [0]:
def preProcessData(dff, outcome = 'is_delayed'):
  """Clean a flights DataFrame and attach the binary outcome column.

  Steps, in order:
    1. Drop delayed flights whose largest delay cause is a security delay of
       more than 15 minutes.
    2. Cast *_visibility / *_ceiling columns to integer and add 273 to
       *_temperature / *_dewpoint columns (approximate Celsius -> Kelvin shift).
    3. Add 'time_blk', a coarser bucket derived from 'dep_blk'.
    4. Cast 'equipment_delayed' to integer.
    5. Add the outcome column: 1 when dep_del15 == 1, otherwise 0.

  Args:
    dff: input Spark DataFrame with the flight, delay and weather columns.
    outcome: name of the label column to create.

  Returns:
    A new DataFrame; the input is not modified. NOTE: the transformation is not
    idempotent (the +273 shift would be applied again), so call it on raw data only.
  """
  delay_causes = ['carrier_delay', 'nas_delay', 'weather_delay', 'security_delay', 'late_aircraft_delay']
  security_dominated = (
      (f.col('dep_del15') == 1)
      & (f.col('security_delay') > 15)
      & (f.greatest(*[f.col(c) for c in delay_causes]) == f.col('security_delay'))
  )
  # Filter instead of dff.subtract(...): subtract is EXCEPT DISTINCT, which would
  # also collapse legitimate duplicate rows and force a full shuffle.
  # The condition evaluates to NULL for flights without delay details (on-time
  # flights); coalesce to False so those rows are kept rather than dropped.
  dff = dff.filter(~f.coalesce(security_dominated, f.lit(False)))

  def _converted(name):
    """Return the (possibly converted) column expression for one input column."""
    if name.endswith('_visibility') or name.endswith('_ceiling'):
      return f.col(name).cast(IntegerType()).alias(name)
    if name.endswith('_temperature') or name.endswith('_dewpoint'):
      return (f.col(name) + 273).alias(name)
    return f.col(name)

  # One projection for all weather columns; chaining withColumn per column
  # builds a very deep query plan. Column order is preserved.
  dff = dff.select(*[_converted(name) for name in dff.columns])

  dff = dff.withColumn('time_blk', ((f.col('dep_blk').cast(IntegerType()) + 2) / 8).cast(IntegerType()))
  # cast the equipment_delayed flag to numeric int
  dff = dff.withColumn('equipment_delayed', f.col('equipment_delayed').cast(IntegerType()))

  # create the outcome column
  dff = dff.withColumn(outcome, f.when(f.col('dep_del15') == 1, 1).otherwise(0))
  return dff

def partitionData(dff, start_date, end_date):
    """Keep only the rows whose flight date lies in [start_date, end_date].

    The flight date is built from the year, month and day_of_month columns;
    both bounds are inclusive and are given as 'YYYY-MM-DD' strings.
    """
    date_window = f"make_date(year, month, day_of_month) between '{start_date}' and '{end_date}'"
    return dff.filter(date_window)

In [0]:
# Always start from a freshly loaded raw frame so this cell is idempotent:
# preProcessData() must not be applied to an already preprocessed train_data
# (re-running the cell would otherwise shift temperatures by +273 a second time).
train_raw = loadData(train_dir, 'train_data')

if mode == 'sample':
  # Single source dataset: preprocess once, then split by flight date.
  temp = preProcessData(train_raw, outcome)
  train_data = partitionData(temp, '2015-01-01', '2015-02-28')
  dev_data = partitionData(temp, '2015-03-01', '2015-03-15')
  test_data = partitionData(temp, '2015-03-16', '2015-03-31')
else:
  # Pre-split datasets: each split lives in its own parquet directory.
  # Only the training data is cached.
  train_data = preProcessData(train_raw, outcome).cache()
  dev_data = preProcessData(loadData(dev_dir, 'dev'), outcome)
  test_data = preProcessData(loadData(test_dir, 'test'), outcome)

In [0]:
train_class_counts = train_data.groupBy(outcome).count().collect()
print(train_class_counts)

In [0]:
display(train_data.limit(10))

origin,destination,year,month,day_of_month,day_of_week,crs_dep_time_utc,naive_crs_arr_time_utc,tail_num,op_carrier,distance,dep_blk,dep_del15,nas_delay,carrier_delay,weather_delay,security_delay,late_aircraft_delay,arr_delay,crs_arr_time_utc,arr_hour_utc,dep_hour_utc,hour,del_tail_num,equipment_delayed,dest_pagerank,src_pagerank,weight,avg_trips_per_day,avg_tp_per_day,avg_flights_per_day,avg_car_per_day,avg_car_flights_per_day,origin_2_wind_angle,origin_2_wind_speed,origin_2_ceiling,origin_2_visibility,origin_2_temperature,origin_2_dewpoint,origin_2_pressure,origin_3_wind_angle,origin_3_wind_speed,origin_3_ceiling,origin_3_visibility,origin_3_temperature,origin_3_dewpoint,origin_3_pressure,origin_4_wind_angle,origin_4_wind_speed,origin_4_ceiling,origin_4_visibility,origin_4_temperature,origin_4_dewpoint,origin_4_pressure,origin_5_wind_angle,origin_5_wind_speed,origin_5_ceiling,origin_5_visibility,origin_5_temperature,origin_5_dewpoint,origin_5_pressure,destination_2_wind_angle,destination_2_wind_speed,destination_2_ceiling,destination_2_visibility,destination_2_temperature,destination_2_dewpoint,destination_2_pressure,destination_3_wind_angle,destination_3_wind_speed,destination_3_ceiling,destination_3_visibility,destination_3_temperature,destination_3_dewpoint,destination_3_pressure,destination_4_wind_angle,destination_4_wind_speed,destination_4_ceiling,destination_4_visibility,destination_4_temperature,destination_4_dewpoint,destination_4_pressure,destination_5_wind_angle,destination_5_wind_speed,destination_5_ceiling,destination_5_visibility,destination_5_temperature,destination_5_dewpoint,destination_5_pressure,time_blk,is_delayed
ABQ,LAS,2018,9,28,5,2018-09-29T03:30:00.000+0000,2018-09-29T03:55:00.000+0000,N7869A,WN,486.0,21,0.0,,,,,,-15.0,2018-09-29T03:55:00.000+0000,2018-09-29T03:00:00.000+0000,2018-09-29T03:00:00.000+0000,,,0,5.478207812481511,1.2649364410677846,0.037037037037037,1.0,56,3.82,32,3.77,180.0,2.1,22000,16093,300.8,269.7,1011.0,180.0,2.1,22000,16093,302.4,268.0,1010.7,190.0,3.1,22000,16093,303.0,266.3,1009.9,999.0,2.1,22000,16093,302.4,265.2,1010.6,999.0,0.0,7620.0,16093.0,309.1,274.7,1005.0,50.0,2.6,7620.0,16093.0,310.2,273.6,1005.2,70.0,4.1,7620.0,16093.0,310.2,273.0,1006.0,100.0,4.6,7620.0,16093.0,310.2,271.9,1006.5,2,0
ALB,ORD,2017,2,20,1,2017-02-20T21:51:00.000+0000,2017-02-20T23:29:00.000+0000,N831UA,UA,723.0,16,0.0,,,,,,-16.0,2017-02-20T23:29:00.000+0000,2017-02-20T23:00:00.000+0000,2017-02-20T21:00:00.000+0000,,,0,10.191810455848527,0.98024290978552,0.05,1.0,27,3.39,3,2.19,20.0,4.6,22000,16093,278.6,265.8,1024.8,30.0,5.1,22000,16093,278.0,266.3,1024.4,310.0,5.1,22000,16093,278.0,266.3,1024.1,300.0,6.7,22000,16093,276.9,266.9,1023.9,140.0,7.7,7620.0,16093.0,292.4,281.9,1016.8,160.0,7.2,7620.0,16093.0,291.3,281.3,1017.7,170.0,7.7,6401.0,12875.0,289.7,281.3,1018.8,165.0,6.2,6401.0,8047.0,286.9,281.3,1019.8,2,0
ANC,OTZ,2015,9,9,3,2015-09-10T00:40:00.000+0000,2015-09-10T02:10:00.000+0000,N703AS,AS,548.0,16,0.0,,,,,,-7.0,2015-09-10T02:10:00.000+0000,2015-09-10T02:00:00.000+0000,2015-09-10T00:00:00.000+0000,,,0,0.3705984298676688,3.039467238765807,0.032258064516129,1.42,48,1.85,38,1.85,90.0,2.6,6096,16093,289.1,280.2,1001.0,60.0,2.6,4572,16093,287.4,281.9,1001.5,60.0,4.1,1829,16093,285.8,282.4,1002.0,40.0,2.1,1676,16093,285.2,283.0,1002.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,0
ASE,DEN,2017,11,4,6,2017-11-04T22:55:00.000+0000,2017-11-04T23:51:00.000+0000,N742SK,OO,125.0,16,1.0,0.0,0.0,0.0,0.0,37.0,37.0,2017-11-04T23:51:00.000+0000,2017-11-04T23:00:00.000+0000,2017-11-04T22:00:00.000+0000,,,0,9.487754969011524,0.625536468320466,0.1,1.37,14,5.49,14,5.49,170.0,8.8,22000,16093,281.9,269.7,1006.3,210.0,8.2,1676,16093,283.0,265.8,1007.2,150.0,6.7,1676,16093,282.4,271.3,1007.2,140.0,4.6,2134,16093,279.7,274.1,1009.5,230.0,10.3,22000.0,16093.0,294.7,269.1,1003.0,240.0,10.3,22000.0,16093.0,294.7,269.7,1004.2,240.0,9.8,22000.0,16093.0,293.6,271.9,1004.8,240.0,9.3,22000.0,16093.0,291.9,271.9,1006.0,2,1
ATL,BOS,2018,8,16,4,2018-08-16T11:45:00.000+0000,2018-08-16T14:17:00.000+0000,N333DX,DL,946.0,7,0.0,,,,,,-5.0,2018-08-16T14:17:00.000+0000,2018-08-16T14:00:00.000+0000,2018-08-16T11:00:00.000+0000,,,0,2.909164282524018,9.031150214803477,0.0058479532163742,1.93,1037,16.18,669,10.83,999.0,0.0,22000,16093,296.3,293.6,1018.9,220.0,1.5,22000,16093,296.9,293.0,1018.5,190.0,1.5,22000,16093,298.0,292.4,1018.5,140.0,2.1,7620,16093,298.0,293.0,1018.5,290.0,4.6,22000.0,16093.0,298.0,293.0,1013.3,280.0,4.6,22000.0,16093.0,298.6,293.0,1013.1,290.0,4.6,22000.0,16093.0,299.1,293.0,1012.5,290.0,5.1,22000.0,16093.0,299.7,293.0,1012.4,1,0
ATL,CHS,2016,7,26,2,2016-07-26T19:02:00.000+0000,2016-07-26T20:20:00.000+0000,N926DL,DL,259.0,15,0.0,,,,,,-8.0,2016-07-26T20:20:00.000+0000,2016-07-26T20:00:00.000+0000,2016-07-26T19:00:00.000+0000,,,0,1.3491508550951237,9.031150214803477,0.0058479532163742,2.62,1037,9.72,669,9.64,210.0,2.6,1524,16093,286.0,284.0,1010.6,629.5,3.6,1981,16093,286.0,284.0,1010.1,270.0,2.6,22000,16093,304.1,296.3,1017.6,999.0,0.0,22000,16093,302.4,296.3,1017.8,999.0,2.1,22000.0,16093.0,307.4,294.7,1017.2,290.0,3.1,22000.0,16093.0,306.9,295.8,1017.8,999.0,1.5,22000.0,16093.0,304.7,296.3,1017.9,270.0,3.1,22000.0,16093.0,303.6,295.8,1018.0,2,0
ATL,CMH,2016,3,18,5,2016-03-18T12:35:00.000+0000,2016-03-18T14:15:00.000+0000,N790SW,WN,447.0,8,0.0,,,,,,-19.0,2016-03-18T14:15:00.000+0000,2016-03-18T14:00:00.000+0000,2016-03-18T12:00:00.000+0000,,,0,1.6602815535904505,9.031150214803477,0.0058479532163742,1.14,1037,10.24,117,2.56,310.0,2.1,22000,16093,286.3,276.3,1013.2,280.0,3.1,22000,16093,286.9,276.9,1013.1,260.0,2.1,7620,16093,288.0,276.3,1013.1,270.0,2.6,6096,16093,289.1,275.8,1012.9,100.0,2.1,2896.0,16093.0,276.3,270.8,1014.6,140.0,1.5,3048.0,16093.0,277.4,270.2,1014.3,140.0,1.5,22000.0,16093.0,277.4,270.2,1014.1,999.0,0.0,22000.0,16093.0,279.7,269.1,1014.4,1,0
ATL,DAY,2016,1,18,1,2016-01-18T17:00:00.000+0000,2016-01-18T18:35:00.000+0000,N950DN,DL,432.0,12,0.0,,,,,,-16.0,2016-01-18T18:35:00.000+0000,2016-01-18T18:00:00.000+0000,2016-01-18T17:00:00.000+0000,,,0,1.0361003781025055,9.031150214803477,0.0058479532163742,2.31,1037,3.82,669,3.82,205.0,8.2,366,16093,257.0,255.0,1000.3,166.66666666666666,7.2,335,16093,258.0,256.0,1000.2,205.0,5.7,335,16093,258.0,256.0,1000.3,320.0,8.2,22000,16093,271.9,266.9,1021.6,250.0,7.2,22000.0,16093.0,258.0,253.0,1026.9,280.0,6.7,22000.0,16093.0,256.9,252.4,1026.2,270.0,7.2,22000.0,16093.0,256.3,251.9,1025.7,270.0,6.2,22000.0,16093.0,256.3,253.0,1025.0,1,0
ATL,DCA,2017,6,7,3,2017-06-07T13:45:00.000+0000,2017-06-07T15:28:00.000+0000,N303DN,DL,547.0,9,0.0,,,,,,-20.0,2017-06-07T15:28:00.000+0000,2017-06-07T15:00:00.000+0000,2017-06-07T13:00:00.000+0000,,,0,3.88774386261516,9.031150214803477,0.0058479532163742,2.45,1037,17.6,669,12.25,45.0,7.7,488,16093,293.6,291.9,1007.9,40.0,4.6,22000,16093,294.1,291.3,1007.3,20.0,2.6,5486,16093,294.1,291.9,1007.2,10.0,1.5,5486,16093,294.1,291.9,1006.9,80.0,4.6,1311.0,16093.0,290.0,288.0,1012.7,70.0,3.1,884.0,16093.0,290.2,288.0,1012.1,76.66666666666667,4.1,823.0,16093.0,290.2,288.6,1011.8,65.0,3.1,335.0,16093.0,290.2,288.6,1011.3,1,0
ATL,DEN,2018,3,13,2,2018-03-13T18:50:00.000+0000,2018-03-13T20:10:00.000+0000,N7845A,WN,1199.0,14,0.0,,,,,,-3.0,2018-03-13T20:10:00.000+0000,2018-03-13T20:00:00.000+0000,2018-03-13T18:00:00.000+0000,,,0,9.487754969011524,9.031150214803477,0.0058479532163742,1.13,1037,15.67,117,3.78,340.0,5.7,22000,16093,279.1,270.8,1021.1,330.0,5.7,22000,16093,277.4,270.2,1021.2,330.0,6.7,22000,16093,276.3,270.8,1021.1,330.0,6.7,22000,16093,275.2,269.7,1020.5,270.0,3.6,22000.0,16093.0,276.9,266.9,1025.2,240.0,3.6,22000.0,16093.0,274.1,266.3,1025.7,250.0,3.1,22000.0,16093.0,270.8,265.2,1025.8,270.0,3.6,22000.0,16093.0,270.2,264.7,1025.1,2,0


### 2.3 Get the Features and Target Vars from Data

In [0]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, Imputer, StandardScaler, FeatureHasher, SQLTransformer
from pyspark.ml import Pipeline


# a more parallel attempt

def createStages2(dff, outcome, run = 'basic'):
  """Build the ordered list of unfitted feature-pipeline stages.

  Args:
    dff: DataFrame whose column names are scanned for the avg_*, origin_* and
      destination_* numeric features (only consulted when run != 'basic').
    outcome: name of the outcome column that is indexed into 'label'.
    run: 'basic' uses the reduced feature set; any other value additionally
      uses 'equipment_delayed' and the avg_* / origin_* / destination_* columns.

  Returns:
    A list of pipeline stages, in this order: one StringIndexer per categorical
    column, the one-hot encoder, the label indexer, the imputer, a column
    projection, the numeric assembler and scaler, the two feature assemblers
    ('features' and 'hash_vector') and the final column projection.
  """
  stages = []

  # Get the categorical columns
  categoricalColumnsNoAirports = ['month', 'day_of_month', 'day_of_week', 'op_carrier', 'dep_blk']
  # add equipment_delayed for non-basic runs
  if run != 'basic':
    categoricalColumnsNoAirports += ['equipment_delayed']
  # origin and destination are string-indexed but NOT one-hot encoded
  filterColumns = ['origin', 'destination']
  categoricalColumns = filterColumns + categoricalColumnsNoAirports
  classVecCols = [c + "classVec" for c in categoricalColumnsNoAirports]

  # index the categorical values, one indexer stage per column
  stringIndexers = [StringIndexer(inputCol = c, outputCol = c + 'Index').setHandleInvalid("keep") for c in categoricalColumns]
  stages += stringIndexers

  # one-hot encode the categorical columns
  encoder = OneHotEncoder(inputCols=[c + 'Index' for c in categoricalColumnsNoAirports],
                          outputCols=classVecCols,
                          handleInvalid='keep')
  stages += [encoder]

  # add the outcome label.
  # alphabetAsc pins outcome 0 -> label 0.0 and outcome 1 -> label 1.0. The default
  # (frequencyDesc) gives 0.0 to whichever class is most frequent, so the labels
  # would silently flip on data where delayed flights are the majority.
  label_stringIdx = StringIndexer(inputCol = outcome, outputCol = 'label', stringOrderType = 'alphabetAsc').setHandleInvalid("keep")
  stages += [label_stringIdx]

  # Get numeric columns
  numericCols = ['distance', 'dest_pagerank', 'src_pagerank', 'weight']
  if run != 'basic':
    numericCols += [x for x in dff.columns if x.startswith('avg_')]
    numericCols += [x for x in dff.columns if (x.startswith('origin_') or x.startswith('destination_'))]

  # impute missing numeric values with the column mean
  imputedCols = [c + "_Imputed" for c in numericCols]
  imputer = Imputer(inputCols=numericCols, outputCols=imputedCols, strategy='mean')

  # from our runs, it looks like the parallel imputer might not have a deterministic output column order
  # make sure the resulting columns are in a defined order
  cols = ['label', 'originIndex', 'destinationIndex'] + classVecCols + imputedCols
  col_string = ','.join(cols)
  feature_projector = SQLTransformer(statement=f"SELECT {col_string} FROM __THIS__")

  # now convert the numeric columns into a vector and scale its dimensions independently
  assembler = VectorAssembler(inputCols=imputedCols, outputCol="numeric_vect", handleInvalid='keep')
  scaler = StandardScaler(inputCol="numeric_vect", outputCol="numeric_scaled")
  stages += [imputer, feature_projector, assembler, scaler]

  # put all the features together:
  #   'features'    = one-hot categoricals + scaled numerics (model input)
  #   'hash_vector' = airport indices + scaled numerics
  feature_assembler = VectorAssembler(inputCols=classVecCols + ["numeric_scaled"], outputCol="features")
  hash_assembler = VectorAssembler(inputCols=["originIndex", "destinationIndex", "numeric_scaled"], outputCol="hash_vector")
  # final projection of columns, review this carefully
  outputCols = ['label', 'originIndex', 'destinationIndex', 'hash_vector', 'numeric_scaled', 'features'] + classVecCols
  col_string = ','.join(outputCols)
  final_projector = SQLTransformer(statement=f"SELECT {col_string} FROM __THIS__")

  stages += [feature_assembler, hash_assembler, final_projector]

  return stages

In [0]:
from pyspark.ml import PipelineModel
pipelineModel = None
# savePipeline and loadPipelineFromFile are set once in the Variables cell;
# they are deliberately not re-assigned here so that configuration is honoured.

# Saved pipeline variants under save_path:
#   'pipeline-model'  - all features, 827+ feature dimensions
#   'pipeline-model2' - all features, 161 feature dimensions
modelName = 'pipeline-model2'
modelPath = f'{save_path}/{modelName}'

if not loadPipelineFromFile:
  # Create the pipeline and fit to the training data
  stages = createStages2(train_data, outcome, run)
  print(stages)
  pipeline = Pipeline(stages = stages)
  pipelineModel = pipeline.fit(train_data)
  if savePipeline:
    # overwrite(): a plain save() raises when modelPath already exists,
    # which would fail the cell on every re-run.
    pipelineModel.write().overwrite().save(modelPath)
else:
  pipelineModel = PipelineModel.load(modelPath)

In [0]:
#if savePipeline:
#    filename = f'{save_path}/pipeline-model2'
#    pipelineModel.write().overwrite().save(filename)

In [0]:
pipelineModel.transform(train_data).printSchema()

In [0]:
# Transform the train, dev and test data with the pipeline
def transformData (d, pl_model):
  """Run DataFrame `d` through the fitted pipeline model `pl_model` and return the result."""
  transformed = pl_model.transform(d)
  return transformed


# Transform the training, dev and test data
train = transformData(train_data, pipelineModel).cache()
dev = transformData(dev_data, pipelineModel)
test = transformData(test_data, pipelineModel)

train.printSchema()

In [0]:
#print(stages[12].getNumFeatures())
sample = train.limit(1)
display(sample)


label,originIndex,destinationIndex,hash_vector,numeric_scaled,features,monthclassVec,day_of_monthclassVec,day_of_weekclassVec,op_carrierclassVec,dep_blkclassVec,equipment_delayedclassVec
0.0,56.0,7.0,"Map(vectorType -> dense, length -> 67, values -> List(56.0, 7.0, 0.7948412686346508, 1.8270567454550266, 0.4219768683292819, 0.4146452116827185, 1.875029274731359, 0.20244914228956434, 0.554930229121567, 0.22531392182554547, 1.052833884279082, 0.6272329545746582, 0.07430598561073339, 2.2725908495301637, 1.378256250397911, 17.17900240842907, 12.670126129780195, 1.4489974947884703, 0.617535137099471, 0.07412968463414162, 2.266672012102878, 1.3385182506317215, 17.13110579065205, 12.385585810092378, 1.4511717707585658, 0.6436869285121023, 0.11079560050185237, 2.2608666789538874, 1.3148421098087018, 16.76981487167717, 12.069971921715855, 1.454908459755531, 3.350972263196412, 0.07628368321780006, 2.2556362496159967, 1.2703530128817542, 16.429844371920915, 11.844887014387908, 1.4546436935000662, 3.5046876129533198, 0.0, 0.7877945434677842, 1.4635824468181546, 17.21022235010804, 12.653688794195219, 1.4491312248179193, 0.17287216336891842, 0.09277693648101934, 0.7854710573347506, 1.3673234544532553, 16.60757102001861, 12.213135271825756, 1.4529229304744435, 0.2387731889171825, 0.14592678416002303, 0.7831040906448445, 1.2991967652480685, 16.32088634918405, 11.957548545036495, 1.453771641607374, 0.337659554198868, 0.16288688724750766, 0.7816954879665293, 1.2717020213505028, 16.108531354922356, 11.76645744057589, 1.4564286480862552))","Map(vectorType -> dense, length -> 65, values -> List(0.7948412686346508, 1.8270567454550266, 0.4219768683292819, 0.4146452116827185, 1.875029274731359, 0.20244914228956434, 0.554930229121567, 0.22531392182554547, 1.052833884279082, 0.6272329545746582, 0.07430598561073339, 2.2725908495301637, 1.378256250397911, 17.17900240842907, 12.670126129780195, 1.4489974947884703, 0.617535137099471, 0.07412968463414162, 2.266672012102878, 1.3385182506317215, 17.13110579065205, 12.385585810092378, 1.4511717707585658, 0.6436869285121023, 0.11079560050185237, 2.2608666789538874, 1.3148421098087018, 16.76981487167717, 12.069971921715855, 
1.454908459755531, 3.350972263196412, 0.07628368321780006, 2.2556362496159967, 1.2703530128817542, 16.429844371920915, 11.844887014387908, 1.4546436935000662, 3.5046876129533198, 0.0, 0.7877945434677842, 1.4635824468181546, 17.21022235010804, 12.653688794195219, 1.4491312248179193, 0.17287216336891842, 0.09277693648101934, 0.7854710573347506, 1.3673234544532553, 16.60757102001861, 12.213135271825756, 1.4529229304744435, 0.2387731889171825, 0.14592678416002303, 0.7831040906448445, 1.2991967652480685, 16.32088634918405, 11.957548545036495, 1.453771641607374, 0.337659554198868, 0.16288688724750766, 0.7816954879665293, 1.2717020213505028, 16.108531354922356, 11.76645744057589, 1.4564286480862552))","Map(vectorType -> sparse, length -> 161, indices -> List(5, 21, 45, 53, 88, 93, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7948412686346508, 1.8270567454550266, 0.4219768683292819, 0.4146452116827185, 1.875029274731359, 0.20244914228956434, 0.554930229121567, 0.22531392182554547, 1.052833884279082, 0.6272329545746582, 0.07430598561073339, 2.2725908495301637, 1.378256250397911, 17.17900240842907, 12.670126129780195, 1.4489974947884703, 0.617535137099471, 0.07412968463414162, 2.266672012102878, 1.3385182506317215, 17.13110579065205, 12.385585810092378, 1.4511717707585658, 0.6436869285121023, 0.11079560050185237, 2.2608666789538874, 1.3148421098087018, 16.76981487167717, 12.069971921715855, 1.454908459755531, 3.350972263196412, 0.07628368321780006, 2.2556362496159967, 1.2703530128817542, 16.429844371920915, 11.844887014387908, 1.4546436935000662, 3.5046876129533198, 0.7877945434677842, 1.4635824468181546, 17.21022235010804, 12.653688794195219, 
1.4491312248179193, 0.17287216336891842, 0.09277693648101934, 0.7854710573347506, 1.3673234544532553, 16.60757102001861, 12.213135271825756, 1.4529229304744435, 0.2387731889171825, 0.14592678416002303, 0.7831040906448445, 1.2991967652480685, 16.32088634918405, 11.957548545036495, 1.453771641607374, 0.337659554198868, 0.16288688724750766, 0.7816954879665293, 1.2717020213505028, 16.108531354922356, 11.76645744057589, 1.4564286480862552))","Map(vectorType -> sparse, length -> 13, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 32, indices -> List(8), values -> List(1.0))","Map(vectorType -> sparse, length -> 8, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 20, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 20, indices -> List(15), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))"


In [0]:
import datetime
import calendar

def file_suffix(now=None):
  """Build a "<weekday name>_<hour>-<minute>" tag for naming saved datasets.

  Args:
    now: optional datetime to format. Defaults to the current time in UTC.
      A caller-supplied value is used as given (no timezone conversion).

  Returns:
    A string such as "Monday_0-24". Hour and minute are not zero-padded, and
    the tag carries no date, so it repeats every week.
  """
  if now is None:
    now = datetime.datetime.now(tz=datetime.timezone.utc)
  weekday_name = calendar.day_name[now.weekday()]
  # use the attributes directly instead of locals named `hour`/`min`:
  # `min` would shadow the builtin
  return f"{weekday_name}_{now.hour}-{now.minute}"


In [0]:
# Persist the transformed splits; all three share one timestamp-like suffix so
# they can be loaded back as a matching set.
save_transformed = True
if save_transformed:
  suffix = file_suffix()
  # written in the same order as before: train, dev, test
  for split_name, split_df in (("train", train), ("dev", dev), ("test", test)):
    target = f"{project_path}/{split_name}_transformed_{suffix}.parquet"
    split_df.write.mode("overwrite").format("parquet").save(target)


In [0]:
project_path

In [0]:
# Optionally replace `train` with a previously saved, already transformed snapshot.
# NOTE(review): the "Monday_0-24" suffix is hard-coded; it has to match a suffix
# that file_suffix() produced during an earlier save - confirm the snapshot exists.
# NOTE(review): the DataFrame read here is not cached, unlike the `train` built by
# the transform cell above, and only `train` is reloaded (dev/test are not).
load_transformed = True
if load_transformed:
  train = spark.read.option("header", "true").parquet(f"{project_path}//train_transformed_Monday_0-24.parquet/*.parquet")

### 2.4 Address class imbalance with SMOTE

In [0]:
import random
from functools import reduce
from pyspark.sql import Row
from pyspark.sql.functions import rand,col,when,concat,substring,lit,udf,lower,sum as ps_sum,count as ps_count,row_number
from pyspark.sql.window import *
from pyspark.sql import DataFrame
from pyspark.ml.feature import VectorAssembler,BucketedRandomProjectionLSH,VectorSlicer
from pyspark.ml.linalg import Vectors,VectorUDT
from pyspark.sql.functions import array, create_map, struct

In [0]:
# SMOTE processor, adapted from: https://medium.com/@haoyunlai/smote-implementation-in-pyspark-76ec4ffa2f1d

############################## spark smote oversampling ##########################
# For categorical columns, SMOTE must use their StringIndexed form (so run SMOTE after string indexing, which orders by frequency by default).



def smote(vectorized_sdf, multiplier, bucket_length, k=5, seed=7, max_distance=3.0):
    """Oversample the minority class (label == 1) with a Spark SMOTE variant.

    Steps:
      1. Keep only rows with ``label == 1`` and drop their ``features`` column.
      2. Fit a BucketedRandomProjectionLSH model on ``hash_vector`` and self-join
         the minority rows with ``approxSimilarityJoin``, keeping pairs within
         ``max_distance`` that share the same ``originIndex`` and
         ``destinationIndex``.
      3. For every anchor row (``datasetA``) keep its ``k`` closest neighbours.
      4. For each of ``multiplier`` batches, pick one random neighbour per
         anchor and create a synthetic row: the numeric vector is interpolated
         between anchor and neighbour, the ``<name>classVec`` columns are copied
         from either the anchor or the neighbour, and the anchor's label is kept.
      5. Union the synthetic rows with the original data (restricted to the
         synthetic rows' columns) and reassemble ``features``.

    Relies on the notebook-level variable ``run`` to decide whether
    ``equipment_delayed`` is part of the categorical columns.

    Args:
        vectorized_sdf: Spark DataFrame containing at least ``label``,
            ``features``, ``hash_vector``, ``numeric_Scaled``, the grouping
            index columns and the ``<name>classVec`` columns used below.
        multiplier: number of synthetic batches to generate (must be >= 1);
            each batch yields at most one synthetic row per anchor row.
        bucket_length: ``bucketLength`` passed to the LSH model.
        k: number of nearest neighbours retained per anchor row.
        seed: seed for the LSH model only; the neighbour sampling and the
            interpolation factor are not seeded.
        max_distance: distance threshold passed to ``approxSimilarityJoin``
            (defaults to the previously hard-coded 3.0).

    Returns:
        Spark DataFrame with the synthetic rows followed by the original rows,
        plus a reassembled ``features`` column.

    Raises:
        ValueError: if ``multiplier`` is smaller than 1.
    """
    if multiplier < 1:
        # Without at least one batch there is nothing to union below.
        raise ValueError(f"multiplier must be >= 1, got {multiplier}")

    all_categoricals = ['month', 'day_of_month', 'day_of_week', 'op_carrier', 'dep_blk'] + (['equipment_delayed'] if run != 'basic' else [])
    group_categoricals = ['originIndex', 'destinationIndex']
    categorical_vecs = [f"{c}classVec" for c in all_categoricals]
    # One spelling for the numeric vector column everywhere: the synthetic
    # column alias and the VectorAssembler input must match exactly.
    numeric_col = "numeric_Scaled"

    # SMOTE only synthesises from existing minority instances. The assembled
    # feature vector is dropped here and rebuilt from its parts at the end.
    dataInput_min = vectorized_sdf[vectorized_sdf['label'] == 1].drop("features")

    # LSH, bucketed random projection
    brp = BucketedRandomProjectionLSH(inputCol="hash_vector", outputCol="hashes", seed=seed, bucketLength=bucket_length)
    model = brp.fit(dataInput_min)

    # Distance is computed on brp's inputCol, and only pairs that agree on
    # every group_categoricals column are kept.
    same_group = " and ".join([f"(datasetA.{c} == datasetB.{c})" for c in group_categoricals])
    pairs_raw = model.approxSimilarityJoin(dataInput_min, dataInput_min, max_distance, distCol="distance") \
        .filter(same_group)

    # remove self-comparison (distance 0)
    pairs = pairs_raw.filter(pairs_raw.distance > 0)

    # One window partition per anchor row; the ordered variant ranks its neighbours.
    rows = Window.partitionBy("datasetA")
    rows_ordered = rows.orderBy("distance")

    knn = pairs \
        .withColumn("r_num", f.row_number().over(rows_ordered)) \
        .filter(f.col("r_num") <= k)

    # SMOTE interpolation: anchor + u * (neighbour - anchor) with u in [0, 1],
    # i.e. a point on the segment between the anchor (arr[0]) and its
    # neighbour (arr[1]). Marked non-deterministic so Spark does not assume
    # that repeated evaluations of the UDF return the same value.
    random_interpolate = f.udf(
        lambda arr: arr[0] + random.uniform(0, 1) * (arr[1] - arr[0]),
        VectorUDT(),
    ).asNondeterministic()

    # list to store batches of synthetic data
    result = []

    for i in range(multiplier):
        print(f"generating batch {i} of synthetic instances")
        # Randomly select one neighbour per anchor: the pair with the largest
        # random draw inside the anchor's window wins.
        random_sample = knn \
            .withColumn("rand", f.rand()) \
            .withColumn('max_rand', f.max('rand').over(rows)) \
            .where(f.col('rand') == f.col('max_rand')) \
            .drop('max_rand', 'rand', 'r_num')

        # synthetic numeric part, built only from the scaled numeric vector
        synth = random_sample.select(
            '*',
            random_interpolate(f.array(f'datasetA.{numeric_col}', f'datasetB.{numeric_col}')).alias(numeric_col),
        )

        # Categorical vectors are copied from the anchor or from the neighbour;
        # the side is chosen once per batch on the driver, not per row.
        choice = random.choice(['datasetA', 'datasetB'])
        for vec_col in categorical_vecs:
            synth = synth.withColumn(vec_col, f.col(f"{choice}.{vec_col}"))

        # add label
        synth = synth.withColumn("label", f.col("datasetA.label"))

        # NOTE(review): only the two structs are dropped, so the LSH `distance`
        # column stays in the synthetic rows and is therefore also selected from
        # `vectorized_sdf` in the union below — confirm that is intended.
        synth = synth.drop('datasetA', 'datasetB')

        result.append(synth)

    # bring all the new synthetic samples together
    dfunion = reduce(DataFrame.union, result)

    # union synthetic instances with original full (both minority and majority) df
    result_df = dfunion.union(vectorized_sdf.select(dfunion.columns))

    # merge the separate feature components back together
    assembler = VectorAssembler(inputCols=categorical_vecs + [numeric_col], outputCol="features")
    return assembler.transform(result_df)
    


In [0]:
# Fallback if SMOTE doesn't work: plain duplication of the minority-class rows.

def oversample_minority_class(vectorized_sdf, multiplier):
    """Naively oversample the minority class by duplicating its rows.

    Args:
        vectorized_sdf: Spark DataFrame with a ``label`` column; rows with
            ``label == 1`` are treated as the minority class.
        multiplier: number of extra copies of the minority rows to add.

    Returns:
        Spark DataFrame made of ``multiplier`` copies of the minority rows
        followed by all original rows.
    """
    minority = vectorized_sdf[vectorized_sdf['label'] == 1]

    # Stack the copies first, then append the untouched original data.
    replicated = reduce(DataFrame.unionAll, [minority] * multiplier)
    return replicated.union(vectorized_sdf.select(replicated.columns))
  

In [0]:
# Per-class row counts of the training set, used below to size the oversampling.
# NOTE(review): assumes train_class_counts[0] is the delayed class and
# train_class_counts[1] the not-delayed class — confirm against the cell that
# builds train_class_counts.
train_delayed, train_not_delayed = (train_class_counts[idx]['count'] for idx in range(2))
print('delayed:', train_delayed)
print('not delayed:', train_not_delayed)


In [0]:
# since we are grouping nearest neighbors restricted to same origin, destination and month, how many samples are in those buckets? 
# display(train.filter("label==1").groupBy(['origin','destination','month']).count())

In [0]:

# Build `train_full`, the rebalanced training set, using either SMOTE or plain
# duplication of the minority class depending on the `SMOTE` flag
# (the flag is set outside this section of the notebook).
if SMOTE:
  # the actual SMOTE

  # NOTE(review): the multiplier computed from the class counts is immediately
  # overwritten by the hard-coded 6 on the next line, so it has no effect —
  # confirm which value is intended.
  multiplier = int((train_not_delayed - train_delayed) / train_delayed)
  multiplier = 6
  bucket_length = 2

  train_full = smote(train, multiplier, bucket_length)
  print('SMOTE, multiplier:',multiplier, "bucket length:", bucket_length)

else:
  # primitive oversampling of the minority class
  # (adds `multiplier` extra copies of the label == 1 rows)
  multiplier = 2
  train_full = oversample_minority_class(train, multiplier)
  print('Oversampled, multiplier:',multiplier)

    
# an action to make it all happen
# (displays the row count per label of the rebalanced training set)
display(train_full.groupBy('label').count())


label,count
1.0,9171351
0.0,17352650


In [0]:
# Persist the rebalanced training set; the file name records which strategy
# produced it and reuses the `suffix` generated when the splits were saved.
_train_full_path = (
  f"{project_path}/train_smote_{suffix}.parquet"
  if SMOTE
  else f"{project_path}/train_oversampled_{suffix}.parquet"
)
train_full.write.mode("overwrite").format("parquet").save(_train_full_path)

In [0]:
# Completion marker: printed once the preceding cells have finished running.
print("done")

In [0]:
# List the project directory to check which datasets are now stored there.
# Note: this repeats the `project_path` assignment from the "Project Directory"
# cell at the top of the notebook with the same value.
project_path = "dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/"
display(dbutils.fs.ls(project_path))

path,name,size
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/Chitra_feature_data/,Chitra_feature_data/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airline_singleday.parquet/,airline_singleday.parquet/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airlines_3m_features_ext.parquet/,airlines_3m_features_ext.parquet/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airlines_3m_full_features.parquet/,airlines_3m_full_features.parquet/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airlines_weather_data/,airlines_weather_data/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport-timezones.csv,airport-timezones.csv,439779
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport_edges/,airport_edges/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport_edges_1_year/,airport_edges_1_year/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport_edges_3_month/,airport_edges_3_month/,0
dbfs:/user/chitra.agastya@ischool.berkeley.edu/FinalProject/airport_edges_4_year/,airport_edges_4_year/,0
